Block-Structured AMR Software Framework
 
AMReX_GpuLaunchMacrosG.nolint.H
// Do not include this header anywhere other than AMReX_GpuLaunchMacrosG.H.
// The purpose of this file is to avoid clang-tidy.

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                         sycl::range<1>(amrex_i_nthreads_per_block)), \
                    [=] (sycl::nd_item<1> amrex_i_item) \
                    [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                    [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                    { \
                        for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block \
                        } \
                    }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

#if 0
        for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
            block \
        } \
    }}}
#endif

#else
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \
            [=] AMREX_GPU_DEVICE () noexcept { \
                for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
                    block \
                } \
            }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
            block \
        } \
    }}}
#endif
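
// Usage sketch (illustrative; `bx` is assumed to be an amrex::Box and `a` an
// amrex::Array4<amrex::Real> captured by value). In a GPU launch region,
// `tbx` names the sub-range of `bx` assigned to the current thread; otherwise
// the host fallback visits the whole range.
//
//     AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(bx, tbx,
//     {
//         const auto lo = amrex::lbound(tbx);
//         const auto hi = amrex::ubound(tbx);
//         for (int k = lo.z; k <= hi.z; ++k) {
//         for (int j = lo.y; j <= hi.y; ++j) {
//         for (int i = lo.x; i <= hi.x; ++i) {
//             a(i,j,k) = 0.0;
//         }}}
//     });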

// two fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,2), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                    [=] (sycl::nd_item<2> amrex_i_item) \
                    [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                    { \
                        switch (amrex_i_item.get_group(1)) { \
                        case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block1 \
                        } \
                        break; \
                        case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block2 \
                        } \
                        } \
                    }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

#if 0
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
    }}}
#endif

#else
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
            [=] AMREX_GPU_DEVICE () noexcept { \
                switch (blockIdx.y) { \
                case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                    block1 \
                } \
                break; \
                case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                    block2 \
                } \
                } \
            }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
    }}}
#endif
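
// Usage sketch (illustrative): fuse two independent range launches into one
// kernel, one y-block row per range. Assumes `bxa`/`bxb` are amrex::Box and
// `a`/`b` are amrex::Array4<amrex::Real>; amrex::Loop runs the inner lambda
// over each cell of the thread's sub-box. The _3 variant below takes a third
// (TN,TI,block) triple in the same way.
//
//     AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(
//         bxa, tbxa, { amrex::Loop(tbxa, [=] (int i, int j, int k) { a(i,j,k) = 0.0; }); },
//         bxb, tbxb, { amrex::Loop(tbxb, [=] (int i, int j, int k) { b(i,j,k) = 1.0; }); });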

// three fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,3), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                    [=] (sycl::nd_item<2> amrex_i_item) \
                    [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                    { \
                        switch (amrex_i_item.get_group(1)) { \
                        case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block1 \
                        } \
                        break; \
                        case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block2 \
                        } \
                        break; \
                        case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block3 \
                        } \
                        } \
                    }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

#if 0
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
        for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
            block3 \
        } \
    }}}
#endif

#else
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
            [=] AMREX_GPU_DEVICE () noexcept { \
                switch (blockIdx.y) { \
                case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                    block1 \
                } \
                break; \
                case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                    block2 \
                } \
                break; \
                case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
                    block3 \
                } \
                } \
            }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
        for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
            block3 \
        } \
    }}}
#endif

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                         sycl::range<1>(amrex_i_nthreads_per_block)), \
                    [=] (sycl::nd_item<1> amrex_i_item) \
                    [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                    [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                    { \
                        for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block \
                        } \
                    }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE: cannot call device function from host"); \
    }}}
#else
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \
            [=] AMREX_GPU_DEVICE () noexcept { \
                for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
                    block \
                } \
            }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE: cannot call device function from host"); \
    }}}
#endif
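
// Usage sketch (illustrative): same pattern as the HOST_DEVICE macro above,
// but the block must be device code; outside a GPU launch region the macro
// aborts instead of falling back to a host loop. The _2/_3 variants below
// fuse two or three such ranges into one launch.
//
//     AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(bx, tbx,
//     {
//         amrex::Loop(tbx, [=] (int i, int j, int k) { a(i,j,k) *= 2.0; });
//     });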

// two fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,2), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                    [=] (sycl::nd_item<2> amrex_i_item) \
                    [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                    { \
                        switch (amrex_i_item.get_group(1)) { \
                        case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block1 \
                        } \
                        break; \
                        case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block2 \
                        } \
                        } \
                    }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2: cannot call device function from host"); \
    }}}
#else
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
            [=] AMREX_GPU_DEVICE () noexcept { \
                switch (blockIdx.y) { \
                case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                    block1 \
                } \
                break; \
                case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                    block2 \
                } \
                } \
            }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2: cannot call device function from host"); \
    }}}
#endif

// three fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,3), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                    [=] (sycl::nd_item<2> amrex_i_item) \
                    [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                    { \
                        switch (amrex_i_item.get_group(1)) { \
                        case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block1 \
                        } \
                        break; \
                        case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block2 \
                        } \
                        break; \
                        case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                            block3 \
                        } \
                        } \
                    }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3: cannot call device function from host"); \
    }}}
#else
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
            [=] AMREX_GPU_DEVICE () noexcept { \
                switch (blockIdx.y) { \
                case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                    block1 \
                } \
                break; \
                case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                    block2 \
                } \
                break; \
                case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
                    block3 \
                } \
                } \
            }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3: cannot call device function from host"); \
    }}}
#endif

// FOR_1D

#ifdef AMREX_USE_SYCL
#define AMREX_HOST_DEVICE_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_HOST_DEVICE_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
        AMREX_PRAGMA_SIMD \
        for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \
    } \
}
#endif
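
// Usage sketch (illustrative): assumes `np` is an integer element count and
// `p` a device-accessible pointer to amrex::Real; `i` is the caller-chosen
// index name visible inside the block.
//
//     AMREX_HOST_DEVICE_FOR_1D(np, i,
//     {
//         p[i] = 0.0;
//     });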

#ifdef AMREX_USE_SYCL
#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
        AMREX_PRAGMA_SIMD \
        for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \
    } \
}
#endif

#define AMREX_FOR_1D(n,i,block) \
{ \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
}

#define AMREX_PARALLEL_FOR_1D(n,i,block) \
{ \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
}

// FOR_3D

#ifdef AMREX_USE_SYCL
#define AMREX_HOST_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_HOST_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
    } \
}
#endif

#ifdef AMREX_USE_SYCL
#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
    } \
}
#endif

#define AMREX_FOR_3D(box,i,j,k,block) \
{ \
    amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
}

#define AMREX_PARALLEL_FOR_3D(box,i,j,k,block) \
{ \
    amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
}
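
// Usage sketch (illustrative): assumes `bx` is an amrex::Box and `fab` an
// amrex::Array4<amrex::Real> captured by value; the macro declares the cell
// indices i,j,k.
//
//     AMREX_HOST_DEVICE_PARALLEL_FOR_3D(bx, i, j, k,
//     {
//         fab(i,j,k) += 1.0;
//     });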

// FOR_4D

#ifdef AMREX_USE_SYCL
#define AMREX_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
    } \
}
#endif

#ifdef AMREX_USE_SYCL
#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
    } \
}
#endif

#define AMREX_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
}

#define AMREX_PARALLEL_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
}
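
// Usage sketch (illustrative): assumes `bx` is an amrex::Box, `ncomp` a
// component count, and `fab` an amrex::Array4<amrex::Real>; `n` is the
// component index declared by the macro.
//
//     AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n,
//     {
//         fab(i,j,k,n) = 0.0;
//     });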