Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuLaunchMacrosG.nolint.H
Go to the documentation of this file.
// Do not include this header anywhere other than AMReX_GpuLaunchMacrosG.H.
// The purpose of this file is to avoid clang-tidy.

#ifdef AMREX_USE_SYCL
// Run `block` once per element of range TN on the device via SYCL, binding
// the loop variable to TI.  A grid-stride-style Gpu::Range (seeded with the
// item's global id and global range) covers TN with the launched threads.
// Outside a launch region this variant aborts: the host fallback is disabled
// for SYCL builds because compiling combined host/device versions takes too
// long with the Intel toolchain.
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                         sycl::range<1>(amrex_i_nthreads_per_block)), \
                [=] (sycl::nd_item<1> amrex_i_item) \
                [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

// Dead remnant of a former host fallback; kept for reference.
#if 0
    for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
        block \
    } \
    }}}
#endif

#else
// CUDA/HIP (and CPU) version: inside a launch region, launch a device kernel
// over TN; otherwise fall back to a plain host loop over the same range.
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
                block \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
            block \
        } \
    }}}
#endif
63
// two fused launches
#ifdef AMREX_USE_SYCL
// Fuse two independent launches into one SYCL kernel: block1 runs over TN1
// and block2 over TN2.  The second dimension of the nd_range (extent 2)
// selects which body a work-group executes; the x-extent is sized by the
// larger of the two block counts.  Host fallback aborts (disabled for SYCL
// builds because host+device compilation takes too long on Intel).
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,2), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

// Dead remnant of a former host fallback; kept for reference.
#if 0
    for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
        block1 \
    } \
    for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
        block2 \
    } \
    }}}
#endif

#else
// CUDA/HIP version of the two-way fused launch: blockIdx.y picks which body
// a block executes.  On the host (outside a launch region) the two loops run
// sequentially.
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
    }}}
#endif
149
// three fused launches
#ifdef AMREX_USE_SYCL
// Fuse three independent launches into one SYCL kernel: block1/block2/block3
// run over TN1/TN2/TN3 respectively.  The second nd_range dimension (extent
// 3) selects the body; the x-extent is sized by the largest block count.
// Host fallback aborts (disabled for SYCL builds because host+device
// compilation takes too long on Intel).
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,3), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    break; \
                    case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block3 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

// Dead remnant of a former host fallback; kept for reference.
#if 0
    for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
        block1 \
    } \
    for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
        block2 \
    } \
    for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
        block3 \
    } \
    }}}
#endif

#else
// CUDA/HIP version of the three-way fused launch: blockIdx.y picks which
// body a block executes.  On the host (outside a launch region) the three
// loops run sequentially.
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            break; \
            case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
                block3 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
        for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
            block3 \
        } \
    }}}
#endif
253
#ifdef AMREX_USE_SYCL
// Device-only launch of `block` over range TN via SYCL (loop variable TI).
// Unlike the HOST_DEVICE variant there is never a host path: calling this
// outside a launch region is an error and aborts.
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                         sycl::range<1>(amrex_i_nthreads_per_block)), \
                [=] (sycl::nd_item<1> amrex_i_item) \
                [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE: cannot call device function from host"); \
    }}}
#else
// CUDA/HIP device-only launch of `block` over range TN.  Aborts if invoked
// outside a launch region (no host fallback by design).
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
                block \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE: cannot call device function from host"); \
    }}}
#endif
303
// two fused launches
#ifdef AMREX_USE_SYCL
// Device-only two-way fused launch (SYCL): block1 runs over TN1, block2 over
// TN2; the second nd_range dimension (extent 2) selects the body.  Aborts if
// invoked outside a launch region.
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,2), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2: cannot call device function from host"); \
    }}}
#else
// CUDA/HIP device-only two-way fused launch: blockIdx.y selects the body.
// Aborts if invoked outside a launch region.
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2: cannot call device function from host"); \
    }}}
#endif
373
// three fused launches
#ifdef AMREX_USE_SYCL
// Device-only three-way fused launch (SYCL): block1/block2/block3 run over
// TN1/TN2/TN3; the second nd_range dimension (extent 3) selects the body.
// Aborts if invoked outside a launch region.
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,3), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    break; \
                    case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block3 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3: cannot call device function from host"); \
    }}}
#else
// CUDA/HIP device-only three-way fused launch: blockIdx.y selects the body.
// Aborts if invoked outside a launch region.
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            break; \
            case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
                block3 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3: cannot call device function from host"); \
    }}}
#endif
455
// FOR_1D

#ifdef AMREX_USE_SYCL
// 1D loop `block` over indices i in [0, n).  Under SYCL the host path is
// disabled (aborts): compiling host+device versions takes too long on Intel.
#define AMREX_HOST_DEVICE_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
// 1D loop `block` over indices i in [0, n): device ParallelFor inside a
// launch region, otherwise a SIMD-annotated host loop.
#define AMREX_HOST_DEVICE_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
        AMREX_PRAGMA_SIMD \
        for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \
    } \
}
#endif
483
#ifdef AMREX_USE_SYCL
// Same as AMREX_HOST_DEVICE_FOR_1D for the SYCL build: device ParallelFor
// only; the host path aborts.
#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
// 1D loop `block` over i in [0, n): device ParallelFor inside a launch
// region, otherwise a SIMD-annotated host loop.
#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
        AMREX_PRAGMA_SIMD \
        for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \
    } \
}
#endif
509
// Unconditional device 1D loop: forwards `block` over i in [0, n) to
// amrex::ParallelFor with the index type deduced from n.
#define AMREX_FOR_1D(n,i,block) \
{ \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
}
515
// Identical to AMREX_FOR_1D; forwards `block` over i in [0, n) to
// amrex::ParallelFor with the index type deduced from n.
#define AMREX_PARALLEL_FOR_1D(n,i,block) \
{ \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
}
521
// FOR_3D

#ifdef AMREX_USE_SYCL
// 3D loop `block` with indices (i,j,k) over `box`.  Under SYCL the host path
// is disabled (aborts): host+device compilation takes too long on Intel.
#define AMREX_HOST_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
// 3D loop `block` with indices (i,j,k) over `box`: device ParallelFor inside
// a launch region, otherwise LoopConcurrentOnCpu on the host.
#define AMREX_HOST_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
    } \
}
#endif
545
#ifdef AMREX_USE_SYCL
// Same as AMREX_HOST_DEVICE_FOR_3D for the SYCL build: device ParallelFor
// only; the host path aborts.
#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
// 3D loop `block` with indices (i,j,k) over `box`: device ParallelFor inside
// a launch region, otherwise LoopConcurrentOnCpu on the host.
#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
    } \
}
#endif
567
// Unconditional device 3D loop: forwards `block` with indices (i,j,k) over
// `box` to amrex::ParallelFor.
#define AMREX_FOR_3D(box,i,j,k,block) \
{ \
    amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
}
572
// Identical to AMREX_FOR_3D; forwards `block` with indices (i,j,k) over
// `box` to amrex::ParallelFor.
#define AMREX_PARALLEL_FOR_3D(box,i,j,k,block) \
{ \
    amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
}
577
// FOR_4D

#ifdef AMREX_USE_SYCL
// 4D loop `block` with indices (i,j,k) over `box` and n in [0, ncomp).
// Under SYCL the host path is disabled (aborts): host+device compilation
// takes too long on Intel.
#define AMREX_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
// 4D loop `block` with indices (i,j,k) over `box` and n in [0, ncomp):
// device ParallelFor inside a launch region, otherwise LoopConcurrentOnCpu.
#define AMREX_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
    } \
}
#endif
603
#ifdef AMREX_USE_SYCL
// Same as AMREX_HOST_DEVICE_FOR_4D for the SYCL build: device ParallelFor
// only; the host path aborts.
#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
// 4D loop `block` with indices (i,j,k) over `box` and n in [0, ncomp):
// device ParallelFor inside a launch region, otherwise LoopConcurrentOnCpu.
#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
    } \
}
#endif
627
// Unconditional device 4D loop: forwards `block` with indices (i,j,k,n) over
// `box` x [0, ncomp) to amrex::ParallelFor.
#define AMREX_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
}
632
// Identical to AMREX_FOR_4D; forwards `block` with indices (i,j,k,n) over
// `box` x [0, ncomp) to amrex::ParallelFor.
#define AMREX_PARALLEL_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
}
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:127
Definition AMReX_Amr.cpp:49