Block-Structured AMR Software Framework
AMReX_GpuLaunchMacrosG.nolint.H
// Do not include this header anywhere other than AMReX_GpuLaunchMacrosG.H.
// This file exists only to keep these macro definitions out of clang-tidy's view.

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                         sycl::range<1>(amrex_i_nthreads_per_block)), \
                [=] (sycl::nd_item<1> amrex_i_item) \
                [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

#if 0
    for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
        block \
    } \
    }}}
#endif

#else
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
                block \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
            block \
        } \
    }}}
#endif
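
// Example usage (illustrative sketch, not part of this header): `bx` is a
// hypothetical amrex::Box and `a` a hypothetical amrex::Array4<amrex::Real>.
// The second argument (`tbx` here) names the sub-range handed to each thread;
// the block runs on the device inside a launch region and on the host otherwise.
//
//   AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(bx, tbx,
//   {
//       const auto lo = amrex::lbound(tbx);
//       const auto hi = amrex::ubound(tbx);
//       for (int k = lo.z; k <= hi.z; ++k) {
//       for (int j = lo.y; j <= hi.y; ++j) {
//       for (int i = lo.x; i <= hi.x; ++i) {
//           a(i,j,k) = 0.0;
//       }}}
//   });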

// two fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,2), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

#if 0
    for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
        block1 \
    } \
    for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
        block2 \
    } \
    }}}
#endif

#else
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
    }}}
#endif
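
// Example usage (illustrative sketch; `bx1`, `bx2`, `tbx1`, `tbx2` are
// hypothetical). The two ranges share a single kernel launch; blockIdx.y (or
// the second nd_range dimension under SYCL) selects which block a given GPU
// block executes. The _3 variants below follow the same pattern with a third
// (TN3,TI3,block3) triple.
//
//   AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(bx1, tbx1,
//   {
//       // ... work over the sub-range tbx1 ...
//   },
//   bx2, tbx2,
//   {
//       // ... work over the sub-range tbx2 ...
//   });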

// three fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,3), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    break; \
                    case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block3 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    }}}

#if 0
    for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
        block1 \
    } \
    for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
        block2 \
    } \
    for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
        block3 \
    } \
    }}}
#endif

#else
#define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            break; \
            case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
                block3 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
            block1 \
        } \
        for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
            block2 \
        } \
        for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
            block3 \
        } \
    }}}
#endif

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        int amrex_i_nthreads_per_block = amrex_i_ec.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_ec.numBlocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                         sycl::range<1>(amrex_i_nthreads_per_block)), \
                [=] (sycl::nd_item<1> amrex_i_item) \
                [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE: cannot call device function from host"); \
    }}}
#else
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(TN,TI,block) \
    { auto const& amrex_i_tn = TN; \
    if (!amrex::isEmpty(amrex_i_tn)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \
                block \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE: cannot call device function from host"); \
    }}}
#endif
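
// Example usage (illustrative sketch; `bx` and `tbx` are hypothetical). Unlike
// the HOST_DEVICE variants above, the DEVICE variants have no host fallback:
// calling them outside a launch region aborts.
//
//   AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE(bx, tbx,
//   {
//       // ... device-only work over the sub-range tbx ...
//   });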

// two fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,2), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2: cannot call device function from host"); \
    }}}
#else
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \
                                          amrex_i_ec2.numBlocks.x); \
        amrex_i_nblocks.y = 2; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2: cannot call device function from host"); \
    }}}
#endif

// three fused launches
#ifdef AMREX_USE_SYCL
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        int amrex_i_nthreads_per_block = amrex_i_ec1.numThreads.x; \
        int amrex_i_nthreads_total = amrex_i_nthreads_per_block * amrex_i_nblocks.x; \
        auto& amrex_i_q = amrex::Gpu::Device::streamQueue(); \
        try { \
            amrex_i_q.submit([&] (sycl::handler& amrex_i_h) { \
                amrex_i_h.parallel_for(sycl::nd_range<2>(sycl::range<2>(amrex_i_nthreads_total,3), \
                                                         sycl::range<2>(amrex_i_nthreads_per_block,1)), \
                [=] (sycl::nd_item<2> amrex_i_item) \
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                { \
                    switch (amrex_i_item.get_group(1)) { \
                    case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block1 \
                    } \
                    break; \
                    case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block2 \
                    } \
                    break; \
                    case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
                        block3 \
                    } \
                    } \
                }); \
            }); \
        } catch (sycl::exception const& ex) { \
            amrex::Abort(std::string("LAUNCH: ")+ex.what()+"!!!!!"); \
        } \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3: cannot call device function from host"); \
    }}}
#else
#define AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
    { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
    if (!amrex::isEmpty(amrex_i_tn1) || !amrex::isEmpty(amrex_i_tn2) || !amrex::isEmpty(amrex_i_tn3)) { \
    if (amrex::Gpu::inLaunchRegion()) \
    { \
        const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \
        const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \
        const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \
        dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \
                                                     amrex_i_ec2.numBlocks.x), \
                                          amrex_i_ec3.numBlocks.x); \
        amrex_i_nblocks.y = 3; \
        AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \
        [=] AMREX_GPU_DEVICE () noexcept { \
            switch (blockIdx.y) { \
            case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \
                block1 \
            } \
            break; \
            case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \
                block2 \
            } \
            break; \
            case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \
                block3 \
            } \
            } \
        }); \
        AMREX_GPU_ERROR_CHECK(); \
    } \
    else { \
        amrex::Abort("AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3: cannot call device function from host"); \
    }}}
#endif

// FOR_1D

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \
{ \
    auto const& amrex_i_n = n; \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
    } else { \
        auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
        AMREX_PRAGMA_SIMD \
        for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \
    } \
}
#endif

#define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \
{ \
    using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
    amrex::ParallelFor(n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
}
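
// Example usage (illustrative sketch; `np` and `p` are hypothetical: an
// element count and a pointer to device-accessible memory).
//
//   AMREX_GPU_HOST_DEVICE_FOR_1D(np, i,
//   {
//       p[i] = 0.0;
//   });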

// FOR_3D

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_HOST_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_GPU_HOST_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    auto const& amrex_i_box = box; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
    } \
}
#endif

#define AMREX_GPU_DEVICE_FOR_3D(box,i,j,k,block) \
{ \
    amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
}
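
// Example usage (illustrative sketch; `bx` is a hypothetical amrex::Box and
// `a` a hypothetical amrex::Array4<amrex::Real>).
//
//   AMREX_GPU_HOST_DEVICE_FOR_3D(bx, i, j, k,
//   {
//       a(i,j,k) += 1.0;
//   });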

// FOR_4D

#ifdef AMREX_USE_SYCL
#define AMREX_GPU_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
    } \
}
#else
#define AMREX_GPU_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    auto const& amrex_i_box = box; \
    auto const& amrex_i_ncomp = ncomp; \
    if (amrex::Gpu::inLaunchRegion()) { \
        amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
    } else { \
        amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
    } \
}
#endif

#define AMREX_GPU_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \
{ \
    amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
}
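
// Example usage (illustrative sketch; `bx`, `ncomp`, `a`, and `b` are
// hypothetical: a Box, a component count, and two Array4<amrex::Real> views).
//
//   AMREX_GPU_HOST_DEVICE_FOR_4D(bx, ncomp, i, j, k, n,
//   {
//       a(i,j,k,n) = b(i,j,k,n);
//   });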

#define AMREX_GPU_DEVICE_PARALLEL_FOR_1D(...) AMREX_GPU_DEVICE_FOR_1D(__VA_ARGS__)
#define AMREX_GPU_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_DEVICE_FOR_3D(__VA_ARGS__)
#define AMREX_GPU_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_DEVICE_FOR_4D(__VA_ARGS__)

#define AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_1D(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__)
#define AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__)
#define AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__)