Block-Structured AMR Software Framework
AMReX_GpuLaunch.nolint.H — macro definitions excluded from clang-tidy linting.
1 // Do not include this header anywhere other than AMReX_GpuLaunch.H.
2 // The purpose of this file is to avoid clang-tidy.
3 
// Classic variadic-macro arity dispatch: the caller's __VA_ARGS__ pad the
// parameter list _1.._9, so NAME binds to whichever launcher macro sits at
// the slot matching the actual argument count.
4 #define AMREX_GET_LAUNCH_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
// Device-lambda launch over 1, 2, or 3 (box, tilebox, body) triples:
// 3 args -> ..._RANGE, 6 args -> ..._RANGE_2, 9 args -> ..._RANGE_3.
// Any other argument count selects AMREX_WRONG_NUM_ARGS, which is expected
// to produce a compile-time error.
5 #define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\
6  AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3, \
7  AMREX_WRONG_NUM_ARGS, \
8  AMREX_WRONG_NUM_ARGS, \
9  AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2, \
10  AMREX_WRONG_NUM_ARGS, \
11  AMREX_WRONG_NUM_ARGS, \
12  AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE, \
13  AMREX_WRONG_NUM_ARGS, \
14  AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)
15 
// Host+device counterpart of AMREX_LAUNCH_DEVICE_LAMBDA: identical arity
// dispatch (3/6/9 arguments), but routed to the HOST_DEVICE launcher macros.
16 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\
17  AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3, \
18  AMREX_WRONG_NUM_ARGS, \
19  AMREX_WRONG_NUM_ARGS, \
20  AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2, \
21  AMREX_WRONG_NUM_ARGS, \
22  AMREX_WRONG_NUM_ARGS, \
23  AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE, \
24  AMREX_WRONG_NUM_ARGS, \
25  AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)
26 
// Dimension-aware launch macros. The caller always supplies all three
// (box, tilebox, body) triples; at AMREX_SPACEDIM == 1 or 2 the trailing
// triples (b*/c*) are accepted but silently dropped, so the same call site
// compiles for every spatial dimension.
27 #if (AMREX_SPACEDIM == 1)
// 1D: launch only the first triple.
28 #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE (a1,a2,a3)
29 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(a1,a2,a3)
30 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM_FLAG(fl,a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_FLAG(fl,a1,a2,a3)
31 #elif (AMREX_SPACEDIM == 2)
// 2D: launch the first two triples.
32 #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_2 (a1,a2,a3,b1,b2,b3)
33 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(a1,a2,a3,b1,b2,b3)
34 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM_FLAG(fl,a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2_FLAG(fl,a1,a2,a3,b1,b2,b3)
35 #elif (AMREX_SPACEDIM == 3)
// 3D: all three triples are used, so plain __VA_ARGS__ forwarding suffices.
36 #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(...) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3 (__VA_ARGS__)
37 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM(...) AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(__VA_ARGS__)
38 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM_FLAG(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3_FLAG(__VA_ARGS__)
39 #endif
40 
// Short-name aliases: each AMREX_* macro below simply forwards to its
// fully-spelled AMREX_GPU_* counterpart (defined in AMReX_GpuLaunch.H).
// Device-only loops.
41 #define AMREX_FOR_1D(...) AMREX_GPU_DEVICE_FOR_1D(__VA_ARGS__)
42 #define AMREX_FOR_3D(...) AMREX_GPU_DEVICE_FOR_3D(__VA_ARGS__)
43 #define AMREX_FOR_4D(...) AMREX_GPU_DEVICE_FOR_4D(__VA_ARGS__)
// Device-only parallel loops.
45 #define AMREX_PARALLEL_FOR_1D(...) AMREX_GPU_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__)
46 #define AMREX_PARALLEL_FOR_3D(...) AMREX_GPU_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__)
47 #define AMREX_PARALLEL_FOR_4D(...) AMREX_GPU_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
// Host-or-device loops.
49 #define AMREX_HOST_DEVICE_FOR_1D(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__)
50 #define AMREX_HOST_DEVICE_FOR_3D(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__)
51 #define AMREX_HOST_DEVICE_FOR_4D(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__)
// Host-or-device parallel loops.
53 #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__)
54 #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__)
55 #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
56 
// ============================================================
// GPU builds (non-SYCL branch): each *_FLAG macro checks the runtime flag
// `where_to_run` and Gpu::inLaunchRegion(); device path uses
// amrex::ParallelFor with a device lambda, otherwise it falls back to a
// host loop over the same body.
// ============================================================
57 #ifdef AMREX_USE_GPU
58 
59 #ifndef AMREX_USE_SYCL
60 
// 1D parallel loop with run-site selection. amrex_i_inttype captures the
// (possibly const) type of `n` so the loop index matches the caller's type.
// Host fallback is a simd-hinted serial loop (AMREX_PRAGMA_SIMD).
// NOTE: the whole expansion is wrapped in { } so the `using` alias stays local.
61 #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \
62  { using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
63  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
64  { \
65  amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \
66  block \
67  ); \
68  } \
69  else { \
70  AMREX_PRAGMA_SIMD \
71  for (amrex_i_inttype i = 0; i < n; ++i) { \
72  block \
73  } \
74  }}
75 
// 3D parallel loop over a box: GPU ParallelFor on the device path, otherwise
// amrex::LoopConcurrentOnCpu on the host.
// NOTE(review): unlike the 1D variant, the expansion is a bare if/else (no
// enclosing braces), so beware dangling-else at call sites.
76 #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \
77  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
78  { \
79  amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept \
80  block \
81  ); \
82  } \
83  else { \
84  amrex::LoopConcurrentOnCpu(box, [=] (int i, int j, int k) noexcept \
85  block \
86  ); \
87  }
88 
// 4D variant: adds a component count `nc` and component index `n`.
89 #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \
90  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
91  { \
92  amrex::ParallelFor(box, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept \
93  block \
94  ); \
95  } \
96  else { \
97  amrex::LoopConcurrentOnCpu(box, nc, [=] (int i, int j, int k, int n) noexcept \
98  block \
99  ); \
100  }
101 
// Non-"PARALLEL" FOR variants: same device path as the PARALLEL versions,
// but the host fallback is a plain serial loop — 1D without AMREX_PRAGMA_SIMD,
// 3D/4D via amrex::LoopOnCpu instead of LoopConcurrentOnCpu.
102 #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \
103  { using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
104  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
105  { \
106  amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \
107  block \
108  ); \
109  } \
110  else { \
111  for (amrex_i_inttype i = 0; i < n; ++i) { \
112  block \
113  } \
114  }}
115 
// 3D: device ParallelFor or host LoopOnCpu (bare if/else expansion).
116 #define AMREX_HOST_DEVICE_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \
117  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
118  { \
119  amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept \
120  block \
121  ); \
122  } \
123  else { \
124  amrex::LoopOnCpu(box, [=] (int i, int j, int k) noexcept \
125  block \
126  ); \
127  }
128 
// 4D: adds component count `nc` / component index `n`.
129 #define AMREX_HOST_DEVICE_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \
130  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
131  { \
132  amrex::ParallelFor(box, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept \
133  block \
134  ); \
135  } \
136  else { \
137  amrex::LoopOnCpu(box, nc, [=] (int i, int j, int k, int n) noexcept \
138  block \
139  ); \
140  }
141 
// Whole-box launch with run-site selection: device path forwards to
// AMREX_LAUNCH_DEVICE_LAMBDA; host path binds the "thread box" name (tbox)
// to the full box and executes `block` once, inline.
142 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_FLAG(where_to_run,box,tbox,block) \
143  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
144  { \
145  AMREX_LAUNCH_DEVICE_LAMBDA(box,tbox,block); \
146  } else { \
147  auto tbox = box; \
148  block; \
149  }
150 
// Single-range variant (identical shape; names match the _RANGE convention).
151 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_FLAG(where_to_run,bx1,tbx1,block1) \
152  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
153  { \
154  AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1); \
155  } else { \
156  auto tbx1 = bx1; \
157  block1; \
158  }
159 
// Two-range variant: one fused device launch; host path runs the two
// blocks sequentially.
160 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2) \
161  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
162  { \
163  AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1,bx2,tbx2,block2); \
164  } else { \
165  auto tbx1 = bx1; \
166  auto tbx2 = bx2; \
167  block1; \
168  block2; \
169  }
170 
// Three-range variant.
171 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3) \
172  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
173  { \
174  AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3); \
175  } else { \
176  auto tbx1 = bx1; \
177  auto tbx2 = bx2; \
178  auto tbx3 = bx3; \
179  block1; \
180  block2; \
181  block3; \
182  }
183 
184 #else
185 // xxxxx SYCL todo: host disabled in host device
// ============================================================
// SYCL branch: the device path is identical to the non-SYCL branch, but the
// host fallback is deliberately disabled — selecting RunOn::Host (or being
// outside a launch region) calls amrex::Abort at runtime, because compiling
// the host versions was deemed too slow for the Intel/SYCL toolchain.
// ============================================================
186 
// 1D parallel loop: device-only; host path aborts.
187 #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \
188  { using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
189  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
190  { \
191  amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \
192  block \
193  ); \
194  } \
195  else { \
196  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
197  }}
198 
// 3D parallel loop: device-only; host path aborts.
199 #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \
200  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
201  { \
202  amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept \
203  block \
204  ); \
205  } \
206  else { \
207  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
208  }
209 
// 4D parallel loop: device-only; host path aborts.
210 #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \
211  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
212  { \
213  amrex::ParallelFor(box, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept \
214  block \
215  ); \
216  } \
217  else { \
218  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
219  }
220 
// 1D serial-style loop: device-only; host path aborts.
221 #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \
222  { using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
223  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
224  { \
225  amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \
226  block \
227  ); \
228  } \
229  else { \
230  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
231  }}
232 
// 3D serial-style loop: device-only; host path aborts.
233 #define AMREX_HOST_DEVICE_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \
234  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
235  { \
236  amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept \
237  block \
238  ); \
239  } \
240  else { \
241  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
242  }
243 
// 4D serial-style loop: device-only; host path aborts.
244 #define AMREX_HOST_DEVICE_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \
245  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
246  { \
247  amrex::ParallelFor(box, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept \
248  block \
249  ); \
250  } \
251  else { \
252  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
253  }
254 
// Whole-box launch: device-only; host path aborts.
255 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_FLAG(where_to_run,box,tbox,block) \
256  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
257  { \
258  AMREX_LAUNCH_DEVICE_LAMBDA(box,tbox,block); \
259  } else { \
260  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
261  }
262 
// Single-range launch: device-only; host path aborts.
263 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_FLAG(where_to_run,bx1,tbx1,block1) \
264  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
265  { \
266  AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1); \
267  } else { \
268  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
269  }
270 
// Two-range launch: device-only; host path aborts.
271 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2) \
272  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
273  { \
274  AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1,bx2,tbx2,block2); \
275  } else { \
276  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
277  }
278 
// Three-range launch: device-only; host path aborts.
279 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3) \
280  if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \
281  { \
282  AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3); \
283  } else { \
284  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \
285  }
286 
287 #endif
288 
289 #else
// ============================================================
// CPU-only builds (AMREX_USE_GPU undefined): no run-site branching is
// possible, so each *_FLAG macro discards `where_to_run` via
// amrex::ignore_unused (suppressing unused-parameter warnings) and always
// executes the host loop directly.
// ============================================================
290 
// 1D parallel loop: simd-hinted serial loop on the host.
291 #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \
292  { using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
293  amrex::ignore_unused(where_to_run); \
294  AMREX_PRAGMA_SIMD \
295  for (amrex_i_inttype i = 0; i < n; ++i) { \
296  block \
297  }}
298 
// 3D parallel loop: LoopConcurrentOnCpu over the box.
299 #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \
300  amrex::ignore_unused(where_to_run); \
301  amrex::LoopConcurrentOnCpu(box, [=] (int i, int j, int k) noexcept \
302  block \
303  );
304 
// 4D parallel loop: LoopConcurrentOnCpu over box and components.
305 #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \
306  amrex::ignore_unused(where_to_run); \
307  amrex::LoopConcurrentOnCpu(box, nc, [=] (int i, int j, int k, int n) noexcept \
308  block \
309  );
310 
// 1D loop: plain serial loop (no simd hint).
311 #define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \
312  { using amrex_i_inttype = std::remove_const_t<decltype(n)>; \
313  amrex::ignore_unused(where_to_run); \
314  for (amrex_i_inttype i = 0; i < n; ++i) { \
315  block \
316  }}
317 
// 3D loop: LoopOnCpu over the box.
318 #define AMREX_HOST_DEVICE_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \
319  amrex::ignore_unused(where_to_run); \
320  amrex::LoopOnCpu(box, [=] (int i, int j, int k) noexcept \
321  block \
322  );
323 
// 4D loop: LoopOnCpu over box and components.
324 #define AMREX_HOST_DEVICE_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \
325  amrex::ignore_unused(where_to_run); \
326  amrex::LoopOnCpu(box, nc, [=] (int i, int j, int k, int n) noexcept \
327  block \
328  );
329 
// Whole-box launch: bind tbox to the full box and run the body once inline.
330 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_FLAG(where_to_run,box,tbox,block) \
331  amrex::ignore_unused(where_to_run); \
332  { \
333  auto tbox = box; \
334  block; \
335  }
336 
// Single-range launch: host-only inline execution.
337 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_FLAG(where_to_run,bx1,tbx1,block1) \
338  amrex::ignore_unused(where_to_run); \
339  { \
340  auto tbx1 = bx1; \
341  block1; \
342  }
343 
// Two-range launch: the two blocks run sequentially on the host.
344 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2) \
345  amrex::ignore_unused(where_to_run); \
346  { \
347  auto tbx1 = bx1; \
348  auto tbx2 = bx2; \
349  block1; \
350  block2; \
351  }
352 
// Three-range launch: the three blocks run sequentially on the host.
353 #define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3) \
354  amrex::ignore_unused(where_to_run); \
355  { \
356  auto tbx1 = bx1; \
357  auto tbx2 = bx2; \
358  auto tbx3 = bx3; \
359  block1; \
360  block2; \
361  block3; \
362  }
363 
364 #endif