AMReX_GpuLaunchFunctsG.H
1 #ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2 #define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3 #include <AMReX_Config.H>
4 
5 namespace amrex {
6 
7 namespace detail {
8 
9  // call_f_scalar_handler
10 
11  template <typename F, typename N>
13  auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
14  noexcept -> decltype(f(0))
15  {
16  f(i);
17  }
18 
19  template <typename F, typename N>
21  auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
22  noexcept -> decltype(f(0,Gpu::Handler{}))
23  {
24  f(i, handler);
25  }
26 
27  // call_f_intvect_inner
28 
29  template <typename F, std::size_t...Ns, class...Args>
31  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
32  noexcept -> decltype(f(0, 0, 0, args...))
33  {
34  f(iv[0], 0, 0, args...);
35  }
36 
37  template <typename F, std::size_t...Ns, class...Args>
39  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
40  noexcept -> decltype(f(0, 0, 0, args...))
41  {
42  f(iv[0], iv[1], 0, args...);
43  }
44 
45  template <typename F, int dim, std::size_t...Ns, class...Args>
47  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
48  noexcept -> decltype(f(iv, args...))
49  {
50  f(iv, args...);
51  }
52 
53  template <typename F, int dim, std::size_t...Ns, class...Args>
55  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
56  noexcept -> decltype(f(iv[Ns]..., args...))
57  {
58  f(iv[Ns]..., args...);
59  }
60 
61  // call_f_intvect
62 
63  template <typename F, int dim>
65  auto call_f_intvect (F const& f, IntVectND<dim> iv)
66  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
67  {
68  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
69  }
70 
71  // call_f_intvect_engine
72 
73  template <typename F, int dim>
75  auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
76  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
77  {
78  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
79  }
80 
81  // call_f_intvect_handler
82 
83  template <typename F, int dim>
 85  auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
 86  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
87  {
88  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
89  }
90 
91  template <typename F, int dim>
93  auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
94  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
95  {
96  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
97  }
98 
99  // call_f_intvect_ncomp
100 
101  template <typename F, typename T, int dim>
103  auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
104  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
105  {
106  for (T n = 0; n < ncomp; ++n) {
107  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
108  }
109  }
110 
111  // call_f_intvect_ncomp_engine
112 
113  template <typename F, typename T, int dim>
115  auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
116  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
117  {
118  for (T n = 0; n < ncomp; ++n) {
119  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
120  }
121  }
122 
123  // call_f_intvect_ncomp_handler
124 
125  template <typename F, typename T, int dim>
127  auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
128  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
129  {
130  for (T n = 0; n < ncomp; ++n) {
131  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
132  }
133  }
134 
135  template <typename F, typename T, int dim>
137  auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
138  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
139  {
140  for (T n = 0; n < ncomp; ++n) {
141  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
142  }
143  }
144 
145 }
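
 // The detail::call_f_* helpers above use the trailing decltype (expression SFINAE) to select,
 // at compile time, whichever lambda signature the caller actually provided. A minimal usage
 // sketch of the two most common forms accepted by ParallelFor; the pointer p and the Array4 a
 // are assumptions for illustration, not part of this file:
 //
 //     // 1D-index form: the helper invokes f(i).
 //     amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept { p[i] = 0.0; });
 //
 //     // Cell form over a Box: the helper expands the IntVect into f(i,j,k);
 //     // for 1D and 2D boxes the unused indices are passed as 0.
 //     amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { a(i,j,k) = 0.0; });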
146 
147 #ifdef AMREX_USE_SYCL
148 
149 template <typename L>
150 void single_task (gpuStream_t stream, L const& f) noexcept
151 {
152  auto& q = *(stream.queue);
153  try {
154  q.submit([&] (sycl::handler& h) {
155  h.single_task([=] () { f(); });
156  });
157  } catch (sycl::exception const& ex) {
158  amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
159  }
160 }
161 
162 template<typename L>
163 void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
164  gpuStream_t stream, L const& f) noexcept
165 {
166  const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
167  const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
168  / sizeof(unsigned long long);
169  auto& q = *(stream.queue);
170  try {
171  q.submit([&] (sycl::handler& h) {
172  sycl::local_accessor<unsigned long long>
173  shared_data(sycl::range<1>(shared_mem_numull), h);
174  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
175  sycl::range<1>(nthreads_per_block)),
176  [=] (sycl::nd_item<1> item)
177  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
178  {
179  f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
180  });
181  });
182  } catch (sycl::exception const& ex) {
183  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
184  }
185 }
186 
187 template<typename L>
188 void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
189 {
190  const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
191  auto& q = *(stream.queue);
192  try {
193  q.submit([&] (sycl::handler& h) {
194  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
195  sycl::range<1>(nthreads_per_block)),
196  [=] (sycl::nd_item<1> item)
197  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
198  {
199  f(item);
200  });
201  });
202  } catch (sycl::exception const& ex) {
203  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
204  }
205 }
206 
207 template <int MT, typename L>
208 void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
209  L const& f) noexcept
210 {
211  const auto nthreads_total = MT * std::size_t(nblocks);
212  const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
213  / sizeof(unsigned long long);
214  auto& q = *(stream.queue);
215  try {
216  q.submit([&] (sycl::handler& h) {
217  sycl::local_accessor<unsigned long long>
218  shared_data(sycl::range<1>(shared_mem_numull), h);
219  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
220  sycl::range<1>(MT)),
221  [=] (sycl::nd_item<1> item)
222  [[sycl::reqd_work_group_size(MT)]]
223  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
224  {
225  f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
226  });
227  });
228  } catch (sycl::exception const& ex) {
229  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
230  }
231 }
232 
233 template <int MT, typename L>
234 void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
235 {
236  const auto nthreads_total = MT * std::size_t(nblocks);
237  auto& q = *(stream.queue);
238  try {
239  q.submit([&] (sycl::handler& h) {
240  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
241  sycl::range<1>(MT)),
242  [=] (sycl::nd_item<1> item)
243  [[sycl::reqd_work_group_size(MT)]]
244  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
245  {
246  f(item);
247  });
248  });
249  } catch (sycl::exception const& ex) {
250  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
251  }
252 }
253 
254 template<int MT, typename T, typename L>
255 void launch (T const& n, L const& f) noexcept
256 {
257  if (amrex::isEmpty(n)) { return; }
258  const auto ec = Gpu::makeExecutionConfig<MT>(n);
259  const auto nthreads_per_block = ec.numThreads.x;
260  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
261  auto& q = Gpu::Device::streamQueue();
262  try {
263  q.submit([&] (sycl::handler& h) {
264  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
265  sycl::range<1>(nthreads_per_block)),
266  [=] (sycl::nd_item<1> item)
267  [[sycl::reqd_work_group_size(MT)]]
268  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
269  {
270  for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
271  f(i);
272  }
273  });
274  });
275  } catch (sycl::exception const& ex) {
276  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
277  }
278 }
279 
280 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
281 void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
282 {
283  if (amrex::isEmpty(n)) { return; }
284  const auto ec = Gpu::makeExecutionConfig<MT>(n);
285  const auto nthreads_per_block = ec.numThreads.x;
286  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
287  auto& q = Gpu::Device::streamQueue();
288  try {
289  if (info.hasReduction()) {
290  q.submit([&] (sycl::handler& h) {
291  sycl::local_accessor<unsigned long long>
292  shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
293  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
294  sycl::range<1>(nthreads_per_block)),
295  [=] (sycl::nd_item<1> item)
296  [[sycl::reqd_work_group_size(MT)]]
297  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
298  {
299  for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
300  i < std::size_t(n); i += stride) {
301  int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
302  item.get_local_range(0));
303  detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
304  n_active_threads});
305  }
306  });
307  });
308  } else {
309  q.submit([&] (sycl::handler& h) {
310  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
311  sycl::range<1>(nthreads_per_block)),
312  [=] (sycl::nd_item<1> item)
313  [[sycl::reqd_work_group_size(MT)]]
314  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
315  {
316  for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
317  i < std::size_t(n); i += stride) {
318  detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
319  }
320  });
321  });
322  }
323  } catch (sycl::exception const& ex) {
324  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
325  }
326 }
327 
328 template <int MT, typename L, int dim>
329 void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
330 {
331  if (amrex::isEmpty(box)) { return; }
332  const BoxIndexerND<dim> indexer(box);
333  const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
334  const auto nthreads_per_block = ec.numThreads.x;
335  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
336  auto& q = Gpu::Device::streamQueue();
337  try {
338  if (info.hasReduction()) {
339  q.submit([&] (sycl::handler& h) {
340  sycl::local_accessor<unsigned long long>
341  shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
342  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
343  sycl::range<1>(nthreads_per_block)),
344  [=] (sycl::nd_item<1> item)
345  [[sycl::reqd_work_group_size(MT)]]
346  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
347  {
348  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
349  icell < indexer.numPts(); icell += stride) {
350  auto iv = indexer.intVect(icell);
351  int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
352  std::uint64_t(item.get_local_range(0)));
353  detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
354  n_active_threads});
355  }
356  });
357  });
358  } else {
359  q.submit([&] (sycl::handler& h) {
360  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
361  sycl::range<1>(nthreads_per_block)),
362  [=] (sycl::nd_item<1> item)
363  [[sycl::reqd_work_group_size(MT)]]
364  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
365  {
366  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
367  icell < indexer.numPts(); icell += stride) {
368  auto iv = indexer.intVect(icell);
369  detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
370  }
371  });
372  });
373  }
374  } catch (sycl::exception const& ex) {
375  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
376  }
377 }
378 
379 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
380 void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
381 {
382  if (amrex::isEmpty(box)) { return; }
383  const BoxIndexerND<dim> indexer(box);
384  const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
385  const auto nthreads_per_block = ec.numThreads.x;
386  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
387  auto& q = Gpu::Device::streamQueue();
388  try {
389  if (info.hasReduction()) {
390  q.submit([&] (sycl::handler& h) {
391  sycl::local_accessor<unsigned long long>
392  shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
393  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
394  sycl::range<1>(nthreads_per_block)),
395  [=] (sycl::nd_item<1> item)
396  [[sycl::reqd_work_group_size(MT)]]
397  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
398  {
399  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
400  icell < indexer.numPts(); icell += stride) {
401  auto iv = indexer.intVect(icell);
402  int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
403  std::uint64_t(item.get_local_range(0)));
404  detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
405  Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
406  n_active_threads});
407  }
408  });
409  });
410  } else {
411  q.submit([&] (sycl::handler& h) {
412  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
413  sycl::range<1>(nthreads_per_block)),
414  [=] (sycl::nd_item<1> item)
415  [[sycl::reqd_work_group_size(MT)]]
416  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
417  {
418  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
419  icell < indexer.numPts(); icell += stride) {
420  auto iv = indexer.intVect(icell);
421  detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
422  }
423  });
424  });
425  }
426  } catch (sycl::exception const& ex) {
427  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
428  }
429 }
430 
431 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
432 void ParallelForRNG (T n, L const& f) noexcept
433 {
434  if (amrex::isEmpty(n)) { return; }
435  const auto ec = Gpu::ExecutionConfig(n);
436  const auto nthreads_per_block = ec.numThreads.x;
437  const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
438  auto& q = Gpu::Device::streamQueue();
439  auto& engdescr = *(getRandEngineDescriptor());
440  try {
441  q.submit([&] (sycl::handler& h) {
442  auto engine_acc = engdescr.get_access(h);
443  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
444  sycl::range<1>(nthreads_per_block)),
445  [=] (sycl::nd_item<1> item)
446  [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
447  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
448  {
449  auto const tid = item.get_global_id(0);
450  auto engine = engine_acc.load(tid);
451  RandomEngine rand_eng{&engine};
452  for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
453  f(T(i),rand_eng);
454  }
455  engine_acc.store(engine, tid);
456  });
457  });
458  q.wait_and_throw(); // because next launch might be on a different queue
459  } catch (sycl::exception const& ex) {
460  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
461  }
462 }
463 
464 template <typename L, int dim>
465 void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
466 {
467  if (amrex::isEmpty(box)) { return; }
468  const BoxIndexerND<dim> indexer(box);
469  const auto ec = Gpu::ExecutionConfig(box.numPts());
470  const auto nthreads_per_block = ec.numThreads.x;
471  const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
472  auto& q = Gpu::Device::streamQueue();
473  auto& engdescr = *(getRandEngineDescriptor());
474  try {
475  q.submit([&] (sycl::handler& h) {
476  auto engine_acc = engdescr.get_access(h);
477  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
478  sycl::range<1>(nthreads_per_block)),
479  [=] (sycl::nd_item<1> item)
480  [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
481  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
482  {
483  auto const tid = item.get_global_id(0);
484  auto engine = engine_acc.load(tid);
485  RandomEngine rand_eng{&engine};
486  for (std::uint64_t icell = tid, stride = item.get_global_range(0);
487  icell < indexer.numPts(); icell += stride) {
488  auto iv = indexer.intVect(icell);
489  detail::call_f_intvect_engine(f,iv,rand_eng);
490  }
491  engine_acc.store(engine, tid);
492  });
493  });
494  q.wait_and_throw(); // because next launch might be on a different queue
495  } catch (sycl::exception const& ex) {
496  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
497  }
498 }
499 
500 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
501 void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
502 {
503  if (amrex::isEmpty(box)) { return; }
504  const BoxIndexerND<dim> indexer(box);
505  const auto ec = Gpu::ExecutionConfig(box.numPts());
506  const auto nthreads_per_block = ec.numThreads.x;
507  const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
508  auto& q = Gpu::Device::streamQueue();
509  auto& engdescr = *(getRandEngineDescriptor());
510  try {
511  q.submit([&] (sycl::handler& h) {
512  auto engine_acc = engdescr.get_access(h);
513  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
514  sycl::range<1>(nthreads_per_block)),
515  [=] (sycl::nd_item<1> item)
516  [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
517  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
518  {
519  auto const tid = item.get_global_id(0);
520  auto engine = engine_acc.load(tid);
521  RandomEngine rand_eng{&engine};
522  for (std::uint64_t icell = tid, stride = item.get_global_range(0);
523  icell < indexer.numPts(); icell += stride) {
524  auto iv = indexer.intVect(icell);
525  detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
526  }
527  engine_acc.store(engine, tid);
528  });
529  });
530  q.wait_and_throw(); // because next launch might be on a different queue
531  } catch (sycl::exception const& ex) {
532  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
533  }
534 }
535 
536 template <int MT, typename L1, typename L2, int dim>
537 void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
538 {
539  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
540  const BoxIndexerND<dim> indexer1(box1);
541  const BoxIndexerND<dim> indexer2(box2);
542  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
543  const auto nthreads_per_block = ec.numThreads.x;
544  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
545  auto& q = Gpu::Device::streamQueue();
546  try {
547  q.submit([&] (sycl::handler& h) {
548  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549  sycl::range<1>(nthreads_per_block)),
550  [=] (sycl::nd_item<1> item)
551  [[sycl::reqd_work_group_size(MT)]]
552  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
553  {
554  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
555  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
556  icell < ncells; icell += stride) {
557  if (icell < indexer1.numPts()) {
558  auto iv = indexer1.intVect(icell);
559  detail::call_f_intvect(f1,iv);
560  }
561  if (icell < indexer2.numPts()) {
562  auto iv = indexer2.intVect(icell);
563  detail::call_f_intvect(f2,iv);
564  }
565  }
566  });
567  });
568  } catch (sycl::exception const& ex) {
569  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
570  }
571 }
572 
573 template <int MT, typename L1, typename L2, typename L3, int dim>
574 void ParallelFor (Gpu::KernelInfo const& /*info*/,
575  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
576  L1&& f1, L2&& f2, L3&& f3) noexcept
577 {
578  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
579  const BoxIndexerND<dim> indexer1(box1);
580  const BoxIndexerND<dim> indexer2(box2);
581  const BoxIndexerND<dim> indexer3(box3);
582  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
583  const auto nthreads_per_block = ec.numThreads.x;
584  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
585  auto& q = Gpu::Device::streamQueue();
586  try {
587  q.submit([&] (sycl::handler& h) {
588  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
589  sycl::range<1>(nthreads_per_block)),
590  [=] (sycl::nd_item<1> item)
591  [[sycl::reqd_work_group_size(MT)]]
592  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
593  {
594  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
595  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
596  icell < ncells; icell += stride) {
597  if (icell < indexer1.numPts()) {
598  auto iv = indexer1.intVect(icell);
599  detail::call_f_intvect(f1,iv);
600  }
601  if (icell < indexer2.numPts()) {
602  auto iv = indexer2.intVect(icell);
603  detail::call_f_intvect(f2,iv);
604  }
605  if (icell < indexer3.numPts()) {
606  auto iv = indexer3.intVect(icell);
607  detail::call_f_intvect(f3,iv);
608  }
609  }
610  });
611  });
612  } catch (sycl::exception const& ex) {
613  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
614  }
615 }
616 
617 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
618  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
619  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
620 void ParallelFor (Gpu::KernelInfo const& /*info*/,
621  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
622  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
623 {
624  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
625  const BoxIndexerND<dim> indexer1(box1);
626  const BoxIndexerND<dim> indexer2(box2);
627  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
628  const auto nthreads_per_block = ec.numThreads.x;
629  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
630  auto& q = Gpu::Device::streamQueue();
631  try {
632  q.submit([&] (sycl::handler& h) {
633  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
634  sycl::range<1>(nthreads_per_block)),
635  [=] (sycl::nd_item<1> item)
636  [[sycl::reqd_work_group_size(MT)]]
637  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
638  {
639  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
640  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
641  icell < ncells; icell += stride) {
642  if (icell < indexer1.numPts()) {
643  auto iv = indexer1.intVect(icell);
644  detail::call_f_intvect_ncomp(f1,iv,ncomp1);
645  }
646  if (icell < indexer2.numPts()) {
647  auto iv = indexer2.intVect(icell);
648  detail::call_f_intvect_ncomp(f2,iv,ncomp2);
649  }
650  }
651  });
652  });
653  } catch (sycl::exception const& ex) {
654  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
655  }
656 }
657 
658 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
659  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
660  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
661  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
662 void ParallelFor (Gpu::KernelInfo const& /*info*/,
663  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
664  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
665  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
666 {
667  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
668  const BoxIndexerND<dim> indexer1(box1);
669  const BoxIndexerND<dim> indexer2(box2);
670  const BoxIndexerND<dim> indexer3(box3);
671  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
672  const auto nthreads_per_block = ec.numThreads.x;
673  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
674  auto& q = Gpu::Device::streamQueue();
675  try {
676  q.submit([&] (sycl::handler& h) {
677  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
678  sycl::range<1>(nthreads_per_block)),
679  [=] (sycl::nd_item<1> item)
680  [[sycl::reqd_work_group_size(MT)]]
681  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
682  {
683  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
684  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
685  icell < ncells; icell += stride) {
686  if (icell < indexer1.numPts()) {
687  auto iv = indexer1.intVect(icell);
688  detail::call_f_intvect_ncomp(f1,iv,ncomp1);
689  }
690  if (icell < indexer2.numPts()) {
691  auto iv = indexer2.intVect(icell);
692  detail::call_f_intvect_ncomp(f2,iv,ncomp2);
693  }
694  if (icell < indexer3.numPts()) {
695  auto iv = indexer3.intVect(icell);
696  detail::call_f_intvect_ncomp(f3,iv,ncomp3);
697  }
698  }
699  });
700  });
701  } catch (sycl::exception const& ex) {
702  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
703  }
704 }
705 
706 #else
707 // CUDA or HIP
708 
709 template <typename L>
710 void single_task (gpuStream_t stream, L const& f) noexcept
711 {
712  AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream,
713  [=] AMREX_GPU_DEVICE () noexcept {f();});
715 }
716 
717 template <int MT, typename L>
718 void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
719  L const& f) noexcept
720 {
721  AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
722  [=] AMREX_GPU_DEVICE () noexcept { f(); });
724 }
725 
726 template <int MT, typename L>
727 void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
728 {
729  AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream,
730  [=] AMREX_GPU_DEVICE () noexcept { f(); });
732 }
733 
734 template<typename L>
735 void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
736  gpuStream_t stream, L const& f) noexcept
737 {
738  AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
739  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
740  stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
742 }
743 
744 template<typename L>
745 void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
746 {
747  launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
748 }
749 
750 template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
751 void launch (T const& n, L const& f) noexcept
752 {
753  static_assert(sizeof(T) >= 2);
754  if (amrex::isEmpty(n)) { return; }
755  const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
756  for (auto const& ec : nec) {
757  const T start_idx = T(ec.start_idx);
758  const T nleft = n - start_idx;
759  AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
760  [=] AMREX_GPU_DEVICE () noexcept {
761  // This will not overflow, even though nblocks*MT might.
762  auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
763  if (tid < nleft) {
764  f(tid+start_idx);
765  }
766  });
767  }
769 }
770 
771 template<int MT, int dim, typename L>
772 void launch (BoxND<dim> const& box, L const& f) noexcept
773 {
774  if (box.isEmpty()) { return; }
775  const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
776  const BoxIndexerND<dim> indexer(box);
777  const auto type = box.ixType();
778  for (auto const& ec : nec) {
779  const auto start_idx = std::uint64_t(ec.start_idx);
780  AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
781  [=] AMREX_GPU_DEVICE () noexcept {
782  auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
783  if (icell < indexer.numPts()) {
784  auto iv = indexer.intVect(icell);
785  f(BoxND<dim>(iv,iv,type));
786  }
787  });
788  }
790 }
791 
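 // launch(box, f) hands each GPU thread a single-cell BoxND built from its cell. A minimal usage
 // sketch, assuming a 3D Box named box and an Array4 a that are not part of this file:
 //
 //     amrex::launch<AMREX_GPU_MAX_THREADS>(box,
 //         [=] AMREX_GPU_DEVICE (amrex::Box const& tbx) noexcept
 //         {
 //             // In this implementation tbx covers exactly one cell of box.
 //             const auto lo = amrex::lbound(tbx);
 //             a(lo.x, lo.y, lo.z) = 0.0;
 //         });
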
792 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
793 std::enable_if_t<MaybeDeviceRunnable<L>::value>
794 ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
795 {
796  static_assert(sizeof(T) >= 2);
797  if (amrex::isEmpty(n)) { return; }
798  const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
799  for (auto const& ec : nec) {
800  const T start_idx = T(ec.start_idx);
801  const T nleft = n - start_idx;
802  AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
803  [=] AMREX_GPU_DEVICE () noexcept {
804  // This will not overflow, even though nblocks*MT might.
805  auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
806  if (tid < nleft) {
807  detail::call_f_scalar_handler(f, tid+start_idx,
808  Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
809  (std::uint64_t)MT)));
810  }
811  });
812  }
814 }
815 
816 template <int MT, typename L, int dim>
817 std::enable_if_t<MaybeDeviceRunnable<L>::value>
818 ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
819 {
820  if (amrex::isEmpty(box)) { return; }
821  const BoxIndexerND<dim> indexer(box);
822  const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
823  for (auto const& ec : nec) {
824  const auto start_idx = std::uint64_t(ec.start_idx);
825  AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
826  [=] AMREX_GPU_DEVICE () noexcept {
827  auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
828  if (icell < indexer.numPts()) {
829  auto iv = indexer.intVect(icell);
 830  detail::call_f_intvect_handler(f, iv,
 831  Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
832  (std::uint64_t)MT)));
833  }
834  });
835  }
837 }
838 
839 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
840 std::enable_if_t<MaybeDeviceRunnable<L>::value>
841 ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
842 {
843  if (amrex::isEmpty(box)) { return; }
844  const BoxIndexerND<dim> indexer(box);
845  const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
846  for (auto const& ec : nec) {
847  const auto start_idx = std::uint64_t(ec.start_idx);
848  AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
849  [=] AMREX_GPU_DEVICE () noexcept {
850  auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
851  if (icell < indexer.numPts()) {
852  auto iv = indexer.intVect(icell);
 853  detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
 854  Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
855  (std::uint64_t)MT)));
856  }
857  });
858  }
860 }
861 
862 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
863 std::enable_if_t<MaybeDeviceRunnable<L>::value>
864 ParallelForRNG (T n, L const& f) noexcept
865 {
866  if (amrex::isEmpty(n)) { return; }
867  randState_t* rand_state = getRandState();
868  const auto ec = Gpu::ExecutionConfig(n);
869  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
870  amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
871  ec.numThreads, 0, Gpu::gpuStream(),
872  [=] AMREX_GPU_DEVICE () noexcept {
873  Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
874  RandomEngine engine{&(rand_state[tid])};
875  for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
876  f(T(i),engine);
877  }
878  });
879  Gpu::streamSynchronize(); // To avoid multiple streams using RNG
881 }
882 
883 template <typename L, int dim>
884 std::enable_if_t<MaybeDeviceRunnable<L>::value>
885 ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
886 {
887  if (amrex::isEmpty(box)) { return; }
888  randState_t* rand_state = getRandState();
889  const BoxIndexerND<dim> indexer(box);
890  const auto ec = Gpu::ExecutionConfig(box.numPts());
891  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
892  amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
893  ec.numThreads, 0, Gpu::gpuStream(),
894  [=] AMREX_GPU_DEVICE () noexcept {
895  auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
896  RandomEngine engine{&(rand_state[tid])};
897  for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
898  auto iv = indexer.intVect(icell);
899  detail::call_f_intvect_engine(f, iv, engine);
900  }
901  });
902  Gpu::streamSynchronize(); // To avoid multiple streams using RNG
904 }
905 
906 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
907 std::enable_if_t<MaybeDeviceRunnable<L>::value>
908 ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
909 {
910  if (amrex::isEmpty(box)) { return; }
911  randState_t* rand_state = getRandState();
912  const BoxIndexerND<dim> indexer(box);
913  const auto ec = Gpu::ExecutionConfig(box.numPts());
914  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
915  amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
916  ec.numThreads, 0, Gpu::gpuStream(),
917  [=] AMREX_GPU_DEVICE () noexcept {
918  auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
919  RandomEngine engine{&(rand_state[tid])};
920  for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
921  auto iv = indexer.intVect(icell);
922  detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
923  }
924  });
925  Gpu::streamSynchronize(); // To avoid multiple streams using RNG
927 }
928 
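 // Each ParallelForRNG variant loads a per-thread RNG state, runs the loop, stores the state
 // back, and then calls Gpu::streamSynchronize() so another stream cannot use the same states
 // concurrently. A minimal usage sketch; the Array4 a is an assumption for illustration:
 //
 //     amrex::ParallelForRNG(box,
 //         [=] AMREX_GPU_DEVICE (int i, int j, int k, amrex::RandomEngine const& engine) noexcept
 //         {
 //             a(i,j,k) = amrex::Random(engine); // uniform random number from this thread's engine
 //         });
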
929 template <int MT, typename L1, typename L2, int dim>
930 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
 931 ParallelFor (Gpu::KernelInfo const&,
 932  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
933 {
934  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
935  const BoxIndexerND<dim> indexer1(box1);
936  const BoxIndexerND<dim> indexer2(box2);
937  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
938  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
939  [=] AMREX_GPU_DEVICE () noexcept {
940  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
941  for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
942  icell < ncells; icell += stride) {
943  if (icell < indexer1.numPts()) {
944  auto iv = indexer1.intVect(icell);
945  detail::call_f_intvect(f1, iv);
946  }
947  if (icell < indexer2.numPts()) {
948  auto iv = indexer2.intVect(icell);
949  detail::call_f_intvect(f2, iv);
950  }
951  }
952  });
954 }
955 
956 template <int MT, typename L1, typename L2, typename L3, int dim>
957 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
 958 ParallelFor (Gpu::KernelInfo const&,
 959  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
960  L1&& f1, L2&& f2, L3&& f3) noexcept
961 {
962  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
963  const BoxIndexerND<dim> indexer1(box1);
964  const BoxIndexerND<dim> indexer2(box2);
965  const BoxIndexerND<dim> indexer3(box3);
966  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
967  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
968  [=] AMREX_GPU_DEVICE () noexcept {
969  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
970  for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
971  icell < ncells; icell += stride) {
972  if (icell < indexer1.numPts()) {
973  auto iv = indexer1.intVect(icell);
974  detail::call_f_intvect(f1, iv);
975  }
976  if (icell < indexer2.numPts()) {
977  auto iv = indexer2.intVect(icell);
978  detail::call_f_intvect(f2, iv);
979  }
980  if (icell < indexer3.numPts()) {
981  auto iv = indexer3.intVect(icell);
982  detail::call_f_intvect(f3, iv);
983  }
984  }
985  });
987 }
988 
989 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
990  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
991  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
992 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
 993 ParallelFor (Gpu::KernelInfo const&,
 994  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
995  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
996 {
997  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
998  const BoxIndexerND<dim> indexer1(box1);
999  const BoxIndexerND<dim> indexer2(box2);
1000  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1001  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1002  [=] AMREX_GPU_DEVICE () noexcept {
1003  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1004  for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1005  icell < ncells; icell += stride) {
1006  if (icell < indexer1.numPts()) {
1007  auto iv = indexer1.intVect(icell);
1008  detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1009  }
1010  if (icell < indexer2.numPts()) {
1011  auto iv = indexer2.intVect(icell);
1012  detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1013  }
1014  }
1015  });
1017 }
1018 
1019 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1020  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1021  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1022  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1023 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
 1024 ParallelFor (Gpu::KernelInfo const&,
 1025  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1026  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1027  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1028 {
1029  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1030  const BoxIndexerND<dim> indexer1(box1);
1031  const BoxIndexerND<dim> indexer2(box2);
1032  const BoxIndexerND<dim> indexer3(box3);
1033  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1034  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1035  [=] AMREX_GPU_DEVICE () noexcept {
1036  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1037  for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1038  icell < ncells; icell += stride) {
1039  if (icell < indexer1.numPts()) {
1040  auto iv = indexer1.intVect(icell);
1041  detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1042  }
1043  if (icell < indexer2.numPts()) {
1044  auto iv = indexer2.intVect(icell);
1045  detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1046  }
1047  if (icell < indexer3.numPts()) {
1048  auto iv = indexer3.intVect(icell);
1049  detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1050  }
1051  }
1052  });
1054 }
1055 
1056 #endif
1057 
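 // The overloads below forward to the backend-specific implementations above, filling in
 // AMREX_GPU_MAX_THREADS and, where needed, a default Gpu::KernelInfo{}. A minimal usage
 // sketch of the multi-component form; the Array4 a is an assumption for illustration:
 //
 //     amrex::ParallelFor(box, ncomp,
 //         [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
 //         {
 //             a(i,j,k,n) *= 2.0; // visits every cell of box for each component n in [0, ncomp)
 //         });
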
1058 template <typename L>
1059 void single_task (L&& f) noexcept
1060 {
1061  single_task(Gpu::gpuStream(), std::forward<L>(f));
1062 }
1063 
1064 template<typename T, typename L>
1065 void launch (T const& n, L&& f) noexcept
1066 {
1067  launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1068 }
1069 
1070 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1071 std::enable_if_t<MaybeDeviceRunnable<L>::value>
1072 ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1073 {
1074  ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1075 }
1076 
1077 template <typename L, int dim>
1078 std::enable_if_t<MaybeDeviceRunnable<L>::value>
1079 ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1080 {
1081  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1082 }
1083 
1084 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1085 std::enable_if_t<MaybeDeviceRunnable<L>::value>
1086 ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1087 {
1088  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1089 }
1090 
1091 template <typename L1, typename L2, int dim>
1092 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
 1093 ParallelFor (Gpu::KernelInfo const& info,
 1094  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1095 {
1096  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1097  std::forward<L2>(f2));
1098 }
1099 
1100 template <typename L1, typename L2, typename L3, int dim>
1101 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
 1102 ParallelFor (Gpu::KernelInfo const& info,
 1103  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1104  L1&& f1, L2&& f2, L3&& f3) noexcept
1105 {
1106  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1107  std::forward<L2>(f2), std::forward<L3>(f3));
1108 }
1109 
1110 template <typename T1, typename T2, typename L1, typename L2, int dim,
1111  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1112  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1113 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
 1114 ParallelFor (Gpu::KernelInfo const& info,
 1115  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1116  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1117 {
1118  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1119  box2, ncomp2, std::forward<L2>(f2));
1120 }
1121 
1122 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1123  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1124  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1125  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1126 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
 1127 ParallelFor (Gpu::KernelInfo const& info,
 1128  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1129  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1130  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1131 {
1132  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1133  box2, ncomp2, std::forward<L2>(f2),
1134  box3, ncomp3, std::forward<L3>(f3));
1135 }
1136 
1137 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1138 void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1139 {
1140  ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1141 }
1142 
1143 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1144 void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1145 {
1146  ParallelFor<MT>(info, n,std::forward<L>(f));
1147 }
1148 
1149 template <typename L, int dim>
1150 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1151 {
1152  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1153 }
1154 
1155 template <int MT, typename L, int dim>
1156 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1157 {
1158  ParallelFor<MT>(info, box,std::forward<L>(f));
1159 }
1160 
1161 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1162 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1163 {
1164  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1165 }
1166 
1167 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1168 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1169 {
1170  ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1171 }
1172 
1173 template <typename L1, typename L2, int dim>
1174 void For (Gpu::KernelInfo const& info,
1175  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1176 {
1177  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1178 }
1179 
1180 template <int MT, typename L1, typename L2, int dim>
1181 void For (Gpu::KernelInfo const& info,
1182  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1183 {
1184  ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1185 }
1186 
1187 template <typename L1, typename L2, typename L3, int dim>
1188 void For (Gpu::KernelInfo const& info,
1189  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1190  L1&& f1, L2&& f2, L3&& f3) noexcept
1191 {
1192  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1193 }
1194 
1195 template <int MT, typename L1, typename L2, typename L3, int dim>
1196 void For (Gpu::KernelInfo const& info,
1197  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1198  L1&& f1, L2&& f2, L3&& f3) noexcept
1199 {
1200  ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1201 }
1202 
1203 template <typename T1, typename T2, typename L1, typename L2, int dim,
1204  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1205  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1206 void For (Gpu::KernelInfo const& info,
1207  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1208  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1209 {
1210  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1211 }
1212 
1213 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1214  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1215  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1216 void For (Gpu::KernelInfo const& info,
1217  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1218  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1219 {
1220  ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1221 }
1222 
1223 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1224  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1225  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1226  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1227 void For (Gpu::KernelInfo const& info,
1228  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1229  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1230  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1231 {
1232  ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1233  box1,ncomp1,std::forward<L1>(f1),
1234  box2,ncomp2,std::forward<L2>(f2),
1235  box3,ncomp3,std::forward<L3>(f3));
1236 }
1237 
1238 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1239  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1240  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1241  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1242 void For (Gpu::KernelInfo const& info,
1243  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1244  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1245  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1246 {
1247  ParallelFor<MT>(info,
1248  box1,ncomp1,std::forward<L1>(f1),
1249  box2,ncomp2,std::forward<L2>(f2),
1250  box3,ncomp3,std::forward<L3>(f3));
1251 }
1252 
1253 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1254 void ParallelFor (T n, L&& f) noexcept
1255 {
1256  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1257 }
1258 
1259 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1260 void ParallelFor (T n, L&& f) noexcept
1261 {
1262  ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1263 }
1264 
1265 template <typename L, int dim>
1266 void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1267 {
1268  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1269 }
1270 
1271 template <int MT, typename L, int dim>
1272 void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1273 {
1274  ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1275 }
1276 
1277 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1278 void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1279 {
1280  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1281 }
1282 
1283 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1284 void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1285 {
1286  ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1287 }
1288 
1289 template <typename L1, typename L2, int dim>
1290 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1291 {
1292  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1293 }
1294 
1295 template <int MT, typename L1, typename L2, int dim>
1296 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1297 {
1298  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1299 }
1300 
1301 template <typename L1, typename L2, typename L3, int dim>
1302 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1303  L1&& f1, L2&& f2, L3&& f3) noexcept
1304 {
1305  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1306 }
1307 
1308 template <int MT, typename L1, typename L2, typename L3, int dim>
1309 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1310  L1&& f1, L2&& f2, L3&& f3) noexcept
1311 {
1312  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1313 }
1314 
1315 template <typename T1, typename T2, typename L1, typename L2, int dim,
1316  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1317  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1318 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1319  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1320 {
1321  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1322 }
1323 
1324 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1325  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1326  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1327 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1328  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1329 {
1330  ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1331 }
1332 
1333 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1334  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1335  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1336  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1337 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1338  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1339  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1340 {
1341  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1342  box1,ncomp1,std::forward<L1>(f1),
1343  box2,ncomp2,std::forward<L2>(f2),
1344  box3,ncomp3,std::forward<L3>(f3));
1345 }
1346 
1347 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1348  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1349  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1350  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1351 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1352  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1353  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1354 {
1355  ParallelFor<MT>(Gpu::KernelInfo{},
1356  box1,ncomp1,std::forward<L1>(f1),
1357  box2,ncomp2,std::forward<L2>(f2),
1358  box3,ncomp3,std::forward<L3>(f3));
1359 }
1360 
1361 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1362 void For (T n, L&& f) noexcept
1363 {
1364  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1365 }
1366 
1367 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1368 void For (T n, L&& f) noexcept
1369 {
1370  ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1371 }
1372 
1373 template <typename L, int dim>
1374 void For (BoxND<dim> const& box, L&& f) noexcept
1375 {
1376  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1377 }
1378 
1379 template <int MT, typename L, int dim>
1380 void For (BoxND<dim> const& box, L&& f) noexcept
1381 {
1382  ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1383 }
1384 
1385 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1386 void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1387 {
1388  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1389 }
1390 
1391 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1392 void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1393 {
1394  ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1395 }
1396 
1397 template <typename L1, typename L2, int dim>
1398 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1399 {
1400  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1401 }
1402 
1403 template <int MT, typename L1, typename L2, int dim>
1404 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1405 {
1406  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1407 }
1408 
1409 template <typename L1, typename L2, typename L3, int dim>
1410 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1411  L1&& f1, L2&& f2, L3&& f3) noexcept
1412 {
1413  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1414 }
1415 
1416 template <int MT, typename L1, typename L2, typename L3, int dim>
1417 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1418  L1&& f1, L2&& f2, L3&& f3) noexcept
1419 {
1420  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1421 }
1422 
1423 template <typename T1, typename T2, typename L1, typename L2, int dim,
1424  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1425  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1426 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1427  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1428 {
1429  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1430 }
1431 
1432 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1433  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1434  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1435 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1436  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1437 {
1438  ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1439 }
1440 
1441 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1442  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1443  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1444  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1445 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1446  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1447  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1448 {
1449  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1450  box1,ncomp1,std::forward<L1>(f1),
1451  box2,ncomp2,std::forward<L2>(f2),
1452  box3,ncomp3,std::forward<L3>(f3));
1453 }
1454 
1455 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1456  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1457  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1458  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1459 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1460  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1461  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1462 {
1463  ParallelFor<MT>(Gpu::KernelInfo{},
1464  box1,ncomp1,std::forward<L1>(f1),
1465  box2,ncomp2,std::forward<L2>(f2),
1466  box3,ncomp3,std::forward<L3>(f3));
1467 }
1468 
1469 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1470 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1471 HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1472 {
1473  if (Gpu::inLaunchRegion()) {
1474  ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1475  } else {
1476 #ifdef AMREX_USE_SYCL
1477  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1478 #else
1479  AMREX_PRAGMA_SIMD
1480  for (T i = 0; i < n; ++i) { f(i); }
1481 #endif
1482  }
1483 }
1484 
1485 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1486 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1487 HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1488 {
1489  if (Gpu::inLaunchRegion()) {
1490  ParallelFor<MT>(info,n,std::forward<L>(f));
1491  } else {
1492 #ifdef AMREX_USE_SYCL
1493  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1494 #else
1495  AMREX_PRAGMA_SIMD
1496  for (T i = 0; i < n; ++i) { f(i); }
1497 #endif
1498  }
1499 }
1500 
1501 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1502 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1503 HostDeviceParallelFor (T n, L&& f) noexcept
1504 {
1505  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1506 }
1507 
1508 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1509 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1510 HostDeviceParallelFor (T n, L&& f) noexcept
1511 {
1512  HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1513 }
1514 
1515 template <typename L, int dim>
1516 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1517 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1518 {
1519  if (Gpu::inLaunchRegion()) {
1520  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1521  } else {
1522 #ifdef AMREX_USE_SYCL
1523  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1524 #else
1525  LoopConcurrentOnCpu(box,std::forward<L>(f));
1526 #endif
1527  }
1528 }
1529 
1530 template <int MT, typename L, int dim>
1531 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1532 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1533 {
1534  if (Gpu::inLaunchRegion()) {
1535  ParallelFor<MT>(info, box,std::forward<L>(f));
1536  } else {
1537 #ifdef AMREX_USE_SYCL
1538  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1539 #else
1540  LoopConcurrentOnCpu(box,std::forward<L>(f));
1541 #endif
1542  }
1543 }
1544 
1545 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1546 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1547 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1548 {
1549  if (Gpu::inLaunchRegion()) {
1550  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1551  } else {
1552 #ifdef AMREX_USE_SYCL
1553  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1554 #else
1555  LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1556 #endif
1557  }
1558 }
1559 
1560 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1561 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1562 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1563 {
1564  if (Gpu::inLaunchRegion()) {
1565  ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1566  } else {
1567 #ifdef AMREX_USE_SYCL
1568  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1569 #else
1570  LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1571 #endif
1572  }
1573 }
1574 
1575 template <typename L1, typename L2, int dim>
1576 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1577 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1578  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1579 {
1580  if (Gpu::inLaunchRegion()) {
1581  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1582  } else {
1583 #ifdef AMREX_USE_SYCL
1584  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1585 #else
1586  LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1587  LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1588 #endif
1589  }
1590 }
1591 
1592 template <int MT, typename L1, typename L2, int dim>
1593 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1594 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1595  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1596 {
1597  if (Gpu::inLaunchRegion()) {
1598  ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1599  } else {
1600 #ifdef AMREX_USE_SYCL
1601  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1602 #else
1603  LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1604  LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1605 #endif
1606  }
1607 }
1608 
1609 template <int MT, typename L1, typename L2, typename L3, int dim>
1610 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1611 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1612  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1613  L1&& f1, L2&& f2, L3&& f3) noexcept
1614 {
1615  if (Gpu::inLaunchRegion()) {
1616  ParallelFor<MT>(info,box1,box2,box3,
1617  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1618  } else {
1619 #ifdef AMREX_USE_SYCL
1620  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1621 #else
1622  LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1623  LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1624  LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1625 #endif
1626  }
1627 }
1628 
1629 template <typename T1, typename T2, typename L1, typename L2, int dim,
1630  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1631  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1632 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1633 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1634  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1635  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1636 {
1637  if (Gpu::inLaunchRegion()) {
1638  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1639  } else {
1640 #ifdef AMREX_USE_SYCL
1641  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1642 #else
1643  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1644  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1645 #endif
1646  }
1647 }
1648 
1649 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1650  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1651  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1652 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1653 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1654  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1655  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1656 {
1657  if (Gpu::inLaunchRegion()) {
1658  ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1659  } else {
1660 #ifdef AMREX_USE_SYCL
1661  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1662 #else
1663  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1664  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1665 #endif
1666  }
1667 }
1668 
1669 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1670  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1671  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1672  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1673 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1674 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1675  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1676  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1677  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1678 {
1679  if (Gpu::inLaunchRegion()) {
1680  ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1681  box1,ncomp1,std::forward<L1>(f1),
1682  box2,ncomp2,std::forward<L2>(f2),
1683  box3,ncomp3,std::forward<L3>(f3));
1684  } else {
1685 #ifdef AMREX_USE_SYCL
1686  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1687 #else
1688  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1689  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1690  LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1691 #endif
1692  }
1693 }
1694 
1695 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1696  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1697  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1698  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1699 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1700 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1701  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1702  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1703  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1704 {
1705  if (Gpu::inLaunchRegion()) {
1706  ParallelFor<MT>(info,
1707  box1,ncomp1,std::forward<L1>(f1),
1708  box2,ncomp2,std::forward<L2>(f2),
1709  box3,ncomp3,std::forward<L3>(f3));
1710  } else {
1711 #ifdef AMREX_USE_SYCL
1712  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1713 #else
1714  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1715  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1716  LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1717 #endif
1718  }
1719 }
1720 
1721 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1722 void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1723 {
1724  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1725 }
1726 
1727 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1728 void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1729 {
1730  HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
1731 }
1732 
1733 template <typename L, int dim>
1734 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1735 {
1736  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
1737 }
1738 
1739 template <int MT, typename L, int dim>
1740 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1741 {
1742  HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
1743 }
1744 
1745 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1746 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1747 {
1748  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1749 }
1750 
1751 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1752 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1753 {
1754  HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1755 }
1756 
1757 template <typename L1, typename L2, int dim>
1758 void HostDeviceFor (Gpu::KernelInfo const& info,
1759  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1760 {
1761  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1762 }
1763 
1764 template <int MT, typename L1, typename L2, int dim>
1765 void HostDeviceFor (Gpu::KernelInfo const& info,
1766  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1767 {
1768  HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1769 }
1770 
1771 template <typename L1, typename L2, typename L3, int dim>
1772 void HostDeviceFor (Gpu::KernelInfo const& info,
1773  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1774  L1&& f1, L2&& f2, L3&& f3) noexcept
1775 {
1776  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
1777  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1778 }
1779 
1780 template <int MT, typename L1, typename L2, typename L3, int dim>
1781 void HostDeviceFor (Gpu::KernelInfo const& info,
1782  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1783  L1&& f1, L2&& f2, L3&& f3) noexcept
1784 {
1785  HostDeviceParallelFor<MT>(info, box1,box2,box3,
1786  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1787 }
1788 
1789 template <typename T1, typename T2, typename L1, typename L2, int dim,
1790  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1791  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1792 void HostDeviceFor (Gpu::KernelInfo const& info,
1793  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1794  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1795 {
1796  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1797 }
1798 
1799 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1800  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1801  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1802 void HostDeviceFor (Gpu::KernelInfo const& info,
1803  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1804  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1805 {
1806  HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1807 }
1808 
1809 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1810  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1811  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1812  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1813 void HostDeviceFor (Gpu::KernelInfo const& info,
1814  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1815  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1816  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1817 {
1818  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
1819  box1,ncomp1,std::forward<L1>(f1),
1820  box2,ncomp2,std::forward<L2>(f2),
1821  box3,ncomp3,std::forward<L3>(f3));
1822 }
1823 
1824 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1825  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1826  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1827  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1828 void HostDeviceFor (Gpu::KernelInfo const& info,
1829  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1830  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1831  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1832 {
1833  HostDeviceParallelFor<MT>(info,
1834  box1,ncomp1,std::forward<L1>(f1),
1835  box2,ncomp2,std::forward<L2>(f2),
1836  box3,ncomp3,std::forward<L3>(f3));
1837 }
1838 
1839 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1840 void HostDeviceParallelFor (T n, L&& f) noexcept
1841 {
1842  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
1843 }
1844 
1845 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1846 void HostDeviceParallelFor (T n, L&& f) noexcept
1847 {
1848  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
1849 }
1850 
1851 template <typename L, int dim>
1852 void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1853 {
1854  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
1855 }
1856 
1857 template <int MT, typename L, int dim>
1858 void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1859 {
1860  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
1861 }
1862 
1863 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1864 void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1865 {
1866  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1867 }
1868 
1869 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1870 void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1871 {
1872  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1873 }
1874 
1875 template <typename L1, typename L2, int dim>
1876 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1877 {
1878  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1879 }
1880 
1881 template <int MT, typename L1, typename L2, int dim>
1882 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1883 {
1884  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1885 }
1886 
1887 template <typename L1, typename L2, typename L3, int dim>
1888 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1889  L1&& f1, L2&& f2, L3&& f3) noexcept
1890 {
1891  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
1892  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1893 }
1894 
1895 template <int MT, typename L1, typename L2, typename L3, int dim>
1896 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1897  L1&& f1, L2&& f2, L3&& f3) noexcept
1898 {
1899  HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
1900  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1901 }
1902 
1903 template <typename T1, typename T2, typename L1, typename L2, int dim,
1904  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1905  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1906 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1907  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1908 {
1909  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1910 }
1911 
1912 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1913  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1914  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1915 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1916  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1917 {
1918  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1919 }
1920 
1921 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1922  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1923  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1924  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1925 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1926  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1927  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1928 {
1929  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1930  box1,ncomp1,std::forward<L1>(f1),
1931  box2,ncomp2,std::forward<L2>(f2),
1932  box3,ncomp3,std::forward<L3>(f3));
1933 }
1934 
1935 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1936  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1937  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1938  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1939 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1940  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1941  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1942 {
1943  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
1944  box1,ncomp1,std::forward<L1>(f1),
1945  box2,ncomp2,std::forward<L2>(f2),
1946  box3,ncomp3,std::forward<L3>(f3));
1947 }
1948 
1949 }
1950 
1951 #endif
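For reference, a minimal usage sketch of the launch wrappers listed above (illustrative only, not part of the header); the Box and Array4 arguments are assumed to be supplied by the caller, for example from a FArrayBox inside an MFIter loop.

// Minimal usage sketch (not part of AMReX_GpuLaunchFunctsG.H). Assumes an
// amrex::Box bx and an amrex::Array4<amrex::Real> a obtained elsewhere.
#include <AMReX_Box.H>
#include <AMReX_Array4.H>
#include <AMReX_Gpu.H>

void scale_and_shift (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a, int ncomp)
{
    // Launch over bx x ncomp; on GPU builds this dispatches a device kernel,
    // otherwise it loops on the host.
    amrex::ParallelFor(bx, ncomp,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
        {
            a(i,j,k,n) *= amrex::Real(2.0);
        });

    // Host/device variant: runs as a GPU kernel inside a launch region,
    // otherwise falls back to the CPU loop shown in this file.
    amrex::HostDeviceParallelFor(bx,
        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k,0) += amrex::Real(1.0);
        });
}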
#define AMREX_ASSERT(EX)
Definition: AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition: AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:119
#define AMREX_GPU_ERROR_CHECK()
Definition: AMReX_GpuError.H:125
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition: AMReX_GpuLaunch.H:35
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18
A Rectangular Domain on an Integer Lattice.
Definition: AMReX_Box.H:43
static constexpr AMREX_EXPORT int warp_size
Definition: AMReX_GpuDevice.H:173
Definition: AMReX_GpuKernelInfo.H:8
Definition: AMReX_IntVect.H:48
AMREX_GPU_HOST_DEVICE range_detail::range_impl< T > Range(T const &b) noexcept
Definition: AMReX_GpuRange.H:125
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
bool inLaunchRegion() noexcept
Definition: AMReX_GpuControl.H:86
gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:218
static int f(amrex::Real t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:44
AMREX_FORCE_INLINE auto call_f_intvect_handler(F const &f, IntVectND< dim > iv) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv))
Definition: AMReX_GpuLaunchFunctsC.H:75
@ max
Definition: AMReX_ParallelReduce.H:17
AMREX_FORCE_INLINE auto call_f_intvect_ncomp_handler(F const &f, IntVectND< dim > iv, T n) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, n))
Definition: AMReX_GpuLaunchFunctsC.H:103
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect_ncomp_handler(F const &f, IntVectND< dim > iv, T ncomp, Gpu::Handler const &) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, 0))
Definition: AMReX_GpuLaunchFunctsG.H:127
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect_handler(F const &f, IntVectND< dim > iv, Gpu::Handler const &) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv))
Definition: AMReX_GpuLaunchFunctsG.H:85
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_scalar_handler(F const &f, N i, Gpu::Handler const &) noexcept -> decltype(f(0))
Definition: AMReX_GpuLaunchFunctsG.H:13
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect_ncomp(F const &f, IntVectND< dim > iv, T ncomp) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, 0))
Definition: AMReX_GpuLaunchFunctsG.H:103
AMREX_FORCE_INLINE auto call_f_scalar_handler(F const &f, N i) noexcept -> decltype(f(0))
Definition: AMReX_GpuLaunchFunctsC.H:13
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect(F const &f, IntVectND< dim > iv) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv))
Definition: AMReX_GpuLaunchFunctsG.H:65
AMREX_FORCE_INLINE auto call_f_intvect_engine(F const &f, IntVectND< dim > iv, RandomEngine engine) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, engine))
Definition: AMReX_GpuLaunchFunctsC.H:65
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect_engine(F const &f, IntVectND< dim > iv, RandomEngine engine) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, engine))
Definition: AMReX_GpuLaunchFunctsG.H:75
AMREX_FORCE_INLINE auto call_f_intvect_ncomp_engine(F const &f, IntVectND< dim > iv, T n, RandomEngine engine) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, n, engine))
Definition: AMReX_GpuLaunchFunctsC.H:93
AMREX_FORCE_INLINE auto call_f_intvect_inner(std::index_sequence< Ns... >, F const &f, IntVectND< 1 > iv, Args...args) noexcept -> decltype(f(0, 0, 0, args...))
Definition: AMReX_GpuLaunchFunctsC.H:31
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect_ncomp_engine(F const &f, IntVectND< dim > iv, T ncomp, RandomEngine engine) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, 0, engine))
Definition: AMReX_GpuLaunchFunctsG.H:115
Definition: AMReX_Amr.cpp:49
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200
void ParallelFor(BoxND< dim > const &box, T ncomp, L &&f) noexcept
Definition: AMReX_GpuLaunchFunctsG.H:1278
AMREX_ATTRIBUTE_FLATTEN_FOR void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition: AMReX_Loop.H:378
cudaStream_t gpuStream_t
Definition: AMReX_GpuControl.H:77
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE const T & min(const T &a, const T &b) noexcept
Definition: AMReX_Algorithm.H:21
constexpr AMREX_GPU_HOST_DEVICE GpuTupleElement< I, GpuTuple< Ts... > >::type & get(GpuTuple< Ts... > &tup) noexcept
Definition: AMReX_Tuple.H:179
void For(BoxND< dim > const &box, T ncomp, L &&f) noexcept
Definition: AMReX_GpuLaunchFunctsG.H:1386
void launch(T const &n, L &&f) noexcept
Definition: AMReX_GpuLaunchFunctsC.H:120
void launch(BoxND< dim > const &box, L const &f) noexcept
Definition: AMReX_GpuLaunchFunctsG.H:772
void HostDeviceFor(T n, L &&f) noexcept
Definition: AMReX_GpuLaunchFunctsC.H:869
AMREX_FORCE_INLINE randState_t * getRandState()
Definition: AMReX_RandomEngine.H:55
void single_task(gpuStream_t stream, L const &f) noexcept
Definition: AMReX_GpuLaunchFunctsG.H:710
bool isEmpty(T n) noexcept
Definition: AMReX_GpuRange.H:14
void single_task(L &&f) noexcept
Definition: AMReX_GpuLaunchFunctsC.H:1307
curandState_t randState_t
Definition: AMReX_RandomEngine.H:48
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition: AMReX.cpp:225
std::enable_if_t< MaybeDeviceRunnable< L >::value > ParallelForRNG(BoxND< dim > const &box, T ncomp, L const &f) noexcept
Definition: AMReX_GpuLaunchFunctsG.H:908
std::enable_if_t< MaybeHostDeviceRunnable< L1 >::value &&MaybeHostDeviceRunnable< L2 >::value &&MaybeHostDeviceRunnable< L3 >::value > HostDeviceParallelFor(Gpu::KernelInfo const &info, BoxND< dim > const &box1, T1 ncomp1, L1 &&f1, BoxND< dim > const &box2, T2 ncomp2, L2 &&f2, BoxND< dim > const &box3, T3 ncomp3, L3 &&f3) noexcept
Definition: AMReX_GpuLaunchFunctsG.H:1700
Definition: AMReX_FabArrayCommI.H:896
Definition: AMReX_Box.H:2027
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE IntVectND< dim > intVect(std::uint64_t icell) const
Definition: AMReX_Box.H:2044
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::uint64_t numPts() const
Definition: AMReX_Box.H:2068
Definition: AMReX_GpuLaunch.H:128
Definition: AMReX_GpuTypes.H:86
Definition: AMReX_RandomEngine.H:57