Block-Structured AMR Software Framework
AMReX_GpuLaunchFunctsG.H
1 #ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2 #define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3 #include <AMReX_Config.H>
4 
5 namespace amrex {
6 
7 namespace detail {
8 
9  // call_f_scalar_handler
10 
11  template <typename F, typename N>
13  auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
14  noexcept -> decltype(f(0))
15  {
16  f(i);
17  }
18 
19  template <typename F, typename N>
21  auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
22  noexcept -> decltype(f(0,Gpu::Handler{}))
23  {
24  f(i, handler);
25  }
26 
27  // call_f_intvect_inner
28 
29  template <typename F, std::size_t...Ns, class...Args>
31  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
32  noexcept -> decltype(f(0, 0, 0, args...))
33  {
34  f(iv[0], 0, 0, args...);
35  }
36 
37  template <typename F, std::size_t...Ns, class...Args>
39  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
40  noexcept -> decltype(f(0, 0, 0, args...))
41  {
42  f(iv[0], iv[1], 0, args...);
43  }
44 
45  template <typename F, int dim, std::size_t...Ns, class...Args>
47  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
48  noexcept -> decltype(f(iv, args...))
49  {
50  f(iv, args...);
51  }
52 
53  template <typename F, int dim, std::size_t...Ns, class...Args>
55  auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
56  noexcept -> decltype(f(iv[Ns]..., args...))
57  {
58  f(iv[Ns]..., args...);
59  }
60 
61  // call_f_intvect
62 
63  template <typename F, int dim>
65  auto call_f_intvect (F const& f, IntVectND<dim> iv)
66  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
67  {
68  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
69  }
70 
71  // call_f_intvect_engine
72 
73  template <typename F, int dim>
75  auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
76  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
77  {
78  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
79  }
80 
81  // call_f_intvect_handler
82 
83  template <typename F, int dim>
85  auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
86  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
87  {
88  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
89  }
90 
91  template <typename F, int dim>
93  auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
94  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
95  {
96  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
97  }
98 
99  // call_f_intvect_ncomp
100 
101  template <typename F, typename T, int dim>
103  auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
104  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
105  {
106  for (T n = 0; n < ncomp; ++n) {
107  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
108  }
109  }
110 
111  // call_f_intvect_ncomp_engine
112 
113  template <typename F, typename T, int dim>
115  auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
116  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
117  {
118  for (T n = 0; n < ncomp; ++n) {
119  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
120  }
121  }
122 
123  // call_f_intvect_ncomp_handler
124 
125  template <typename F, typename T, int dim>
127  auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
128  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
129  {
130  for (T n = 0; n < ncomp; ++n) {
131  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
132  }
133  }
134 
135  template <typename F, typename T, int dim>
137  auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
138  noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
139  {
140  for (T n = 0; n < ncomp; ++n) {
141  call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
142  }
143  }
144 
145 }
146 
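The detail::call_f_* helpers above use trailing-return-type SFINAE to accept user callables with several signatures: a flat index, an (i,j,k) triple (lower-dimensional boxes are zero-padded), an IntVectND, an optional trailing component index, and an optional RandomEngine or Gpu::Handler. A minimal caller-side sketch, assuming a 3D build; the function and variable names below are illustrative, not part of this header:

    #include <AMReX_GpuLaunch.H>   // amrex::ParallelFor
    #include <AMReX_Array4.H>
    #include <AMReX_Box.H>

    // Both lambdas are dispatched through the same ParallelFor front end; the
    // detail helpers expand each cell index into the form the lambda expects.
    void fill (amrex::Box const& bx, int ncomp, amrex::Array4<amrex::Real> const& a)
    {
        amrex::ParallelFor(bx,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept {
                a(i,j,k) = amrex::Real(1.0);      // cell-wise form
            });
        amrex::ParallelFor(bx, ncomp,
            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept {
                a(i,j,k,n) = amrex::Real(0.0);    // cell-and-component form
            });
    }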
147 #ifdef AMREX_USE_SYCL
148 
149 template <typename L>
150 void single_task (gpuStream_t stream, L const& f) noexcept
151 {
152  auto& q = *(stream.queue);
153  try {
154  q.submit([&] (sycl::handler& h) {
155  h.single_task([=] () { f(); });
156  });
157  } catch (sycl::exception const& ex) {
158  amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
159  }
160 }
161 
162 template<typename L>
163 void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
164  gpuStream_t stream, L const& f) noexcept
165 {
166  const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
167  const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
168  / sizeof(unsigned long long);
169  auto& q = *(stream.queue);
170  try {
171  q.submit([&] (sycl::handler& h) {
172  sycl::local_accessor<unsigned long long>
173  shared_data(sycl::range<1>(shared_mem_numull), h);
174  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
175  sycl::range<1>(nthreads_per_block)),
176  [=] (sycl::nd_item<1> item)
177  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
178  {
179  f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
180  });
181  });
182  } catch (sycl::exception const& ex) {
183  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
184  }
185 }
186 
187 template<typename L>
188 void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
189 {
190  const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
191  auto& q = *(stream.queue);
192  try {
193  q.submit([&] (sycl::handler& h) {
194  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
195  sycl::range<1>(nthreads_per_block)),
196  [=] (sycl::nd_item<1> item)
197  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
198  {
199  f(item);
200  });
201  });
202  } catch (sycl::exception const& ex) {
203  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
204  }
205 }
206 
207 template <int MT, typename L>
208 void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
209  L const& f) noexcept
210 {
211  const auto nthreads_total = MT * std::size_t(nblocks);
212  const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
213  / sizeof(unsigned long long);
214  auto& q = *(stream.queue);
215  try {
216  q.submit([&] (sycl::handler& h) {
217  sycl::local_accessor<unsigned long long>
218  shared_data(sycl::range<1>(shared_mem_numull), h);
219  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
220  sycl::range<1>(MT)),
221  [=] (sycl::nd_item<1> item)
222  [[sycl::reqd_work_group_size(MT)]]
223  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
224  {
225  f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
226  });
227  });
228  } catch (sycl::exception const& ex) {
229  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
230  }
231 }
232 
233 template <int MT, typename L>
234 void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
235 {
236  const auto nthreads_total = MT * std::size_t(nblocks);
237  auto& q = *(stream.queue);
238  try {
239  q.submit([&] (sycl::handler& h) {
240  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
241  sycl::range<1>(MT)),
242  [=] (sycl::nd_item<1> item)
243  [[sycl::reqd_work_group_size(MT)]]
244  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
245  {
246  f(item);
247  });
248  });
249  } catch (sycl::exception const& ex) {
250  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
251  }
252 }
253 
254 template<int MT, typename T, typename L>
255 void launch (T const& n, L const& f) noexcept
256 {
257  if (amrex::isEmpty(n)) { return; }
258  const auto ec = Gpu::makeExecutionConfig<MT>(n);
259  const auto nthreads_per_block = ec.numThreads.x;
260  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
261  auto& q = Gpu::Device::streamQueue();
262  try {
263  q.submit([&] (sycl::handler& h) {
264  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
265  sycl::range<1>(nthreads_per_block)),
266  [=] (sycl::nd_item<1> item)
267  [[sycl::reqd_work_group_size(MT)]]
268  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
269  {
270  for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
271  f(i);
272  }
273  });
274  });
275  } catch (sycl::exception const& ex) {
276  amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
277  }
278 }
279 
280 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
281 void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
282 {
283  if (amrex::isEmpty(n)) { return; }
284  const auto ec = Gpu::makeExecutionConfig<MT>(n);
285  const auto nthreads_per_block = ec.numThreads.x;
286  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
287  auto& q = Gpu::Device::streamQueue();
288  try {
289  if (info.hasReduction()) {
290  q.submit([&] (sycl::handler& h) {
291  sycl::local_accessor<unsigned long long>
292  shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
293  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
294  sycl::range<1>(nthreads_per_block)),
295  [=] (sycl::nd_item<1> item)
296  [[sycl::reqd_work_group_size(MT)]]
297  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
298  {
299  for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
300  i < std::size_t(n); i += stride) {
301  int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
302  item.get_local_range(0));
303  detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
304  n_active_threads});
305  }
306  });
307  });
308  } else {
309  q.submit([&] (sycl::handler& h) {
310  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
311  sycl::range<1>(nthreads_per_block)),
312  [=] (sycl::nd_item<1> item)
313  [[sycl::reqd_work_group_size(MT)]]
314  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
315  {
316  for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
317  i < std::size_t(n); i += stride) {
318  detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
319  }
320  });
321  });
322  }
323  } catch (sycl::exception const& ex) {
324  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
325  }
326 }
327 
328 template <int MT, typename L, int dim>
329 void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
330 {
331  if (amrex::isEmpty(box)) { return; }
332  const BoxIndexerND<dim> indexer(box);
333  const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
334  const auto nthreads_per_block = ec.numThreads.x;
335  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
336  auto& q = Gpu::Device::streamQueue();
337  try {
338  if (info.hasReduction()) {
339  q.submit([&] (sycl::handler& h) {
340  sycl::local_accessor<unsigned long long>
341  shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
342  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
343  sycl::range<1>(nthreads_per_block)),
344  [=] (sycl::nd_item<1> item)
345  [[sycl::reqd_work_group_size(MT)]]
346  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
347  {
348  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
349  icell < indexer.numPts(); icell += stride) {
350  auto iv = indexer.intVect(icell);
351  int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
352  std::uint64_t(item.get_local_range(0)));
353  detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
354  n_active_threads});
355  }
356  });
357  });
358  } else {
359  q.submit([&] (sycl::handler& h) {
360  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
361  sycl::range<1>(nthreads_per_block)),
362  [=] (sycl::nd_item<1> item)
363  [[sycl::reqd_work_group_size(MT)]]
364  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
365  {
366  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
367  icell < indexer.numPts(); icell += stride) {
368  auto iv = indexer.intVect(icell);
369  detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
370  }
371  });
372  });
373  }
374  } catch (sycl::exception const& ex) {
375  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
376  }
377 }
378 
379 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
380 void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
381 {
382  if (amrex::isEmpty(box)) { return; }
383  const BoxIndexerND<dim> indexer(box);
384  const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
385  const auto nthreads_per_block = ec.numThreads.x;
386  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
387  auto& q = Gpu::Device::streamQueue();
388  try {
389  if (info.hasReduction()) {
390  q.submit([&] (sycl::handler& h) {
391  sycl::local_accessor<unsigned long long>
392  shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
393  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
394  sycl::range<1>(nthreads_per_block)),
395  [=] (sycl::nd_item<1> item)
396  [[sycl::reqd_work_group_size(MT)]]
397  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
398  {
399  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
400  icell < indexer.numPts(); icell += stride) {
401  auto iv = indexer.intVect(icell);
402  int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
403  std::uint64_t(item.get_local_range(0)));
404  detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
405  Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
406  n_active_threads});
407  }
408  });
409  });
410  } else {
411  q.submit([&] (sycl::handler& h) {
412  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
413  sycl::range<1>(nthreads_per_block)),
414  [=] (sycl::nd_item<1> item)
415  [[sycl::reqd_work_group_size(MT)]]
416  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
417  {
418  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
419  icell < indexer.numPts(); icell += stride) {
420  auto iv = indexer.intVect(icell);
421  detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
422  }
423  });
424  });
425  }
426  } catch (sycl::exception const& ex) {
427  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
428  }
429 }
430 
431 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
432 void ParallelForRNG (T n, L const& f) noexcept
433 {
434  if (amrex::isEmpty(n)) { return; }
435  const auto ec = Gpu::ExecutionConfig(n);
436  const auto nthreads_per_block = ec.numThreads.x;
437  const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
438  auto& q = Gpu::Device::streamQueue();
439  auto& engdescr = *(getRandEngineDescriptor());
440  try {
441  q.submit([&] (sycl::handler& h) {
442  auto engine_acc = engdescr.get_access(h);
443  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
444  sycl::range<1>(nthreads_per_block)),
445  [=] (sycl::nd_item<1> item)
446  [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
447  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
448  {
449  auto const tid = item.get_global_id(0);
450  auto engine = engine_acc.load(tid);
451  RandomEngine rand_eng{&engine};
452  for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
453  f(T(i),rand_eng);
454  }
455  engine_acc.store(engine, tid);
456  });
457  });
458  q.wait_and_throw(); // because next launch might be on a different queue
459  } catch (sycl::exception const& ex) {
460  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
461  }
462 }
463 
464 template <typename L, int dim>
465 void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
466 {
467  if (amrex::isEmpty(box)) { return; }
468  const BoxIndexerND<dim> indexer(box);
469  const auto ec = Gpu::ExecutionConfig(box.numPts());
470  const auto nthreads_per_block = ec.numThreads.x;
471  const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
472  auto& q = Gpu::Device::streamQueue();
473  auto& engdescr = *(getRandEngineDescriptor());
474  try {
475  q.submit([&] (sycl::handler& h) {
476  auto engine_acc = engdescr.get_access(h);
477  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
478  sycl::range<1>(nthreads_per_block)),
479  [=] (sycl::nd_item<1> item)
480  [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
481  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
482  {
483  auto const tid = item.get_global_id(0);
484  auto engine = engine_acc.load(tid);
485  RandomEngine rand_eng{&engine};
486  for (std::uint64_t icell = tid, stride = item.get_global_range(0);
487  icell < indexer.numPts(); icell += stride) {
488  auto iv = indexer.intVect(icell);
489  detail::call_f_intvect_engine(f,iv,rand_eng);
490  }
491  engine_acc.store(engine, tid);
492  });
493  });
494  q.wait_and_throw(); // because next launch might be on a different queue
495  } catch (sycl::exception const& ex) {
496  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
497  }
498 }
499 
500 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
501 void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
502 {
503  if (amrex::isEmpty(box)) { return; }
504  const BoxIndexerND<dim> indexer(box);
505  const auto ec = Gpu::ExecutionConfig(box.numPts());
506  const auto nthreads_per_block = ec.numThreads.x;
507  const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
508  auto& q = Gpu::Device::streamQueue();
509  auto& engdescr = *(getRandEngineDescriptor());
510  try {
511  q.submit([&] (sycl::handler& h) {
512  auto engine_acc = engdescr.get_access(h);
513  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
514  sycl::range<1>(nthreads_per_block)),
515  [=] (sycl::nd_item<1> item)
516  [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
517  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
518  {
519  auto const tid = item.get_global_id(0);
520  auto engine = engine_acc.load(tid);
521  RandomEngine rand_eng{&engine};
522  for (std::uint64_t icell = tid, stride = item.get_global_range(0);
523  icell < indexer.numPts(); icell += stride) {
524  auto iv = indexer.intVect(icell);
525  detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
526  }
527  engine_acc.store(engine, tid);
528  });
529  });
530  q.wait_and_throw(); // because next launch might be on a different queue
531  } catch (sycl::exception const& ex) {
532  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
533  }
534 }
535 
536 template <int MT, typename L1, typename L2, int dim>
537 void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
538 {
539  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
540  const BoxIndexerND<dim> indexer1(box1);
541  const BoxIndexerND<dim> indexer2(box2);
542  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
543  const auto nthreads_per_block = ec.numThreads.x;
544  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
545  auto& q = Gpu::Device::streamQueue();
546  try {
547  q.submit([&] (sycl::handler& h) {
548  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549  sycl::range<1>(nthreads_per_block)),
550  [=] (sycl::nd_item<1> item)
551  [[sycl::reqd_work_group_size(MT)]]
552  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
553  {
554  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
555  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
556  icell < ncells; icell += stride) {
557  if (icell < indexer1.numPts()) {
558  auto iv = indexer1.intVect(icell);
559  detail::call_f_intvect(f1,iv);
560  }
561  if (icell < indexer2.numPts()) {
562  auto iv = indexer2.intVect(icell);
563  detail::call_f_intvect(f2,iv);
564  }
565  }
566  });
567  });
568  } catch (sycl::exception const& ex) {
569  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
570  }
571 }
572 
573 template <int MT, typename L1, typename L2, typename L3, int dim>
574 void ParallelFor (Gpu::KernelInfo const& /*info*/,
575  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
576  L1&& f1, L2&& f2, L3&& f3) noexcept
577 {
578  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
579  const BoxIndexerND<dim> indexer1(box1);
580  const BoxIndexerND<dim> indexer2(box2);
581  const BoxIndexerND<dim> indexer3(box3);
582  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
583  const auto nthreads_per_block = ec.numThreads.x;
584  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
585  auto& q = Gpu::Device::streamQueue();
586  try {
587  q.submit([&] (sycl::handler& h) {
588  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
589  sycl::range<1>(nthreads_per_block)),
590  [=] (sycl::nd_item<1> item)
591  [[sycl::reqd_work_group_size(MT)]]
592  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
593  {
594  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
595  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
596  icell < ncells; icell += stride) {
597  if (icell < indexer1.numPts()) {
598  auto iv = indexer1.intVect(icell);
599  detail::call_f_intvect(f1,iv);
600  }
601  if (icell < indexer2.numPts()) {
602  auto iv = indexer2.intVect(icell);
603  detail::call_f_intvect(f2,iv);
604  }
605  if (icell < indexer3.numPts()) {
606  auto iv = indexer3.intVect(icell);
607  detail::call_f_intvect(f3,iv);
608  }
609  }
610  });
611  });
612  } catch (sycl::exception const& ex) {
613  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
614  }
615 }
616 
617 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
618  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
619  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
620 void ParallelFor (Gpu::KernelInfo const& /*info*/,
621  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
622  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
623 {
624  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
625  const BoxIndexerND<dim> indexer1(box1);
626  const BoxIndexerND<dim> indexer2(box2);
627  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
628  const auto nthreads_per_block = ec.numThreads.x;
629  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
630  auto& q = Gpu::Device::streamQueue();
631  try {
632  q.submit([&] (sycl::handler& h) {
633  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
634  sycl::range<1>(nthreads_per_block)),
635  [=] (sycl::nd_item<1> item)
636  [[sycl::reqd_work_group_size(MT)]]
637  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
638  {
639  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
640  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
641  icell < ncells; icell += stride) {
642  if (icell < indexer1.numPts()) {
643  auto iv = indexer1.intVect(icell);
644  detail::call_f_intvect_ncomp(f1,iv,ncomp1);
645  }
646  if (icell < indexer2.numPts()) {
647  auto iv = indexer2.intVect(icell);
648  detail::call_f_intvect_ncomp(f2,iv,ncomp2);
649  }
650  }
651  });
652  });
653  } catch (sycl::exception const& ex) {
654  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
655  }
656 }
657 
658 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
659  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
660  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
661  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
662 void ParallelFor (Gpu::KernelInfo const& /*info*/,
663  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
664  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
665  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
666 {
667  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
668  const BoxIndexerND<dim> indexer1(box1);
669  const BoxIndexerND<dim> indexer2(box2);
670  const BoxIndexerND<dim> indexer3(box3);
671  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
672  const auto nthreads_per_block = ec.numThreads.x;
673  const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
674  auto& q = Gpu::Device::streamQueue();
675  try {
676  q.submit([&] (sycl::handler& h) {
677  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
678  sycl::range<1>(nthreads_per_block)),
679  [=] (sycl::nd_item<1> item)
680  [[sycl::reqd_work_group_size(MT)]]
681  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
682  {
683  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
684  for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
685  icell < ncells; icell += stride) {
686  if (icell < indexer1.numPts()) {
687  auto iv = indexer1.intVect(icell);
688  detail::call_f_intvect_ncomp(f1,iv,ncomp1);
689  }
690  if (icell < indexer2.numPts()) {
691  auto iv = indexer2.intVect(icell);
692  detail::call_f_intvect_ncomp(f2,iv,ncomp2);
693  }
694  if (icell < indexer3.numPts()) {
695  auto iv = indexer3.intVect(icell);
696  detail::call_f_intvect_ncomp(f3,iv,ncomp3);
697  }
698  }
699  });
700  });
701  } catch (sycl::exception const& ex) {
702  amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
703  }
704 }
705 
706 #else
707 // CUDA or HIP
708 
709 template <typename L>
710 void single_task (gpuStream_t stream, L const& f) noexcept
711 {
712  AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream,
713  [=] AMREX_GPU_DEVICE () noexcept {f();});
715 }
716 
717 template <int MT, typename L>
718 void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
719  L const& f) noexcept
720 {
721  AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
722  [=] AMREX_GPU_DEVICE () noexcept { f(); });
724 }
725 
726 template <int MT, typename L>
727 void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
728 {
729  AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream,
730  [=] AMREX_GPU_DEVICE () noexcept { f(); });
732 }
733 
734 template<typename L>
735 void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
736  gpuStream_t stream, L const& f) noexcept
737 {
738  AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
739  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
740  stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
742 }
743 
744 template<typename L>
745 void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
746 {
747  launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
748 }
749 
750 template<int MT, typename T, typename L>
751 void launch (T const& n, L const& f) noexcept
752 {
753  if (amrex::isEmpty(n)) { return; }
754  const auto ec = Gpu::makeExecutionConfig<MT>(n);
755  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
756  [=] AMREX_GPU_DEVICE () noexcept {
757  for (auto const i : Gpu::Range(n)) {
758  f(i);
759  }
760  });
762 }
763 
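Unlike ParallelFor, launch<MT>(n, f) above simply lets each thread stride over Gpu::Range(n), so f receives the raw index and no Gpu::Handler. A hedged usage sketch; the pointer p is an illustrative device-accessible buffer, not something defined in this file:

    // Doubles every entry of a device buffer using the default block size
    // (the AMREX_GPU_MAX_THREADS wrapper defined later in this file).
    void scale (int n, amrex::Real* p)
    {
        amrex::launch(n, [=] AMREX_GPU_DEVICE (int i) noexcept {
            p[i] *= amrex::Real(2.0);
        });
    }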
764 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
765 std::enable_if_t<MaybeDeviceRunnable<L>::value>
766 ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
767 {
768  if (amrex::isEmpty(n)) { return; }
769  const auto ec = Gpu::makeExecutionConfig<MT>(n);
770  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
771  [=] AMREX_GPU_DEVICE () noexcept {
772  for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
773  i < Long(n); i += stride) {
774  detail::call_f_scalar_handler(f, T(i),
775  Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
776  (std::uint64_t)blockDim.x)));
777  }
778  });
780 }
781 
782 template <int MT, typename L, int dim>
783 std::enable_if_t<MaybeDeviceRunnable<L>::value>
784 ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
785 {
786  if (amrex::isEmpty(box)) { return; }
787  const BoxIndexerND<dim> indexer(box);
788  const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
789  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
790  [=] AMREX_GPU_DEVICE () noexcept {
791  for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
792  icell < indexer.numPts(); icell += stride)
793  {
794  auto iv = indexer.intVect(icell);
795  detail::call_f_intvect_handler(f, iv,
796  Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
797  (std::uint64_t)blockDim.x)));
798  }
799  });
801 }
802 
803 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
804 std::enable_if_t<MaybeDeviceRunnable<L>::value>
805 ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
806 {
807  if (amrex::isEmpty(box)) { return; }
808  const BoxIndexerND<dim> indexer(box);
809  const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
810  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
811  [=] AMREX_GPU_DEVICE () noexcept {
812  for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
813  icell < indexer.numPts(); icell += stride) {
814  auto iv = indexer.intVect(icell);
815  detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
816  Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
817  (std::uint64_t)blockDim.x)));
818  }
819  });
821 }
822 
823 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
824 std::enable_if_t<MaybeDeviceRunnable<L>::value>
825 ParallelForRNG (T n, L const& f) noexcept
826 {
827  if (amrex::isEmpty(n)) { return; }
828  randState_t* rand_state = getRandState();
829  const auto ec = Gpu::ExecutionConfig(n);
830  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
831  amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
832  ec.numThreads, 0, Gpu::gpuStream(),
833  [=] AMREX_GPU_DEVICE () noexcept {
834  Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x;
835  RandomEngine engine{&(rand_state[tid])};
836  for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) {
837  f(T(i),engine);
838  }
839  });
840  Gpu::streamSynchronize(); // To avoid multiple streams using RNG
842 }
843 
844 template <typename L, int dim>
845 std::enable_if_t<MaybeDeviceRunnable<L>::value>
846 ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
847 {
848  if (amrex::isEmpty(box)) { return; }
849  randState_t* rand_state = getRandState();
850  const BoxIndexerND<dim> indexer(box);
851  const auto ec = Gpu::ExecutionConfig(box.numPts());
852  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
853  amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
854  ec.numThreads, 0, Gpu::gpuStream(),
855  [=] AMREX_GPU_DEVICE () noexcept {
856  auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
857  RandomEngine engine{&(rand_state[tid])};
858  for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
859  auto iv = indexer.intVect(icell);
860  detail::call_f_intvect_engine(f, iv, engine);
861  }
862  });
863  Gpu::streamSynchronize(); // To avoid multiple streams using RNG
865 }
866 
867 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
868 std::enable_if_t<MaybeDeviceRunnable<L>::value>
869 ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
870 {
871  if (amrex::isEmpty(box)) { return; }
872  randState_t* rand_state = getRandState();
873  const BoxIndexerND<dim> indexer(box);
874  const auto ec = Gpu::ExecutionConfig(box.numPts());
875  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
876  amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
877  ec.numThreads, 0, Gpu::gpuStream(),
878  [=] AMREX_GPU_DEVICE () noexcept {
879  auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
880  RandomEngine engine{&(rand_state[tid])};
881  for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
882  auto iv = indexer.intVect(icell);
883  detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
884  }
885  });
886  Gpu::streamSynchronize(); // To avoid multiple streams using RNG
888 }
889 
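Each ParallelForRNG variant above checks a per-thread random state out of the global pool, hands it to the callable as a RandomEngine, and synchronizes the stream afterwards so another stream cannot race on the same states. A minimal sketch, assuming the buffer data is device-accessible and managed elsewhere:

    #include <AMReX_Random.H>   // amrex::Random, amrex::RandomEngine

    // Fills data[0..n) with uniform random numbers.
    void randomize (amrex::Long n, amrex::Real* data)
    {
        amrex::ParallelForRNG(n,
            [=] AMREX_GPU_DEVICE (amrex::Long i, amrex::RandomEngine const& engine) noexcept {
                data[i] = amrex::Random(engine);
            });
    }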
890 template <int MT, typename L1, typename L2, int dim>
891 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
892 ParallelFor (Gpu::KernelInfo const&,
893  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
894 {
895  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
896  const BoxIndexerND<dim> indexer1(box1);
897  const BoxIndexerND<dim> indexer2(box2);
898  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
899  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
900  [=] AMREX_GPU_DEVICE () noexcept {
901  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
902  for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
903  icell < ncells; icell += stride) {
904  if (icell < indexer1.numPts()) {
905  auto iv = indexer1.intVect(icell);
906  detail::call_f_intvect(f1, iv);
907  }
908  if (icell < indexer2.numPts()) {
909  auto iv = indexer2.intVect(icell);
910  detail::call_f_intvect(f2, iv);
911  }
912  }
913  });
915 }
916 
917 template <int MT, typename L1, typename L2, typename L3, int dim>
918 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
919 ParallelFor (Gpu::KernelInfo const&,
920  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
921  L1&& f1, L2&& f2, L3&& f3) noexcept
922 {
923  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
924  const BoxIndexerND<dim> indexer1(box1);
925  const BoxIndexerND<dim> indexer2(box2);
926  const BoxIndexerND<dim> indexer3(box3);
927  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
928  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
929  [=] AMREX_GPU_DEVICE () noexcept {
930  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
931  for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
932  icell < ncells; icell += stride) {
933  if (icell < indexer1.numPts()) {
934  auto iv = indexer1.intVect(icell);
935  detail::call_f_intvect(f1, iv);
936  }
937  if (icell < indexer2.numPts()) {
938  auto iv = indexer2.intVect(icell);
939  detail::call_f_intvect(f2, iv);
940  }
941  if (icell < indexer3.numPts()) {
942  auto iv = indexer3.intVect(icell);
943  detail::call_f_intvect(f3, iv);
944  }
945  }
946  });
948 }
949 
950 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
951  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
952  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
953 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
954 ParallelFor (Gpu::KernelInfo const&,
955  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
956  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
957 {
958  if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
959  const BoxIndexerND<dim> indexer1(box1);
960  const BoxIndexerND<dim> indexer2(box2);
961  const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
962  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
963  [=] AMREX_GPU_DEVICE () noexcept {
964  auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
965  for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
966  icell < ncells; icell += stride) {
967  if (icell < indexer1.numPts()) {
968  auto iv = indexer1.intVect(icell);
969  detail::call_f_intvect_ncomp(f1, iv, ncomp1);
970  }
971  if (icell < indexer2.numPts()) {
972  auto iv = indexer2.intVect(icell);
973  detail::call_f_intvect_ncomp(f2, iv, ncomp2);
974  }
975  }
976  });
978 }
979 
980 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
981  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
982  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
983  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
984 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
985 ParallelFor (Gpu::KernelInfo const&,
986  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
987  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
988  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
989 {
990  if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
991  const BoxIndexerND<dim> indexer1(box1);
992  const BoxIndexerND<dim> indexer2(box2);
993  const BoxIndexerND<dim> indexer3(box3);
994  const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
995  AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
996  [=] AMREX_GPU_DEVICE () noexcept {
997  auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
998  for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
999  icell < ncells; icell += stride) {
1000  if (icell < indexer1.numPts()) {
1001  auto iv = indexer1.intVect(icell);
1002  detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1003  }
1004  if (icell < indexer2.numPts()) {
1005  auto iv = indexer2.intVect(icell);
1006  detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1007  }
1008  if (icell < indexer3.numPts()) {
1009  auto iv = indexer3.intVect(icell);
1010  detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1011  }
1012  }
1013  });
1015 }
1016 
1017 #endif
1018 
1019 template <typename L>
1020 void single_task (L&& f) noexcept
1021 {
1022  single_task(Gpu::gpuStream(), std::forward<L>(f));
1023 }
1024 
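single_task(f) runs f once on the device on the current stream; it is intended for small device-side bookkeeping rather than bulk work. A hedged sketch; flag is an illustrative pointer assumed to live in managed or pinned memory so the host can read it back:

    // Sets a single device-side flag, then waits so the host can read it.
    void set_flag (int* flag)
    {
        amrex::single_task([=] AMREX_GPU_DEVICE () noexcept {
            *flag = 1;
        });
        amrex::Gpu::streamSynchronize();
    }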
1025 template<typename T, typename L>
1026 void launch (T const& n, L&& f) noexcept
1027 {
1028  launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1029 }
1030 
1031 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1032 std::enable_if_t<MaybeDeviceRunnable<L>::value>
1033 ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1034 {
1035  ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1036 }
1037 
1038 template <typename L, int dim>
1039 std::enable_if_t<MaybeDeviceRunnable<L>::value>
1040 ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1041 {
1042  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1043 }
1044 
1045 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1046 std::enable_if_t<MaybeDeviceRunnable<L>::value>
1047 ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1048 {
1049  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1050 }
1051 
1052 template <typename L1, typename L2, int dim>
1053 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1054 ParallelFor (Gpu::KernelInfo const& info,
1055  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1056 {
1057  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1058  std::forward<L2>(f2));
1059 }
1060 
1061 template <typename L1, typename L2, typename L3, int dim>
1062 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1063 ParallelFor (Gpu::KernelInfo const& info,
1064  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1065  L1&& f1, L2&& f2, L3&& f3) noexcept
1066 {
1067  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1068  std::forward<L2>(f2), std::forward<L3>(f3));
1069 }
1070 
1071 template <typename T1, typename T2, typename L1, typename L2, int dim,
1072  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1073  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1074 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1075 ParallelFor (Gpu::KernelInfo const& info,
1076  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1077  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1078 {
1079  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1080  box2, ncomp2, std::forward<L2>(f2));
1081 }
1082 
1083 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1084  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1085  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1086  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1087 std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1088 ParallelFor (Gpu::KernelInfo const& info,
1089  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1090  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1091  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1092 {
1093  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1094  box2, ncomp2, std::forward<L2>(f2),
1095  box3, ncomp3, std::forward<L3>(f3));
1096 }
1097 
1098 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1099 void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1100 {
1101  ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1102 }
1103 
1104 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1105 void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1106 {
1107  ParallelFor<MT>(info, n,std::forward<L>(f));
1108 }
1109 
1110 template <typename L, int dim>
1111 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1112 {
1113  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1114 }
1115 
1116 template <int MT, typename L, int dim>
1117 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1118 {
1119  ParallelFor<MT>(info, box,std::forward<L>(f));
1120 }
1121 
1122 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1123 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1124 {
1125  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1126 }
1127 
1128 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1129 void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1130 {
1131  ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1132 }
1133 
1134 template <typename L1, typename L2, int dim>
1135 void For (Gpu::KernelInfo const& info,
1136  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1137 {
1138  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1139 }
1140 
1141 template <int MT, typename L1, typename L2, int dim>
1142 void For (Gpu::KernelInfo const& info,
1143  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1144 {
1145  ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1146 }
1147 
1148 template <typename L1, typename L2, typename L3, int dim>
1149 void For (Gpu::KernelInfo const& info,
1150  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1151  L1&& f1, L2&& f2, L3&& f3) noexcept
1152 {
1153  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1154 }
1155 
1156 template <int MT, typename L1, typename L2, typename L3, int dim>
1157 void For (Gpu::KernelInfo const& info,
1158  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1159  L1&& f1, L2&& f2, L3&& f3) noexcept
1160 {
1161  ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1162 }
1163 
1164 template <typename T1, typename T2, typename L1, typename L2, int dim,
1165  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1166  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1167 void For (Gpu::KernelInfo const& info,
1168  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1169  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1170 {
1171  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1172 }
1173 
1174 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1175  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1176  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1177 void For (Gpu::KernelInfo const& info,
1178  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1179  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1180 {
1181  ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1182 }
1183 
1184 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1185  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1186  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1187  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1188 void For (Gpu::KernelInfo const& info,
1189  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1190  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1191  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1192 {
1193  ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1194  box1,ncomp1,std::forward<L1>(f1),
1195  box2,ncomp2,std::forward<L2>(f2),
1196  box3,ncomp3,std::forward<L3>(f3));
1197 }
1198 
1199 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1200  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1201  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1202  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1203 void For (Gpu::KernelInfo const& info,
1204  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1205  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1206  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1207 {
1208  ParallelFor<MT>(info,
1209  box1,ncomp1,std::forward<L1>(f1),
1210  box2,ncomp2,std::forward<L2>(f2),
1211  box3,ncomp3,std::forward<L3>(f3));
1212 }
1213 
1214 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1215 void ParallelFor (T n, L&& f) noexcept
1216 {
1217  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1218 }
1219 
1220 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1221 void ParallelFor (T n, L&& f) noexcept
1222 {
1223  ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1224 }
1225 
1226 template <typename L, int dim>
1227 void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1228 {
1229  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1230 }
1231 
1232 template <int MT, typename L, int dim>
1233 void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1234 {
1235  ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1236 }
1237 
1238 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1239 void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1240 {
1241  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1242 }
1243 
1244 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1245 void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1246 {
1247  ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1248 }
1249 
1250 template <typename L1, typename L2, int dim>
1251 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1252 {
1253  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1254 }
1255 
1256 template <int MT, typename L1, typename L2, int dim>
1257 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1258 {
1259  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1260 }
1261 
1262 template <typename L1, typename L2, typename L3, int dim>
1263 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1264  L1&& f1, L2&& f2, L3&& f3) noexcept
1265 {
1266  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1267 }
1268 
1269 template <int MT, typename L1, typename L2, typename L3, int dim>
1270 void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1271  L1&& f1, L2&& f2, L3&& f3) noexcept
1272 {
1273  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1274 }
1275 
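The two- and three-box overloads fuse independent loops into one kernel launch, which helps when several small boxes (for example, the face boxes of one tile) would otherwise each pay a separate launch latency. A sketch with illustrative array and box names:

    // Zeroes an x-face and a y-face array in a single fused launch.
    void zero_faces (amrex::Box const& xbx, amrex::Box const& ybx,
                     amrex::Array4<amrex::Real> const& fx,
                     amrex::Array4<amrex::Real> const& fy)
    {
        amrex::ParallelFor(xbx, ybx,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { fx(i,j,k) = amrex::Real(0.0); },
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { fy(i,j,k) = amrex::Real(0.0); });
    }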
1276 template <typename T1, typename T2, typename L1, typename L2, int dim,
1277  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1278  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1279 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1280  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1281 {
1282  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1283 }
1284 
1285 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1286  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1287  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1288 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1289  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1290 {
1291  ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1292 }
1293 
1294 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1295  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1296  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1297  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1298 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1299  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1300  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1301 {
1302  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1303  box1,ncomp1,std::forward<L1>(f1),
1304  box2,ncomp2,std::forward<L2>(f2),
1305  box3,ncomp3,std::forward<L3>(f3));
1306 }
1307 
1308 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1309  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1310  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1311  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1312 void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1313  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1314  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1315 {
1316  ParallelFor<MT>(Gpu::KernelInfo{},
1317  box1,ncomp1,std::forward<L1>(f1),
1318  box2,ncomp2,std::forward<L2>(f2),
1319  box3,ncomp3,std::forward<L3>(f3));
1320 }
1321 
1322 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1323 void For (T n, L&& f) noexcept
1324 {
1325  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1326 }
1327 
1328 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1329 void For (T n, L&& f) noexcept
1330 {
1331  ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1332 }
1333 
1334 template <typename L, int dim>
1335 void For (BoxND<dim> const& box, L&& f) noexcept
1336 {
1337  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1338 }
1339 
1340 template <int MT, typename L, int dim>
1341 void For (BoxND<dim> const& box, L&& f) noexcept
1342 {
1343  ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1344 }
1345 
1346 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1347 void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1348 {
1349  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1350 }
1351 
1352 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1353 void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1354 {
1355  ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1356 }
1357 
1358 template <typename L1, typename L2, int dim>
1359 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1360 {
1361  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1362 }
1363 
1364 template <int MT, typename L1, typename L2, int dim>
1365 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1366 {
1367  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1368 }
1369 
1370 template <typename L1, typename L2, typename L3, int dim>
1371 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1372  L1&& f1, L2&& f2, L3&& f3) noexcept
1373 {
1374  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1375 }
1376 
1377 template <int MT, typename L1, typename L2, typename L3, int dim>
1378 void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1379  L1&& f1, L2&& f2, L3&& f3) noexcept
1380 {
1381  ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1382 }
1383 
1384 template <typename T1, typename T2, typename L1, typename L2, int dim,
1385  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1386  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1387 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1388  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1389 {
1390  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1391 }
1392 
1393 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1394  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1395  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1396 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1397  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1398 {
1399  ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1400 }
1401 
1402 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1403  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1404  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1405  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1406 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1407  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1408  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1409 {
1410  ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1411  box1,ncomp1,std::forward<L1>(f1),
1412  box2,ncomp2,std::forward<L2>(f2),
1413  box3,ncomp3,std::forward<L3>(f3));
1414 }
1415 
1416 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1417  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1418  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1419  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1420 void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1421  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1422  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1423 {
1424  ParallelFor<MT>(Gpu::KernelInfo{},
1425  box1,ncomp1,std::forward<L1>(f1),
1426  box2,ncomp2,std::forward<L2>(f2),
1427  box3,ncomp3,std::forward<L3>(f3));
1428 }
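The BoxND overloads of For above map the functor over every cell of the box (and, in the ncomp forms, over every component), again by forwarding to ParallelFor with a default Gpu::KernelInfo. A hedged sketch of the three-index form, assuming AMREX_SPACEDIM == 3; the helper function and its Array4 argument are illustrative, not part of this header:

#include <AMReX_Gpu.H>
#include <AMReX_Box.H>
#include <AMReX_Array4.H>

// Illustrative helper: set every cell of 'a' inside 'bx' to one,
// launching on the GPU when a device is active.
void fill_box (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
{
    amrex::For(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) = amrex::Real(1.0);
    });
}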
1429 
1430 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1431 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1432 HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1433 {
1434  if (Gpu::inLaunchRegion()) {
1435  ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1436  } else {
1437 #ifdef AMREX_USE_SYCL
1438  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1439 #else
1440  AMREX_PRAGMA_SIMD
1441  for (T i = 0; i < n; ++i) { f(i); }
1442 #endif
1443  }
1444 }
1445 
1446 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1447 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1448 HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1449 {
1450  if (Gpu::inLaunchRegion()) {
1451  ParallelFor<MT>(info,n,std::forward<L>(f));
1452  } else {
1453 #ifdef AMREX_USE_SYCL
1454  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1455 #else
1456  AMREX_PRAGMA_SIMD
1457  for (T i = 0; i < n; ++i) { f(i); }
1458 #endif
1459  }
1460 }
1461 
1462 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1463 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1464 HostDeviceParallelFor (T n, L&& f) noexcept
1465 {
1466  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1467 }
1468 
1469 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1470 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1471 HostDeviceParallelFor (T n, L&& f) noexcept
1472 {
1473  HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1474 }
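HostDeviceParallelFor runs the functor as a GPU kernel when Gpu::inLaunchRegion() is true and otherwise falls back to a serial CPU loop (which, as shown above, aborts under AMREX_USE_SYCL), so the lambda must be callable on both host and device. A small sketch of the scalar form; the helper function, pointer, and scale factor are assumptions for illustration:

#include <AMReX_Gpu.H>

// Illustrative helper: scale 'n' values in place, on GPU or CPU
// depending on whether a launch region is active.
void scale (int n, amrex::Real* p, amrex::Real factor)
{
    amrex::HostDeviceParallelFor(n, [=] AMREX_GPU_HOST_DEVICE (int i) noexcept
    {
        p[i] *= factor;
    });
}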
1475 
1476 template <typename L, int dim>
1477 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1478 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1479 {
1480  if (Gpu::inLaunchRegion()) {
1481  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1482  } else {
1483 #ifdef AMREX_USE_SYCL
1484  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1485 #else
1486  LoopConcurrentOnCpu(box,std::forward<L>(f));
1487 #endif
1488  }
1489 }
1490 
1491 template <int MT, typename L, int dim>
1492 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1493 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1494 {
1495  if (Gpu::inLaunchRegion()) {
1496  ParallelFor<MT>(info, box,std::forward<L>(f));
1497  } else {
1498 #ifdef AMREX_USE_SYCL
1499  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1500 #else
1501  LoopConcurrentOnCpu(box,std::forward<L>(f));
1502 #endif
1503  }
1504 }
1505 
1506 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1507 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1508 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1509 {
1510  if (Gpu::inLaunchRegion()) {
1511  ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1512  } else {
1513 #ifdef AMREX_USE_SYCL
1514  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1515 #else
1516  LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1517 #endif
1518  }
1519 }
1520 
1521 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1522 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1523 HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1524 {
1525  if (Gpu::inLaunchRegion()) {
1526  ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1527  } else {
1528 #ifdef AMREX_USE_SYCL
1529  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1530 #else
1531  LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1532 #endif
1533  }
1534 }
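In the box-plus-ncomp overloads above the functor receives (i, j, k, n), and the CPU fallback walks the same index space through LoopConcurrentOnCpu. A sketch under the assumption that the caller supplies a MultiFab 'mf'; the helper function is illustrative, not part of this header:

#include <AMReX_Gpu.H>
#include <AMReX_MultiFab.H>

// Illustrative helper: zero every component of every FAB in 'mf'.
void zero_all (amrex::MultiFab& mf)
{
    for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
        amrex::Box const& bx = mfi.validbox();
        amrex::Array4<amrex::Real> const& a = mf.array(mfi);
        amrex::HostDeviceParallelFor(amrex::Gpu::KernelInfo{}, bx, mf.nComp(),
            [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept
        {
            a(i,j,k,n) = amrex::Real(0.0);
        });
    }
}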
1535 
1536 template <typename L1, typename L2, int dim>
1537 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1538 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1539  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1540 {
1541  if (Gpu::inLaunchRegion()) {
1542  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1543  } else {
1544 #ifdef AMREX_USE_SYCL
1545  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1546 #else
1547  LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1548  LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1549 #endif
1550  }
1551 }
1552 
1553 template <int MT, typename L1, typename L2, int dim>
1554 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1555 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1556  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1557 {
1558  if (Gpu::inLaunchRegion()) {
1559  ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1560  } else {
1561 #ifdef AMREX_USE_SYCL
1562  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1563 #else
1564  LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1565  LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1566 #endif
1567  }
1568 }
1569 
1570 template <int MT, typename L1, typename L2, typename L3, int dim>
1571 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1572 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1573  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1574  L1&& f1, L2&& f2, L3&& f3) noexcept
1575 {
1576  if (Gpu::inLaunchRegion()) {
1577  ParallelFor<MT>(info,box1,box2,box3,
1578  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1579  } else {
1580 #ifdef AMREX_USE_SYCL
1581  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1582 #else
1583  LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1584  LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1585  LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1586 #endif
1587  }
1588 }
1589 
1590 template <typename T1, typename T2, typename L1, typename L2, int dim,
1591  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1592  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1593 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1594 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1595  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1596  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1597 {
1598  if (Gpu::inLaunchRegion()) {
1599  ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1600  } else {
1601 #ifdef AMREX_USE_SYCL
1602  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1603 #else
1604  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1605  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1606 #endif
1607  }
1608 }
1609 
1610 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1611  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1612  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1613 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1614 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1615  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1616  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1617 {
1618  if (Gpu::inLaunchRegion()) {
1619  ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1620  } else {
1621 #ifdef AMREX_USE_SYCL
1622  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1623 #else
1624  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1625  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1626 #endif
1627  }
1628 }
1629 
1630 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1631  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1632  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1633  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1634 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1635 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1636  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1637  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1638  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1639 {
1640  if (Gpu::inLaunchRegion()) {
1641  ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1642  box1,ncomp1,std::forward<L1>(f1),
1643  box2,ncomp2,std::forward<L2>(f2),
1644  box3,ncomp3,std::forward<L3>(f3));
1645  } else {
1646 #ifdef AMREX_USE_SYCL
1647  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1648 #else
1649  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1650  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1651  LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1652 #endif
1653  }
1654 }
1655 
1656 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1657  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1658  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1659  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1660 std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1661 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1662  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1663  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1664  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1665 {
1666  if (Gpu::inLaunchRegion()) {
1667  ParallelFor<MT>(info,
1668  box1,ncomp1,std::forward<L1>(f1),
1669  box2,ncomp2,std::forward<L2>(f2),
1670  box3,ncomp3,std::forward<L3>(f3));
1671  } else {
1672 #ifdef AMREX_USE_SYCL
1673  amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1674 #else
1675  LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1676  LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1677  LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1678 #endif
1679  }
1680 }
1681 
1682 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1683 void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1684 {
1685  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1686 }
1687 
1688 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1689 void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1690 {
1691  HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
1692 }
1693 
1694 template <typename L, int dim>
1695 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1696 {
1697  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
1698 }
1699 
1700 template <int MT, typename L, int dim>
1701 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1702 {
1703  HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
1704 }
1705 
1706 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1707 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1708 {
1709  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1710 }
1711 
1712 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1713 void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1714 {
1715  HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1716 }
1717 
1718 template <typename L1, typename L2, int dim>
1719 void HostDeviceFor (Gpu::KernelInfo const& info,
1720  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1721 {
1722  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1723 }
1724 
1725 template <int MT, typename L1, typename L2, int dim>
1726 void HostDeviceFor (Gpu::KernelInfo const& info,
1727  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1728 {
1729  HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1730 }
1731 
1732 template <typename L1, typename L2, typename L3, int dim>
1733 void HostDeviceFor (Gpu::KernelInfo const& info,
1734  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1735  L1&& f1, L2&& f2, L3&& f3) noexcept
1736 {
1737  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
1738  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1739 }
1740 
1741 template <int MT, typename L1, typename L2, typename L3, int dim>
1742 void HostDeviceFor (Gpu::KernelInfo const& info,
1743  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1744  L1&& f1, L2&& f2, L3&& f3) noexcept
1745 {
1746  HostDeviceParallelFor<MT>(info, box1,box2,box3,
1747  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1748 }
1749 
1750 template <typename T1, typename T2, typename L1, typename L2, int dim,
1751  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1752  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1753 void HostDeviceFor (Gpu::KernelInfo const& info,
1754  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1755  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1756 {
1757  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1758 }
1759 
1760 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1761  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1762  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1763 void HostDeviceFor (Gpu::KernelInfo const& info,
1764  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1765  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1766 {
1767  HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1768 }
1769 
1770 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1771  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1772  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1773  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1774 void HostDeviceFor (Gpu::KernelInfo const& info,
1775  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1776  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1777  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1778 {
1779  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
1780  box1,ncomp1,std::forward<L1>(f1),
1781  box2,ncomp2,std::forward<L2>(f2),
1782  box3,ncomp3,std::forward<L3>(f3));
1783 }
1784 
1785 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1786  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1787  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1788  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1789 void HostDeviceFor (Gpu::KernelInfo const& info,
1790  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1791  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1792  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1793 {
1794  HostDeviceParallelFor<MT>(info,
1795  box1,ncomp1,std::forward<L1>(f1),
1796  box2,ncomp2,std::forward<L2>(f2),
1797  box3,ncomp3,std::forward<L3>(f3));
1798 }
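The HostDeviceFor overloads above are one-line forwards to the matching HostDeviceParallelFor overloads. A hedged sketch of the two-box form; the helper function and the boxes and Array4s it takes are assumed to come from the caller:

#include <AMReX_Gpu.H>
#include <AMReX_Box.H>
#include <AMReX_Array4.H>

// Illustrative helper: two independent per-cell updates issued with one call.
void update_two (amrex::Box const& bx1, amrex::Array4<amrex::Real> const& a,
                 amrex::Box const& bx2, amrex::Array4<amrex::Real> const& b)
{
    amrex::HostDeviceFor(amrex::Gpu::KernelInfo{}, bx1, bx2,
        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { a(i,j,k) += amrex::Real(1.0); },
        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { b(i,j,k) += amrex::Real(1.0); });
}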
1799 
1800 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1801 void HostDeviceParallelFor (T n, L&& f) noexcept
1802 {
1803  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
1804 }
1805 
1806 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1807 void HostDeviceParallelFor (T n, L&& f) noexcept
1808 {
1809  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
1810 }
1811 
1812 template <typename L, int dim>
1813 void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1814 {
1815  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
1816 }
1817 
1818 template <int MT, typename L, int dim>
1819 void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1820 {
1821  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
1822 }
1823 
1824 template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1825 void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1826 {
1827  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1828 }
1829 
1830 template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1831 void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1832 {
1833  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1834 }
1835 
1836 template <typename L1, typename L2, int dim>
1837 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1838 {
1839  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1840 }
1841 
1842 template <int MT, typename L1, typename L2, int dim>
1843 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1844 {
1845  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1846 }
1847 
1848 template <typename L1, typename L2, typename L3, int dim>
1849 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1850  L1&& f1, L2&& f2, L3&& f3) noexcept
1851 {
1852  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
1853  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1854 }
1855 
1856 template <int MT, typename L1, typename L2, typename L3, int dim>
1857 void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1858  L1&& f1, L2&& f2, L3&& f3) noexcept
1859 {
1860  HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
1861  std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1862 }
1863 
1864 template <typename T1, typename T2, typename L1, typename L2, int dim,
1865  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1866  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1867 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1868  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1869 {
1870  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1871 }
1872 
1873 template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1874  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1875  typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1876 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1877  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1878 {
1879  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1880 }
1881 
1882 template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1883  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1884  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1885  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1886 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1887  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1888  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1889 {
1890  HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1891  box1,ncomp1,std::forward<L1>(f1),
1892  box2,ncomp2,std::forward<L2>(f2),
1893  box3,ncomp3,std::forward<L3>(f3));
1894 }
1895 
1896 template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1897  typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1898  typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1899  typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1900 void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1901  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1902  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1903 {
1904  HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
1905  box1,ncomp1,std::forward<L1>(f1),
1906  box2,ncomp2,std::forward<L2>(f2),
1907  box3,ncomp3,std::forward<L3>(f3));
1908 }
1909 
1910 }
1911 
1912 #endif