#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
#define AMREX_GPU_LAUNCH_FUNCTS_G_H_

#include <AMReX_Config.H>

namespace amrex {

namespace detail {
    // f(i): the lambda takes only a scalar index; the Gpu::Handler is unused.
    template <typename F, typename N>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
        noexcept -> decltype(f(0))
    {
        f(i);
    }

    // f(i,handler): the lambda also receives the Gpu::Handler.
    template <typename F, typename N>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
        noexcept -> decltype(f(0,Gpu::Handler{}))
    {
        f(i, handler);
    }

    // 1D boxes: pad the missing j and k arguments with zeros.
    template <typename F, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        f(iv[0], 0, 0, args...);
    }

    // 2D boxes: pad the missing k argument with zero.
    template <typename F, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        f(iv[0], iv[1], 0, args...);
    }

    // The lambda takes the IntVectND itself.
    template <typename F, int dim, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
        noexcept -> decltype(f(iv, args...))
    {
        f(iv, args...);
    }

    // The lambda takes dim separate integer coordinates.
    template <typename F, int dim, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
        noexcept -> decltype(f(iv[Ns]..., args...))
    {
        f(iv[Ns]..., args...);
    }
    // Dispatchers: unpack an IntVectND into whichever signature the lambda accepts.
    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect (F const& f, IntVectND<dim> iv)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    { call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv); }

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
    { call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine); }

    // Two overloads: the Gpu::Handler is dropped or forwarded depending on
    // whether the lambda accepts it.
    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    { call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv); }

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
    { call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler); }

    // ncomp variants loop the component index n at each cell.
    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
        }
    }

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
        }
    }

    // call_f_intvect_ncomp_handler has two analogous overloads (with and
    // without forwarding the Gpu::Handler to the lambda):
    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
        }
    }

} // namespace detail
#ifdef AMREX_USE_SYCL

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.single_task([=] () { f(); });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
    }
}
template <typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    // Round the requested shared memory up to whole unsigned long longs.
    const std::size_t shared_mem_numull =
        (shared_mem_bytes + sizeof(unsigned long long) - 1) / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
            {
                f(Gpu::Handler{&item,
                    shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
            {
                // body elided in the listing; reconstructed by analogy with
                // the shared-memory overload above
                f(Gpu::Handler{&item});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    const std::size_t shared_mem_numull =
        (shared_mem_bytes + sizeof(unsigned long long) - 1) / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
            {
                f(Gpu::Handler{&item,
                    shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
            {
                // body elided in the listing; reconstructed by analogy with
                // the shared-memory overload above
                f(Gpu::Handler{&item});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L>
void launch (T const& n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
            {
                for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
                    f(i);
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                               [=] (sycl::nd_item<1> item)
                               [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
                                                          item.get_local_range(0));
                        detail::call_f_scalar_handler(f, T(i),
                            Gpu::Handler{&item,
                                shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                               [=] (sycl::nd_item<1> item)
                               [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                               [=] (sycl::nd_item<1> item)
                               [[sycl::reqd_work_group_size(MT)]]
                               [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_handler(f, iv,
                            Gpu::Handler{&item,
                                shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                               [=] (sycl::nd_item<1> item)
                               [[sycl::reqd_work_group_size(MT)]]
                               [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                               [=] (sycl::nd_item<1> item)
                               [[sycl::reqd_work_group_size(MT)]]
                               [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                            Gpu::Handler{&item,
                                shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                               [=] (sycl::nd_item<1> item)
                               [[sycl::reqd_work_group_size(MT)]]
                               [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp, Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::ExecutionConfig(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
                    f(T(i), rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_engine(f, iv, rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_ncomp_engine(f, iv, ncomp, rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L1, typename L2, int dim>
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1, iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2, iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1, iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2, iv);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect(f3, iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1, iv, ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2, iv, ncomp2);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
                           [=] (sycl::nd_item<1> item)
                           [[sycl::reqd_work_group_size(MT)]]
                           [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1, iv, ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2, iv, ncomp2);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect_ncomp(f3, iv, ncomp3);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
#else
// CUDA and HIP backends. Bodies elided in the listing are reconstructed
// following the AMREX_LAUNCH_KERNEL pattern.

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    launch<MT>(nblocks, 0, stream, f);
}

template <typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block,
                        shared_mem_bytes, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
{
    launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}
template<int MT, typename T, typename L>
void launch (T const& n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        for (auto const i : Gpu::Range(n)) { f(i); }
    });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x;
             i < Long(n); i += stride) {
            detail::call_f_scalar_handler(f, T(i),
                Gpu::Handler(amrex::min((std::uint64_t(n)-i+(std::uint64_t)threadIdx.x),
                                        (std::uint64_t)blockDim.x)));
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
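
// The grid-stride idiom above lets a fixed-size grid cover any n: thread
// tid handles i = tid, tid + stride, tid + 2*stride, ..., where stride is
// the total number of threads in the grid. A minimal standalone sketch
// (assumed, for illustration only):
//
//   __global__ void axpy (int n, float a, float const* x, float* y) {
//       for (long i = long(blockDim.x)*blockIdx.x + threadIdx.x,
//                 stride = long(blockDim.x)*gridDim.x;
//            i < n; i += stride) {
//           y[i] += a * x[i];
//       }
//   }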
template <int MT, typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x,
                 stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < indexer.numPts(); icell += stride)
        {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_handler(f, iv,
                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                        (std::uint64_t)blockDim.x)));
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x,
                 stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                        (std::uint64_t)blockDim.x)));
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(n);
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) {
            f(T(i), engine);
        }
    });
    Gpu::streamSynchronize(); // keep the shared RNG state off other streams
    AMREX_GPU_ERROR_CHECK();
}
template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_engine(f, iv, engine);
        }
    });
    Gpu::streamSynchronize(); // keep the shared RNG state off other streams
    AMREX_GPU_ERROR_CHECK();
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
        }
    });
    Gpu::streamSynchronize(); // keep the shared RNG state off other streams
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x,
                 stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x,
                 stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect(f3, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x,
                 stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x,
                 stride = std::uint64_t(blockDim.x)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect_ncomp(f3, iv, ncomp3);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
#endif

// The rest of this header is shared by all GPU backends: thin wrappers
// that forward to the implementations above.

template <typename L>
void single_task (L&& f) noexcept
{
    single_task(Gpu::gpuStream(), std::forward<L>(f));
}

template<typename T, typename L>
void launch (T const& n, L&& f) noexcept
{
    launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}
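
// --- Example (illustrative sketch; not part of the original header) ---
// The <MT> template parameter fixes the block size at compile time; the
// unqualified wrappers default it to AMREX_GPU_MAX_THREADS. E.g. a
// 128-thread launch:
//
//   amrex::ParallelFor<128>(amrex::Gpu::KernelInfo{}, n,
//       [=] AMREX_GPU_DEVICE (int i) noexcept { /* work on i */ });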
template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f)); }

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f)); }
template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
                                       std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
                                       std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f)); }

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{ ParallelFor<MT>(info, n, std::forward<L>(f)); }

template <typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f)); }

template <int MT, typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<MT>(info, box, std::forward<L>(f)); }

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f)); }

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<MT>(info, box, ncomp, std::forward<L>(f)); }
template <typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <int MT, typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ ParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3)); }

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{ ParallelFor<MT>(info, box1, box2, box3, std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3)); }
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ ParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(info,
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (T n, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f)); }

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (T n, L&& f) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f)); }

template <typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f)); }

template <int MT, typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f)); }

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f)); }

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f)); }
template <typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <int MT, typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3, std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3)); }

template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3, std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3)); }
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (T n, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f)); }

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (T n, L&& f) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f)); }

template <typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f)); }

template <int MT, typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f)); }

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f)); }

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f)); }
template <typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <int MT, typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3, std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3)); }

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3, std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3)); }
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ ParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        AMREX_PRAGMA_SIMD
        for (T i = 0; i < n; ++i) { f(i); }
#endif
    }
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, n, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        AMREX_PRAGMA_SIMD
        for (T i = 0; i < n; ++i) { f(i); }
#endif
    }
}
1463 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1466 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n, std::forward<L>(
f));
1469 template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1470 std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
template <typename L, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, std::forward<L>(f));
#endif
    }
}

template <int MT, typename L, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, std::forward<L>(f));
#endif
    }
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f));
#endif
    }
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f));
#endif
    }
}
template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                       L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, box3,
                        std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, std::forward<L3>(f3));
#endif
    }
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
    }
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                           box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2),
                                           box3, ncomp3, std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
    }
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info,
                        box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2),
                        box3, ncomp3, std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
    }
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f)); }

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{ HostDeviceParallelFor<MT>(info, n, std::forward<L>(f)); }

template <typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f)); }

template <int MT, typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{ HostDeviceParallelFor<MT>(info, box, std::forward<L>(f)); }

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f)); }

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ HostDeviceParallelFor<MT>(info, box, ncomp, std::forward<L>(f)); }
template <typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <int MT, typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ HostDeviceParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3,
                                                 std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, box2, box3,
                              std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ HostDeviceParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}
// Names and signatures of the next six forwarders are elided in the
// listing; they are reconstructed here as the info-less HostDeviceFor
// overloads, matching the calls shown.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (T n, L&& f) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f)); }

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (T n, L&& f) noexcept
{ HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f)); }

template <typename L, int dim>
void HostDeviceFor (BoxND<dim> const& box, L&& f) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f)); }

template <int MT, typename L, int dim>
void HostDeviceFor (BoxND<dim> const& box, L&& f) noexcept
{ HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f)); }

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f)); }

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void HostDeviceFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{ HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f)); }
template <typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <int MT, typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{ HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2)); }

template <typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                                 std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                              std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{ HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1), box2, ncomp2, std::forward<L2>(f2)); }

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}
} // namespace amrex

#endif /* AMREX_GPU_LAUNCH_FUNCTS_G_H_ */