1#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3#include <AMReX_Config.H>
12 template <
typename F,
typename N>
14 auto call_f_scalar_handler (
F const& f, N i, Gpu::Handler
const&)
15 noexcept ->
decltype(f(0))
20 template <
typename F,
typename N>
22 auto call_f_scalar_handler (
F const& f, N i, Gpu::Handler
const& handler)
23 noexcept ->
decltype(f(0,Gpu::Handler{}))
30 template <
typename F, std::size_t...Ns,
class...Args>
32 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<1> iv, Args...args)
33 noexcept -> decltype(f(0, 0, 0, args...))
35 return f(iv[0], 0, 0, args...);
38 template <
typename F, std::size_t...Ns,
class...Args>
40 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<2> iv, Args...args)
41 noexcept -> decltype(f(0, 0, 0, args...))
43 return f(iv[0], iv[1], 0, args...);
46 template <
typename F,
int dim, std::size_t...Ns,
class...Args>
48 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<dim> iv, Args...args)
49 noexcept -> decltype(f(iv, args...))
51 return f(iv, args...);
54 template <
typename F,
int dim, std::size_t...Ns,
class...Args>
56 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<dim> iv, Args...args)
57 noexcept -> decltype(f(iv[Ns]..., args...))
59 return f(iv[Ns]..., args...);
64 template <
typename F,
int dim>
66 auto call_f_intvect (
F const& f, IntVectND<dim> iv)
67 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
69 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
74 template <
typename F,
int dim>
76 auto call_f_intvect_engine (
F const& f, IntVectND<dim> iv, RandomEngine engine)
77 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
79 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
84 template <
typename F,
int dim>
86 auto call_f_intvect_handler (
F const& f, IntVectND<dim> iv, Gpu::Handler
const&)
87 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
89 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
92 template <
typename F,
int dim>
94 auto call_f_intvect_handler (
F const& f, IntVectND<dim> iv, Gpu::Handler
const& handler)
95 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
97 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
102 template <
typename F,
typename T,
int dim>
104 auto call_f_intvect_ncomp (
F const& f, IntVectND<dim> iv, T ncomp)
105 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
107 for (T n = 0; n < ncomp; ++n) {
108 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
114 template <
typename F,
typename T,
int dim>
116 auto call_f_intvect_ncomp_engine (
F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
117 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
119 for (T n = 0; n < ncomp; ++n) {
120 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
126 template <
typename F,
typename T,
int dim>
128 auto call_f_intvect_ncomp_handler (
F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler
const&)
129 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
131 for (T n = 0; n < ncomp; ++n) {
132 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
136 template <
typename F,
typename T,
int dim>
138 auto call_f_intvect_ncomp_handler (
F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler
const& handler)
139 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
141 for (T n = 0; n < ncomp; ++n) {
142 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
154 detail::SyclKernelDevPtr<L> skdp(f, stream);
155 L
const* pf = skdp.template get<0>();
158 auto& q = *(stream.queue);
160 q.submit([&] (sycl::handler& h) {
161 if constexpr (detail::is_big_kernel<L>()) {
167 }
catch (sycl::exception
const& ex) {
168 amrex::Abort(std::string(
"single_task: ")+ex.what()+
"!!!!!");
173void launch (
int nblocks,
int nthreads_per_block, std::size_t shared_mem_bytes,
176 detail::SyclKernelDevPtr<L> skdp(f, stream);
177 L
const* pf = skdp.template get<0>();
180 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
181 const std::size_t shared_mem_numull = (shared_mem_bytes+
sizeof(
unsigned long long)-1)
182 /
sizeof(
unsigned long long);
183 auto& q = *(stream.queue);
185 q.submit([&] (sycl::handler& h) {
186 sycl::local_accessor<unsigned long long>
187 shared_data(sycl::range<1>(shared_mem_numull), h);
188 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
189 sycl::range<1>(nthreads_per_block)),
190 [=] (sycl::nd_item<1> item)
193 if constexpr (detail::is_big_kernel<L>()) {
194 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
196 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
200 }
catch (sycl::exception
const& ex) {
201 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
208 detail::SyclKernelDevPtr<L> skdp(f, stream);
209 L
const* pf = skdp.template get<0>();
212 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
213 auto& q = *(stream.queue);
215 q.submit([&] (sycl::handler& h) {
216 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
217 sycl::range<1>(nthreads_per_block)),
218 [=] (sycl::nd_item<1> item)
221 if constexpr (detail::is_big_kernel<L>()) {
228 }
catch (sycl::exception
const& ex) {
229 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
233template <
int MT,
typename L>
237 detail::SyclKernelDevPtr<L> skdp(f, stream);
238 L
const* pf = skdp.template get<0>();
241 const auto nthreads_total = MT * std::size_t(nblocks);
242 const std::size_t shared_mem_numull = (shared_mem_bytes+
sizeof(
unsigned long long)-1)
243 /
sizeof(
unsigned long long);
244 auto& q = *(stream.queue);
246 q.submit([&] (sycl::handler& h) {
247 sycl::local_accessor<unsigned long long>
248 shared_data(sycl::range<1>(shared_mem_numull), h);
249 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
251 [=] (sycl::nd_item<1> item)
252 [[sycl::reqd_work_group_size(MT)]]
255 if constexpr (detail::is_big_kernel<L>()) {
256 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
258 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
262 }
catch (sycl::exception
const& ex) {
263 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
267template <
int MT,
typename L>
270 detail::SyclKernelDevPtr<L> skdp(f, stream);
271 L
const* pf = skdp.template get<0>();
274 const auto nthreads_total = MT * std::size_t(nblocks);
275 auto& q = *(stream.queue);
277 q.submit([&] (sycl::handler& h) {
278 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
280 [=] (sycl::nd_item<1> item)
281 [[sycl::reqd_work_group_size(MT)]]
284 if constexpr (detail::is_big_kernel<L>()) {
291 }
catch (sycl::exception
const& ex) {
292 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
296template<
int MT,
typename T,
typename L>
297void launch (T
const& n, L
const& f)
302 L
const* pf = skdp.template get<0>();
305 const auto ec = Gpu::makeExecutionConfig<MT>(n);
306 const auto nthreads_per_block = ec.numThreads.x;
307 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
308 auto& q = Gpu::Device::streamQueue();
310 q.submit([&] (sycl::handler& h) {
311 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
312 sycl::range<1>(nthreads_per_block)),
313 [=] (sycl::nd_item<1> item)
314 [[sycl::reqd_work_group_size(MT)]]
317 for (
auto const i :
Gpu::
Range(n,item.get_global_id(0),item.get_global_range(0))) {
318 if constexpr (detail::is_big_kernel<L>()) {
326 }
catch (sycl::exception
const& ex) {
327 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
331template <
int MT, std::
integral T,
typename L>
332void ParallelFor (Gpu::KernelInfo
const& info, T n, L
const& f)
337 L
const* pf = skdp.template get<0>();
340 const auto ec = Gpu::makeExecutionConfig<MT>(n);
341 const auto nthreads_per_block = ec.numThreads.x;
342 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
343 auto& q = Gpu::Device::streamQueue();
345 if (info.hasReduction()) {
346 q.submit([&] (sycl::handler& h) {
347 sycl::local_accessor<unsigned long long>
349 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
350 sycl::range<1>(nthreads_per_block)),
351 [=] (sycl::nd_item<1> item)
352 [[sycl::reqd_work_group_size(MT)]]
355 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
356 i < std::size_t(n); i += stride) {
357 int n_active_threads =
amrex::min(std::size_t(n)-i+item.get_local_id(0),
358 item.get_local_range(0));
359 if constexpr (detail::is_big_kernel<L>()) {
360 detail::call_f_scalar_handler(*pf, T(i),
361 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
364 detail::call_f_scalar_handler(f, T(i),
365 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
372 q.submit([&] (sycl::handler& h) {
373 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
374 sycl::range<1>(nthreads_per_block)),
375 [=] (sycl::nd_item<1> item)
376 [[sycl::reqd_work_group_size(MT)]]
379 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
380 i < std::size_t(n); i += stride) {
381 if constexpr (detail::is_big_kernel<L>()) {
382 detail::call_f_scalar_handler(*pf, T(i), Gpu::Handler{&item});
384 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
390 }
catch (sycl::exception
const& ex) {
391 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
395template <
int MT,
typename L,
int dim>
396void ParallelFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L
const& f)
401 L
const* pf = skdp.template get<0>();
404 const BoxIndexerND<dim> indexer(box);
405 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
406 const auto nthreads_per_block = ec.numThreads.x;
407 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
408 auto& q = Gpu::Device::streamQueue();
410 if (info.hasReduction()) {
411 q.submit([&] (sycl::handler& h) {
412 sycl::local_accessor<unsigned long long>
414 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
415 sycl::range<1>(nthreads_per_block)),
416 [=] (sycl::nd_item<1> item)
417 [[sycl::reqd_work_group_size(MT)]]
420 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
421 icell < indexer.numPts(); icell += stride) {
422 auto iv = indexer.intVect(icell);
423 int n_active_threads =
amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
424 std::uint64_t(item.get_local_range(0)));
425 if constexpr (detail::is_big_kernel<L>()) {
426 detail::call_f_intvect_handler(*pf,
427 iv, Gpu::Handler{&item,
428 shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
431 detail::call_f_intvect_handler(f,
432 iv, Gpu::Handler{&item,
433 shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
440 q.submit([&] (sycl::handler& h) {
441 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
442 sycl::range<1>(nthreads_per_block)),
443 [=] (sycl::nd_item<1> item)
444 [[sycl::reqd_work_group_size(MT)]]
447 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
448 icell < indexer.numPts(); icell += stride) {
449 auto iv = indexer.intVect(icell);
450 if constexpr (detail::is_big_kernel<L>()) {
451 detail::call_f_intvect_handler(*pf,iv,Gpu::Handler{&item});
453 detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
459 }
catch (sycl::exception
const& ex) {
460 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
464template <
int MT, std::
integral T,
typename L,
int dim>
465void ParallelFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L
const& f)
470 L
const* pf = skdp.template get<0>();
473 const BoxIndexerND<dim> indexer(box);
474 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
475 const auto nthreads_per_block = ec.numThreads.x;
476 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
477 auto& q = Gpu::Device::streamQueue();
479 if (info.hasReduction()) {
480 q.submit([&] (sycl::handler& h) {
481 sycl::local_accessor<unsigned long long>
483 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
484 sycl::range<1>(nthreads_per_block)),
485 [=] (sycl::nd_item<1> item)
486 [[sycl::reqd_work_group_size(MT)]]
489 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
490 icell < indexer.numPts(); icell += stride) {
491 auto iv = indexer.intVect(icell);
492 int n_active_threads =
amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
493 std::uint64_t(item.get_local_range(0)));
494 if constexpr (detail::is_big_kernel<L>()) {
495 detail::call_f_intvect_ncomp_handler(*pf, iv, ncomp,
496 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
499 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
500 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
507 q.submit([&] (sycl::handler& h) {
508 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
509 sycl::range<1>(nthreads_per_block)),
510 [=] (sycl::nd_item<1> item)
511 [[sycl::reqd_work_group_size(MT)]]
514 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
515 icell < indexer.numPts(); icell += stride) {
516 auto iv = indexer.intVect(icell);
517 if constexpr (detail::is_big_kernel<L>()) {
518 detail::call_f_intvect_ncomp_handler(*pf,iv,ncomp,Gpu::Handler{&item});
520 detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
526 }
catch (sycl::exception
const& ex) {
527 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
531template <std::
integral T,
typename L>
537 L
const* pf = skdp.template get<0>();
540 const auto ec = Gpu::ExecutionConfig(n);
541 const auto nthreads_per_block = ec.numThreads.x;
543 auto& q = Gpu::Device::streamQueue();
544 auto& engdescr = *(getRandEngineDescriptor());
546 q.submit([&] (sycl::handler& h) {
547 auto engine_acc = engdescr.get_access(h);
548 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549 sycl::range<1>(nthreads_per_block)),
550 [=] (sycl::nd_item<1> item)
551 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
554 auto const tid = item.get_global_id(0);
555 auto engine = engine_acc.load(tid);
556 RandomEngine rand_eng{&engine};
557 for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
558 if constexpr (detail::is_big_kernel<L>()) {
559 (*pf)(T(i),rand_eng);
564 engine_acc.store(engine, tid);
568 }
catch (sycl::exception
const& ex) {
569 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
573template <
typename L,
int dim>
579 L
const* pf = skdp.template get<0>();
582 const BoxIndexerND<dim> indexer(box);
583 const auto ec = Gpu::ExecutionConfig(box.numPts());
584 const auto nthreads_per_block = ec.numThreads.x;
586 auto& q = Gpu::Device::streamQueue();
587 auto& engdescr = *(getRandEngineDescriptor());
589 q.submit([&] (sycl::handler& h) {
590 auto engine_acc = engdescr.get_access(h);
591 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
592 sycl::range<1>(nthreads_per_block)),
593 [=] (sycl::nd_item<1> item)
594 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
597 auto const tid = item.get_global_id(0);
598 auto engine = engine_acc.load(tid);
599 RandomEngine rand_eng{&engine};
600 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
601 icell < indexer.numPts(); icell += stride) {
602 auto iv = indexer.intVect(icell);
603 if constexpr (detail::is_big_kernel<L>()) {
604 detail::call_f_intvect_engine(*pf,iv,rand_eng);
606 detail::call_f_intvect_engine(f,iv,rand_eng);
609 engine_acc.store(engine, tid);
613 }
catch (sycl::exception
const& ex) {
614 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
618template <std::
integral T,
typename L,
int dim>
624 L
const* pf = skdp.template get<0>();
627 const BoxIndexerND<dim> indexer(box);
628 const auto ec = Gpu::ExecutionConfig(box.numPts());
629 const auto nthreads_per_block = ec.numThreads.x;
631 auto& q = Gpu::Device::streamQueue();
632 auto& engdescr = *(getRandEngineDescriptor());
634 q.submit([&] (sycl::handler& h) {
635 auto engine_acc = engdescr.get_access(h);
636 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
637 sycl::range<1>(nthreads_per_block)),
638 [=] (sycl::nd_item<1> item)
639 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
642 auto const tid = item.get_global_id(0);
643 auto engine = engine_acc.load(tid);
644 RandomEngine rand_eng{&engine};
645 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
646 icell < indexer.numPts(); icell += stride) {
647 auto iv = indexer.intVect(icell);
648 if constexpr (detail::is_big_kernel<L>()) {
649 detail::call_f_intvect_ncomp_engine(*pf,iv,ncomp,rand_eng);
651 detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
654 engine_acc.store(engine, tid);
658 }
catch (sycl::exception
const& ex) {
659 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
663template <
int MT,
typename L1,
typename L2,
int dim>
664void ParallelFor (Gpu::KernelInfo
const& , BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1
const& f1, L2
const& f2)
669 L1
const* pf1 = skdp.template get<0>();
670 L2
const* pf2 = skdp.template get<1>();
673 const BoxIndexerND<dim> indexer1(box1);
674 const BoxIndexerND<dim> indexer2(box2);
675 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
676 const auto nthreads_per_block = ec.numThreads.x;
677 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
678 auto& q = Gpu::Device::streamQueue();
680 q.submit([&] (sycl::handler& h) {
681 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
682 sycl::range<1>(nthreads_per_block)),
683 [=] (sycl::nd_item<1> item)
684 [[sycl::reqd_work_group_size(MT)]]
687 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
688 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
689 icell < ncells; icell += stride) {
690 if (icell < indexer1.numPts()) {
691 auto iv = indexer1.intVect(icell);
692 if constexpr (detail::is_big_kernel<L1,L2>()) {
693 detail::call_f_intvect(*pf1,iv);
695 detail::call_f_intvect(f1,iv);
698 if (icell < indexer2.numPts()) {
699 auto iv = indexer2.intVect(icell);
700 if constexpr (detail::is_big_kernel<L1,L2>()) {
701 detail::call_f_intvect(*pf2,iv);
703 detail::call_f_intvect(f2,iv);
709 }
catch (sycl::exception
const& ex) {
710 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
714template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
716 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
717 L1
const& f1, L2
const& f2, L3
const& f3)
721 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3,
Gpu::gpuStream());
722 L1
const* pf1 = skdp.template get<0>();
723 L2
const* pf2 = skdp.template get<1>();
724 L3
const* pf3 = skdp.template get<2>();
727 const BoxIndexerND<dim> indexer1(box1);
728 const BoxIndexerND<dim> indexer2(box2);
729 const BoxIndexerND<dim> indexer3(box3);
730 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
731 const auto nthreads_per_block = ec.numThreads.x;
732 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
733 auto& q = Gpu::Device::streamQueue();
735 q.submit([&] (sycl::handler& h) {
736 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
737 sycl::range<1>(nthreads_per_block)),
738 [=] (sycl::nd_item<1> item)
739 [[sycl::reqd_work_group_size(MT)]]
742 auto const ncells =
amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
743 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
744 icell < ncells; icell += stride) {
745 if (icell < indexer1.numPts()) {
746 auto iv = indexer1.intVect(icell);
747 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
748 detail::call_f_intvect(*pf1,iv);
750 detail::call_f_intvect(f1,iv);
753 if (icell < indexer2.numPts()) {
754 auto iv = indexer2.intVect(icell);
755 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
756 detail::call_f_intvect(*pf2,iv);
758 detail::call_f_intvect(f2,iv);
761 if (icell < indexer3.numPts()) {
762 auto iv = indexer3.intVect(icell);
763 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
764 detail::call_f_intvect(*pf3,iv);
766 detail::call_f_intvect(f3,iv);
772 }
catch (sycl::exception
const& ex) {
773 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
777template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
779 BoxND<dim>
const& box1, T1 ncomp1, L1
const& f1,
780 BoxND<dim>
const& box2, T2 ncomp2, L2
const& f2)
785 L1
const* pf1 = skdp.template get<0>();
786 L2
const* pf2 = skdp.template get<1>();
789 const BoxIndexerND<dim> indexer1(box1);
790 const BoxIndexerND<dim> indexer2(box2);
791 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
792 const auto nthreads_per_block = ec.numThreads.x;
793 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
794 auto& q = Gpu::Device::streamQueue();
796 q.submit([&] (sycl::handler& h) {
797 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
798 sycl::range<1>(nthreads_per_block)),
799 [=] (sycl::nd_item<1> item)
800 [[sycl::reqd_work_group_size(MT)]]
803 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
804 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
805 icell < ncells; icell += stride) {
806 if (icell < indexer1.numPts()) {
807 auto iv = indexer1.intVect(icell);
808 if constexpr (detail::is_big_kernel<L1,L2>()) {
809 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
811 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
814 if (icell < indexer2.numPts()) {
815 auto iv = indexer2.intVect(icell);
816 if constexpr (detail::is_big_kernel<L1,L2>()) {
817 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
819 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
825 }
catch (sycl::exception
const& ex) {
826 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
830template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
832 BoxND<dim>
const& box1, T1 ncomp1, L1
const& f1,
833 BoxND<dim>
const& box2, T2 ncomp2, L2
const& f2,
834 BoxND<dim>
const& box3, T3 ncomp3, L3
const& f3)
838 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3,
Gpu::gpuStream());
839 L1
const* pf1 = skdp.template get<0>();
840 L2
const* pf2 = skdp.template get<1>();
841 L3
const* pf3 = skdp.template get<2>();
844 const BoxIndexerND<dim> indexer1(box1);
845 const BoxIndexerND<dim> indexer2(box2);
846 const BoxIndexerND<dim> indexer3(box3);
847 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
848 const auto nthreads_per_block = ec.numThreads.x;
849 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
850 auto& q = Gpu::Device::streamQueue();
852 q.submit([&] (sycl::handler& h) {
853 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
854 sycl::range<1>(nthreads_per_block)),
855 [=] (sycl::nd_item<1> item)
856 [[sycl::reqd_work_group_size(MT)]]
859 auto const ncells =
amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
860 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
861 icell < ncells; icell += stride) {
862 if (icell < indexer1.numPts()) {
863 auto iv = indexer1.intVect(icell);
864 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
865 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
867 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
870 if (icell < indexer2.numPts()) {
871 auto iv = indexer2.intVect(icell);
872 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
873 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
875 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
878 if (icell < indexer3.numPts()) {
879 auto iv = indexer3.intVect(icell);
880 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
881 detail::call_f_intvect_ncomp(*pf3,iv,ncomp3);
883 detail::call_f_intvect_ncomp(f3,iv,ncomp3);
889 }
catch (sycl::exception
const& ex) {
890 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
904template <
int MT,
typename L>
912template <
int MT,
typename L>
920void launch (
int nblocks,
int nthreads_per_block, std::size_t shared_mem_bytes,
930 launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
933template<
int MT, std::
integral T,
typename L>
936 static_assert(
sizeof(T) >= 2);
938 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
939 for (
auto const& ec : nec) {
940 const T start_idx = T(ec.start_idx);
941 const T nleft = n - start_idx;
945 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
954template<
int MT,
int dim,
typename L>
958 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
960 const auto type = box.
ixType();
961 for (
auto const& ec : nec) {
962 const auto start_idx = std::uint64_t(ec.start_idx);
965 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
966 if (icell < indexer.
numPts()) {
967 auto iv = indexer.
intVect(icell);
979template <
int MT, std::
integral T,
typename L>
980requires (MaybeDeviceRunnable<L>::value)
984 static_assert(
sizeof(T) >= 2);
986 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
987 for (
auto const& ec : nec) {
988 const T start_idx = T(ec.start_idx);
989 const T nleft = n - start_idx;
993 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
995 detail::call_f_scalar_handler(f, tid+start_idx,
997 (std::uint64_t)MT)));
1008template <
int MT,
typename L,
int dim>
1009requires (MaybeDeviceRunnable<L>::value)
1015 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1016 for (
auto const& ec : nec) {
1017 const auto start_idx = std::uint64_t(ec.start_idx);
1020 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1021 if (icell < indexer.
numPts()) {
1022 auto iv = indexer.
intVect(icell);
1023 detail::call_f_intvect_handler(f, iv,
1025 (std::uint64_t)MT)));
1036template <
int MT, std::
integral T,
typename L,
int dim>
1037requires (MaybeDeviceRunnable<L>::value)
1043 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1044 for (
auto const& ec : nec) {
1045 const auto start_idx = std::uint64_t(ec.start_idx);
1048 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1049 if (icell < indexer.
numPts()) {
1050 auto iv = indexer.
intVect(icell);
1051 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
1053 (std::uint64_t)MT)));
1065template <std::
integral T,
typename L>
1066requires (MaybeDeviceRunnable<L>::value)
1077 Long tid =
Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1079 for (
Long i = tid, stride =
Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i <
Long(n); i += stride) {
1092template <
typename L,
int dim>
1093requires (MaybeDeviceRunnable<L>::value)
1105 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1107 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.
numPts(); icell += stride) {
1108 auto iv = indexer.
intVect(icell);
1109 detail::call_f_intvect_engine(f, iv, engine);
1121template <std::
integral T,
typename L,
int dim>
1122requires (MaybeDeviceRunnable<L>::value)
1134 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1136 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.
numPts(); icell += stride) {
1137 auto iv = indexer.
intVect(icell);
1138 detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
1149template <
int MT,
typename L1,
typename L2,
int dim>
1150requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1158 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.
numPts(),box2.
numPts()));
1161 auto const ncells = std::max(indexer1.
numPts(), indexer2.
numPts());
1162 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1163 icell < ncells; icell += stride) {
1164 if (icell < indexer1.
numPts()) {
1165 auto iv = indexer1.
intVect(icell);
1166 detail::call_f_intvect(f1, iv);
1168 if (icell < indexer2.
numPts()) {
1169 auto iv = indexer2.
intVect(icell);
1170 detail::call_f_intvect(f2, iv);
1181template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1182requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1183 MaybeDeviceRunnable<L3>::value)
1187 L1&& f1, L2&& f2, L3&& f3)
1193 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.
numPts(),box2.
numPts(),box3.
numPts()}));
1196 auto const ncells = std::max({indexer1.
numPts(), indexer2.
numPts(), indexer3.
numPts()});
1197 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1198 icell < ncells; icell += stride) {
1199 if (icell < indexer1.
numPts()) {
1200 auto iv = indexer1.
intVect(icell);
1201 detail::call_f_intvect(f1, iv);
1203 if (icell < indexer2.
numPts()) {
1204 auto iv = indexer2.
intVect(icell);
1205 detail::call_f_intvect(f2, iv);
1207 if (icell < indexer3.
numPts()) {
1208 auto iv = indexer3.
intVect(icell);
1209 detail::call_f_intvect(f3, iv);
1220template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1221requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1230 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.
numPts(),box2.
numPts()));
1233 auto const ncells = std::max(indexer1.
numPts(), indexer2.
numPts());
1234 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1235 icell < ncells; icell += stride) {
1236 if (icell < indexer1.
numPts()) {
1237 auto iv = indexer1.
intVect(icell);
1238 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1240 if (icell < indexer2.
numPts()) {
1241 auto iv = indexer2.
intVect(icell);
1242 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1253template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1254requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1255 MaybeDeviceRunnable<L3>::value)
1266 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.
numPts(),box2.
numPts(),box3.
numPts()}));
1269 auto const ncells = std::max({indexer1.
numPts(), indexer2.
numPts(), indexer3.
numPts()});
1270 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1271 icell < ncells; icell += stride) {
1272 if (icell < indexer1.
numPts()) {
1273 auto iv = indexer1.
intVect(icell);
1274 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1276 if (icell < indexer2.
numPts()) {
1277 auto iv = indexer2.
intVect(icell);
1278 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1280 if (icell < indexer3.
numPts()) {
1281 auto iv = indexer3.
intVect(icell);
1282 detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1291template <
typename L>
1297template<
typename T,
typename L>
1298void launch (T
const& n, L&& f)
noexcept
1300 launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1307template <std::
integral T,
typename L>
1308requires (MaybeDeviceRunnable<L>::value)
1312 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1319template <
typename L,
int dim>
1320requires (MaybeDeviceRunnable<L>::value)
1324 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1331template <std::
integral T,
typename L,
int dim>
1332requires (MaybeDeviceRunnable<L>::value)
1336 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1343template <
typename L1,
typename L2,
int dim>
1344requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1349 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1350 std::forward<L2>(f2));
1357template <
typename L1,
typename L2,
typename L3,
int dim>
1358requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1359 MaybeDeviceRunnable<L3>::value)
1363 L1&& f1, L2&& f2, L3&& f3)
noexcept
1365 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1366 std::forward<L2>(f2), std::forward<L3>(f3));
1373template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1374requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1378 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
noexcept
1380 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1381 box2, ncomp2, std::forward<L2>(f2));
1388template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1389requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1390 MaybeDeviceRunnable<L3>::value)
1395 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
noexcept
1397 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1398 box2, ncomp2, std::forward<L2>(f2),
1399 box3, ncomp3, std::forward<L3>(f3));
1402template <std::
integral T,
typename L>
1403void For (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
1405 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1408template <
int MT, std::
integral T,
typename L>
1409void For (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
1411 ParallelFor<MT>(info, n,std::forward<L>(f));
1414template <
typename L,
int dim>
1415void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
1417 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1420template <
int MT,
typename L,
int dim>
1421void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
1423 ParallelFor<MT>(info, box,std::forward<L>(f));
1426template <std::
integral T,
typename L,
int dim>
1427void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1429 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1432template <
int MT, std::
integral T,
typename L,
int dim>
1433void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1435 ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1438template <
typename L1,
typename L2,
int dim>
1439void For (Gpu::KernelInfo
const& info,
1440 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1442 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1445template <
int MT,
typename L1,
typename L2,
int dim>
1446void For (Gpu::KernelInfo
const& info,
1447 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1449 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1452template <
typename L1,
typename L2,
typename L3,
int dim>
1453void For (Gpu::KernelInfo
const& info,
1454 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1455 L1&& f1, L2&& f2, L3&& f3)
noexcept
1457 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1460template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1461void For (Gpu::KernelInfo
const& info,
1462 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1463 L1&& f1, L2&& f2, L3&& f3)
noexcept
1465 ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1468template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1469void For (Gpu::KernelInfo
const& info,
1470 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1471 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1473 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1476template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1477void For (Gpu::KernelInfo
const& info,
1478 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1479 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1481 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1484template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1485void For (Gpu::KernelInfo
const& info,
1486 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1487 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1488 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1490 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1491 box1,ncomp1,std::forward<L1>(f1),
1492 box2,ncomp2,std::forward<L2>(f2),
1493 box3,ncomp3,std::forward<L3>(f3));
1496template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1497void For (Gpu::KernelInfo
const& info,
1498 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1499 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1500 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1502 ParallelFor<MT>(info,
1503 box1,ncomp1,std::forward<L1>(f1),
1504 box2,ncomp2,std::forward<L2>(f2),
1505 box3,ncomp3,std::forward<L3>(f3));
1512template <std::
integral T,
typename L>
1515 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n, std::forward<L>(f));
1522template <
int MT, std::
integral T,
typename L>
1525 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1532template <
typename L,
int dim>
1535 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, box, std::forward<L>(f));
1542template <
int MT,
typename L,
int dim>
1543void ParallelFor (BoxND<dim>
const& box, L&& f)
noexcept
1545 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1552template <std::
integral T,
typename L,
int dim>
1555 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1562template <
int MT, std::
integral T,
typename L,
int dim>
1563void ParallelFor (BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1565 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1572template <
typename L1,
typename L2,
int dim>
1573void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1575 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1582template <
int MT,
typename L1,
typename L2,
int dim>
1583void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1585 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1592template <
typename L1,
typename L2,
typename L3,
int dim>
1593void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1594 L1&& f1, L2&& f2, L3&& f3)
noexcept
1596 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1603template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1604void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1605 L1&& f1, L2&& f2, L3&& f3)
noexcept
1607 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1614template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1615void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1616 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1618 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1625template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1626void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1627 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1629 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1636template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1637void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1638 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1639 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1641 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1642 box1,ncomp1,std::forward<L1>(f1),
1643 box2,ncomp2,std::forward<L2>(f2),
1644 box3,ncomp3,std::forward<L3>(f3));
1651template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1652void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1653 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1654 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1656 ParallelFor<MT>(Gpu::KernelInfo{},
1657 box1,ncomp1,std::forward<L1>(f1),
1658 box2,ncomp2,std::forward<L2>(f2),
1659 box3,ncomp3,std::forward<L3>(f3));
1662template <std::
integral T,
typename L>
1665 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n,std::forward<L>(f));
1668template <
int MT, std::
integral T,
typename L>
1669void For (T n, L&& f)
noexcept
1671 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1674template <
typename L,
int dim>
1677 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, box,std::forward<L>(f));
1680template <
int MT,
typename L,
int dim>
1681void For (BoxND<dim>
const& box, L&& f)
noexcept
1683 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1686template <std::
integral T,
typename L,
int dim>
1689 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1692template <
int MT, std::
integral T,
typename L,
int dim>
1693void For (BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1695 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1698template <
typename L1,
typename L2,
int dim>
1699void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1701 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1704template <
int MT,
typename L1,
typename L2,
int dim>
1705void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1707 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1710template <
typename L1,
typename L2,
typename L3,
int dim>
1711void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1712 L1&& f1, L2&& f2, L3&& f3)
noexcept
1714 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1717template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1718void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1719 L1&& f1, L2&& f2, L3&& f3)
noexcept
1721 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1724template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1725void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1726 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1728 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1731template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1732void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1733 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1735 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1738template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1739void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1740 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1741 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1743 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1744 box1,ncomp1,std::forward<L1>(f1),
1745 box2,ncomp2,std::forward<L2>(f2),
1746 box3,ncomp3,std::forward<L3>(f3));
1749template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1750void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1751 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1752 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1754 ParallelFor<MT>(Gpu::KernelInfo{},
1755 box1,ncomp1,std::forward<L1>(f1),
1756 box2,ncomp2,std::forward<L2>(f2),
1757 box3,ncomp3,std::forward<L3>(f3));
1760template <std::
integral T,
typename L>
1761requires (MaybeHostDeviceRunnable<L>::value)
1766 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1768#ifdef AMREX_USE_SYCL
1769 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1772 for (T i = 0; i < n; ++i) { f(i); }
1777template <
int MT, std::
integral T,
typename L>
1778requires (MaybeHostDeviceRunnable<L>::value)
1783 ParallelFor<MT>(info,n,std::forward<L>(f));
1785#ifdef AMREX_USE_SYCL
1786 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1789 for (T i = 0; i < n; ++i) { f(i); }
1794template <std::
integral T,
typename L>
1795requires (MaybeHostDeviceRunnable<L>::value)
1799 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n, std::forward<L>(f));
1802template <
int MT, std::
integral T,
typename L>
1803requires (MaybeHostDeviceRunnable<L>::value)
1810template <
typename L,
int dim>
1811requires (MaybeHostDeviceRunnable<L>::value)
1816 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1818#ifdef AMREX_USE_SYCL
1819 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1826template <
int MT,
typename L,
int dim>
1827requires (MaybeHostDeviceRunnable<L>::value)
1832 ParallelFor<MT>(info, box,std::forward<L>(f));
1834#ifdef AMREX_USE_SYCL
1835 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1842template <std::
integral T,
typename L,
int dim>
1843requires (MaybeHostDeviceRunnable<L>::value)
1848 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1850#ifdef AMREX_USE_SYCL
1851 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1858template <
int MT, std::
integral T,
typename L,
int dim>
1859requires (MaybeHostDeviceRunnable<L>::value)
1864 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1866#ifdef AMREX_USE_SYCL
1867 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1874template <
typename L1,
typename L2,
int dim>
1875requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1881 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1883#ifdef AMREX_USE_SYCL
1884 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1892template <
int MT,
typename L1,
typename L2,
int dim>
1893requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1899 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1901#ifdef AMREX_USE_SYCL
1902 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1910template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1911requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value &&
1912 MaybeHostDeviceRunnable<L3>::value)
1916 L1&& f1, L2&& f2, L3&& f3)
1919 ParallelFor<MT>(info,box1,box2,box3,
1920 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1922#ifdef AMREX_USE_SYCL
1923 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1932template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1933requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1940 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1942#ifdef AMREX_USE_SYCL
1943 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1951template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
1952requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1959 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1961#ifdef AMREX_USE_SYCL
1962 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1970template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1971requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value &&
1972 MaybeHostDeviceRunnable<L3>::value)
1980 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1981 box1,ncomp1,std::forward<L1>(f1),
1982 box2,ncomp2,std::forward<L2>(f2),
1983 box3,ncomp3,std::forward<L3>(f3));
1985#ifdef AMREX_USE_SYCL
1986 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1995template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
1996requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value &&
1997 MaybeHostDeviceRunnable<L3>::value)
2005 ParallelFor<MT>(info,
2006 box1,ncomp1,std::forward<L1>(f1),
2007 box2,ncomp2,std::forward<L2>(f2),
2008 box3,ncomp3,std::forward<L3>(f3));
2010#ifdef AMREX_USE_SYCL
2011 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
2020template <std::
integral T,
typename L>
2021void HostDeviceFor (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
2023 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
2026template <
int MT, std::
integral T,
typename L>
2027void HostDeviceFor (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
2029 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
2032template <
typename L,
int dim>
2033void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
2035 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
2038template <
int MT,
typename L,
int dim>
2039void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
2041 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
2044template <std::
integral T,
typename L,
int dim>
2045void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
2047 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
2050template <
int MT, std::
integral T,
typename L,
int dim>
2051void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
2053 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
2056template <
typename L1,
typename L2,
int dim>
2058 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2060 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2063template <
int MT,
typename L1,
typename L2,
int dim>
2065 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2067 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2070template <
typename L1,
typename L2,
typename L3,
int dim>
2072 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
2073 L1&& f1, L2&& f2, L3&& f3)
noexcept
2075 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
2076 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2079template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
2081 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
2082 L1&& f1, L2&& f2, L3&& f3)
noexcept
2084 HostDeviceParallelFor<MT>(info, box1,box2,box3,
2085 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2088template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
2090 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2091 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2093 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2096template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
2098 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2099 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2101 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2104template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
2106 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2107 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2108 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2110 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
2111 box1,ncomp1,std::forward<L1>(f1),
2112 box2,ncomp2,std::forward<L2>(f2),
2113 box3,ncomp3,std::forward<L3>(f3));
2116template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
2118 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2119 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2120 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2122 HostDeviceParallelFor<MT>(info,
2123 box1,ncomp1,std::forward<L1>(f1),
2124 box2,ncomp2,std::forward<L2>(f2),
2125 box3,ncomp3,std::forward<L3>(f3));
2128template <std::
integral T,
typename L>
2131 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
2134template <
int MT, std::
integral T,
typename L>
2137 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
2140template <
typename L,
int dim>
2143 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
2146template <
int MT,
typename L,
int dim>
2149 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
2152template <std::
integral T,
typename L,
int dim>
2155 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2158template <
int MT, std::
integral T,
typename L,
int dim>
2161 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2164template <
typename L1,
typename L2,
int dim>
2165void HostDeviceParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2167 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2170template <
int MT,
typename L1,
typename L2,
int dim>
2171void HostDeviceParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2173 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2176template <
typename L1,
typename L2,
typename L3,
int dim>
2178 L1&& f1, L2&& f2, L3&& f3)
noexcept
2180 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
2181 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2184template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
2186 L1&& f1, L2&& f2, L3&& f3)
noexcept
2188 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
2189 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2192template <std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
2194 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2196 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2199template <
int MT, std::
integral T1, std::
integral T2,
typename L1,
typename L2,
int dim>
2201 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2203 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2206template <std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
2208 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2209 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2211 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
2212 box1,ncomp1,std::forward<L1>(f1),
2213 box2,ncomp2,std::forward<L2>(f2),
2214 box3,ncomp3,std::forward<L3>(f3));
2217template <
int MT, std::
integral T1, std::
integral T2, std::
integral T3,
typename L1,
typename L2,
typename L3,
int dim>
2219 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2220 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2222 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
2223 box1,ncomp1,std::forward<L1>(f1),
2224 box2,ncomp2,std::forward<L2>(f2),
2225 box3,ncomp3,std::forward<L3>(f3));
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37
#define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:39
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
__host__ __device__ bool isEmpty() const noexcept
Checks if it is an empty BoxND.
Definition AMReX_Box.H:208
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:136
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:239
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
Definition AMReX_GpuKernelInfo.H:8
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:128
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:88
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
void HostDeviceParallelFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:726
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79
randState_t * getRandState()
Definition AMReX_RandomEngine.H:65
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202
void HostDeviceFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:829
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:25
void launch(T const &n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:122
bool isEmpty(T n) noexcept
Definition AMReX_GpuRange.H:15
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:45
void single_task(L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1239
AMREX_ATTRIBUTE_FLATTEN_FOR void ParallelForRNG(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1151
curandState_t randState_t
Definition AMReX_RandomEngine.H:58
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1334
Definition AMReX_Box.H:2170
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2187
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2211
Definition AMReX_GpuLaunch.H:120
Definition AMReX_GpuTypes.H:86
Definition AMReX_RandomEngine.H:72