1#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3#include <AMReX_Config.H>
// Dispatch helper: invoke a 1D-kernel functor that takes only a scalar
// index (selected via SFINAE on decltype(f(0))).  The Gpu::Handler
// parameter is accepted but unused by this overload.
// NOTE(review): lossy extraction -- the function body and qualifiers
// (original lines 16-19) are missing from this chunk; presumably it
// simply calls f(i).  Verify against the full header.
12 template <
typename F,
typename N>
14 auto call_f_scalar_handler (
F const& f, N i, Gpu::Handler
const&)
15 noexcept ->
decltype(f(0))
// Dispatch helper: overload chosen when the functor accepts
// (index, Gpu::Handler) -- SFINAE on decltype(f(0,Gpu::Handler{})).
// The handler argument is forwarded to the functor.
// NOTE(review): lossy extraction -- the function body is missing from
// this chunk; verify against the full header.
20 template <
typename F,
typename N>
22 auto call_f_scalar_handler (
F const& f, N i, Gpu::Handler
const& handler)
23 noexcept ->
decltype(f(0,Gpu::Handler{}))
// 1D case: call a functor expecting three scalar indices (i,j,k,...) by
// padding the missing j and k dimensions with 0.  Selected via SFINAE on
// decltype(f(0, 0, 0, args...)).  The index_sequence parameter is unused
// in this overload.
// NOTE(review): enclosing braces are missing in this extraction.
30 template <
typename F, std::size_t...Ns,
class...Args>
32 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<1> iv, Args...args)
33 noexcept -> decltype(f(0, 0, 0, args...))
35 return f(iv[0], 0, 0, args...);
// 2D case: call a functor expecting three scalar indices by padding the
// missing k dimension with 0.  Selected via SFINAE on
// decltype(f(0, 0, 0, args...)).
// NOTE(review): enclosing braces are missing in this extraction.
38 template <
typename F, std::size_t...Ns,
class...Args>
40 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<2> iv, Args...args)
41 noexcept -> decltype(f(0, 0, 0, args...))
43 return f(iv[0], iv[1], 0, args...);
// Generic-dimension case: overload chosen when the functor accepts the
// IntVectND itself (SFINAE on decltype(f(iv, args...))); the whole index
// vector is passed through unchanged.
// NOTE(review): enclosing braces are missing in this extraction.
46 template <
typename F,
int dim, std::size_t...Ns,
class...Args>
48 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<dim> iv, Args...args)
49 noexcept -> decltype(f(iv, args...))
51 return f(iv, args...);
// Generic-dimension case: overload chosen when the functor accepts one
// scalar index per dimension -- the pack expansion iv[Ns]... turns the
// IntVectND into dim separate arguments (SFINAE on
// decltype(f(iv[Ns]..., args...))).
// NOTE(review): enclosing braces are missing in this extraction.
54 template <
typename F,
int dim, std::size_t...Ns,
class...Args>
56 auto call_f_intvect_inner (std::index_sequence<Ns...>,
F const& f, IntVectND<dim> iv, Args...args)
57 noexcept -> decltype(f(iv[Ns]..., args...))
59 return f(iv[Ns]..., args...);
// Entry point for calling a kernel functor with a cell index: builds an
// index_sequence over the dimensions and forwards to the appropriate
// call_f_intvect_inner overload (IntVect-taking or per-component).
// NOTE(review): enclosing braces are missing in this extraction.
64 template <
typename F,
int dim>
66 auto call_f_intvect (
F const& f, IntVectND<dim> iv)
67 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
69 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
// Same as call_f_intvect, but additionally passes a RandomEngine as the
// trailing argument (used by the *RNG launch paths).
// NOTE(review): enclosing braces are missing in this extraction.
74 template <
typename F,
int dim>
76 auto call_f_intvect_engine (
F const& f, IntVectND<dim> iv, RandomEngine engine)
77 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
79 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
// Handler-aware dispatch: overload chosen when the functor does NOT take
// a Gpu::Handler (the SFINAE return type omits it), so the handler
// argument is accepted and discarded.
// NOTE(review): enclosing braces are missing in this extraction.
84 template <
typename F,
int dim>
86 auto call_f_intvect_handler (
F const& f, IntVectND<dim> iv, Gpu::Handler
const&)
87 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
89 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
// Handler-aware dispatch: overload chosen when the functor DOES take a
// Gpu::Handler (SFINAE includes Gpu::Handler{} in the decltype); the
// caller's handler is forwarded as the trailing argument.
// NOTE(review): enclosing braces are missing in this extraction.
92 template <
typename F,
int dim>
94 auto call_f_intvect_handler (
F const& f, IntVectND<dim> iv, Gpu::Handler
const& handler)
95 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
97 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler)
// Multi-component variant: invokes the functor once per component
// n in [0, ncomp), appending the component index as the trailing
// argument of each call.
// NOTE(review): enclosing/loop-closing braces are missing in this
// extraction.
102 template <
typename F,
typename T,
int dim>
104 auto call_f_intvect_ncomp (
F const& f, IntVectND<dim> iv, T ncomp)
105 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
107 for (T n = 0; n < ncomp; ++n) {
108 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
// Multi-component + RNG variant: per component n in [0, ncomp), the
// functor is invoked with the component index followed by the
// RandomEngine.
// NOTE(review): enclosing/loop-closing braces are missing in this
// extraction.
114 template <
typename F,
typename T,
int dim>
116 auto call_f_intvect_ncomp_engine (
F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
117 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
119 for (T n = 0; n < ncomp; ++n) {
120 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
// Multi-component handler-aware dispatch: overload chosen when the
// functor does NOT take a Gpu::Handler; the handler argument is
// discarded and the functor is called once per component.
// NOTE(review): enclosing/loop-closing braces are missing in this
// extraction.
126 template <
typename F,
typename T,
int dim>
128 auto call_f_intvect_ncomp_handler (
F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler
const&)
129 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
131 for (T n = 0; n < ncomp; ++n) {
132 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
// Multi-component handler-aware dispatch: overload chosen when the
// functor DOES take a Gpu::Handler (SFINAE includes Gpu::Handler{});
// the caller's handler is forwarded on every per-component call.
// NOTE(review): enclosing/loop-closing braces are missing in this
// extraction.
136 template <
typename F,
typename T,
int dim>
138 auto call_f_intvect_ncomp_handler (
F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler
const& handler)
139 noexcept ->
decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
141 for (T n = 0; n < ncomp; ++n) {
142 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
154 detail::SyclKernelDevPtr<L> skdp(f, stream);
155 L
const* pf = skdp.template get<0>();
158 auto& q = *(stream.queue);
160 q.submit([&] (sycl::handler& h) {
161 if constexpr (detail::is_big_kernel<L>()) {
167 }
catch (sycl::exception
const& ex) {
168 amrex::Abort(std::string(
"single_task: ")+ex.what()+
"!!!!!");
173void launch (
int nblocks,
int nthreads_per_block, std::size_t shared_mem_bytes,
176 detail::SyclKernelDevPtr<L> skdp(f, stream);
177 L
const* pf = skdp.template get<0>();
180 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
181 const std::size_t shared_mem_numull = (shared_mem_bytes+
sizeof(
unsigned long long)-1)
182 /
sizeof(
unsigned long long);
183 auto& q = *(stream.queue);
185 q.submit([&] (sycl::handler& h) {
186 sycl::local_accessor<unsigned long long>
187 shared_data(sycl::range<1>(shared_mem_numull), h);
188 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
189 sycl::range<1>(nthreads_per_block)),
190 [=] (sycl::nd_item<1> item)
193 if constexpr (detail::is_big_kernel<L>()) {
194 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
196 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
200 }
catch (sycl::exception
const& ex) {
201 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
206void launch (
int nblocks,
int nthreads_per_block,
gpuStream_t stream, L
const& f)
noexcept
208 detail::SyclKernelDevPtr<L> skdp(f, stream);
209 L
const* pf = skdp.template get<0>();
212 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
213 auto& q = *(stream.queue);
215 q.submit([&] (sycl::handler& h) {
216 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
217 sycl::range<1>(nthreads_per_block)),
218 [=] (sycl::nd_item<1> item)
221 if constexpr (detail::is_big_kernel<L>()) {
228 }
catch (sycl::exception
const& ex) {
229 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
233template <
int MT,
typename L>
237 detail::SyclKernelDevPtr<L> skdp(f, stream);
238 L
const* pf = skdp.template get<0>();
241 const auto nthreads_total = MT * std::size_t(nblocks);
242 const std::size_t shared_mem_numull = (shared_mem_bytes+
sizeof(
unsigned long long)-1)
243 /
sizeof(
unsigned long long);
244 auto& q = *(stream.queue);
246 q.submit([&] (sycl::handler& h) {
247 sycl::local_accessor<unsigned long long>
248 shared_data(sycl::range<1>(shared_mem_numull), h);
249 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
251 [=] (sycl::nd_item<1> item)
252 [[sycl::reqd_work_group_size(MT)]]
255 if constexpr (detail::is_big_kernel<L>()) {
256 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
258 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get()});
262 }
catch (sycl::exception
const& ex) {
263 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
267template <
int MT,
typename L>
270 detail::SyclKernelDevPtr<L> skdp(f, stream);
271 L
const* pf = skdp.template get<0>();
274 const auto nthreads_total = MT * std::size_t(nblocks);
275 auto& q = *(stream.queue);
277 q.submit([&] (sycl::handler& h) {
278 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
280 [=] (sycl::nd_item<1> item)
281 [[sycl::reqd_work_group_size(MT)]]
284 if constexpr (detail::is_big_kernel<L>()) {
291 }
catch (sycl::exception
const& ex) {
292 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
296template<
int MT,
typename T,
typename L>
297void launch (T
const& n, L
const& f)
noexcept
302 L
const* pf = skdp.template get<0>();
305 const auto ec = Gpu::makeExecutionConfig<MT>(n);
306 const auto nthreads_per_block = ec.numThreads.x;
307 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
308 auto& q = Gpu::Device::streamQueue();
310 q.submit([&] (sycl::handler& h) {
311 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
312 sycl::range<1>(nthreads_per_block)),
313 [=] (sycl::nd_item<1> item)
314 [[sycl::reqd_work_group_size(MT)]]
317 for (
auto const i :
Gpu::
Range(n,item.get_global_id(0),item.get_global_range(0))) {
318 if constexpr (detail::is_big_kernel<L>()) {
326 }
catch (sycl::exception
const& ex) {
327 amrex::Abort(std::string(
"launch: ")+ex.what()+
"!!!!!");
331template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
332void ParallelFor (Gpu::KernelInfo
const& info, T n, L
const& f)
noexcept
337 L
const* pf = skdp.template get<0>();
340 const auto ec = Gpu::makeExecutionConfig<MT>(n);
341 const auto nthreads_per_block = ec.numThreads.x;
342 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
343 auto& q = Gpu::Device::streamQueue();
345 if (info.hasReduction()) {
346 q.submit([&] (sycl::handler& h) {
347 sycl::local_accessor<unsigned long long>
349 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
350 sycl::range<1>(nthreads_per_block)),
351 [=] (sycl::nd_item<1> item)
352 [[sycl::reqd_work_group_size(MT)]]
355 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
356 i < std::size_t(n); i += stride) {
357 int n_active_threads =
amrex::min(std::size_t(n)-i+item.get_local_id(0),
358 item.get_local_range(0));
359 if constexpr (detail::is_big_kernel<L>()) {
360 detail::call_f_scalar_handler(*pf, T(i),
361 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
364 detail::call_f_scalar_handler(f, T(i),
365 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
372 q.submit([&] (sycl::handler& h) {
373 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
374 sycl::range<1>(nthreads_per_block)),
375 [=] (sycl::nd_item<1> item)
376 [[sycl::reqd_work_group_size(MT)]]
379 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
380 i < std::size_t(n); i += stride) {
381 if constexpr (detail::is_big_kernel<L>()) {
382 detail::call_f_scalar_handler(*pf, T(i), Gpu::Handler{&item});
384 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
390 }
catch (sycl::exception
const& ex) {
391 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
395template <
int MT,
typename L,
int dim>
396void ParallelFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L
const& f)
noexcept
401 L
const* pf = skdp.template get<0>();
404 const BoxIndexerND<dim> indexer(box);
405 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
406 const auto nthreads_per_block = ec.numThreads.x;
407 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
408 auto& q = Gpu::Device::streamQueue();
410 if (info.hasReduction()) {
411 q.submit([&] (sycl::handler& h) {
412 sycl::local_accessor<unsigned long long>
414 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
415 sycl::range<1>(nthreads_per_block)),
416 [=] (sycl::nd_item<1> item)
417 [[sycl::reqd_work_group_size(MT)]]
420 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
421 icell < indexer.numPts(); icell += stride) {
422 auto iv = indexer.intVect(icell);
423 int n_active_threads =
amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
424 std::uint64_t(item.get_local_range(0)));
425 if constexpr (detail::is_big_kernel<L>()) {
426 detail::call_f_intvect_handler(*pf,
427 iv, Gpu::Handler{&item,
428 shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
431 detail::call_f_intvect_handler(f,
432 iv, Gpu::Handler{&item,
433 shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
440 q.submit([&] (sycl::handler& h) {
441 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
442 sycl::range<1>(nthreads_per_block)),
443 [=] (sycl::nd_item<1> item)
444 [[sycl::reqd_work_group_size(MT)]]
447 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
448 icell < indexer.numPts(); icell += stride) {
449 auto iv = indexer.intVect(icell);
450 if constexpr (detail::is_big_kernel<L>()) {
451 detail::call_f_intvect_handler(*pf,iv,Gpu::Handler{&item});
453 detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
459 }
catch (sycl::exception
const& ex) {
460 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
464template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
465void ParallelFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L
const& f)
noexcept
470 L
const* pf = skdp.template get<0>();
473 const BoxIndexerND<dim> indexer(box);
474 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
475 const auto nthreads_per_block = ec.numThreads.x;
476 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
477 auto& q = Gpu::Device::streamQueue();
479 if (info.hasReduction()) {
480 q.submit([&] (sycl::handler& h) {
481 sycl::local_accessor<unsigned long long>
483 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
484 sycl::range<1>(nthreads_per_block)),
485 [=] (sycl::nd_item<1> item)
486 [[sycl::reqd_work_group_size(MT)]]
489 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
490 icell < indexer.numPts(); icell += stride) {
491 auto iv = indexer.intVect(icell);
492 int n_active_threads =
amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
493 std::uint64_t(item.get_local_range(0)));
494 if constexpr (detail::is_big_kernel<L>()) {
495 detail::call_f_intvect_ncomp_handler(*pf, iv, ncomp,
496 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
499 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
500 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().
get(),
507 q.submit([&] (sycl::handler& h) {
508 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
509 sycl::range<1>(nthreads_per_block)),
510 [=] (sycl::nd_item<1> item)
511 [[sycl::reqd_work_group_size(MT)]]
514 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
515 icell < indexer.numPts(); icell += stride) {
516 auto iv = indexer.intVect(icell);
517 if constexpr (detail::is_big_kernel<L>()) {
518 detail::call_f_intvect_ncomp_handler(*pf,iv,ncomp,Gpu::Handler{&item});
520 detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
526 }
catch (sycl::exception
const& ex) {
527 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
531template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
537 L
const* pf = skdp.template get<0>();
540 const auto ec = Gpu::ExecutionConfig(n);
541 const auto nthreads_per_block = ec.numThreads.x;
543 auto& q = Gpu::Device::streamQueue();
544 auto& engdescr = *(getRandEngineDescriptor());
546 q.submit([&] (sycl::handler& h) {
547 auto engine_acc = engdescr.get_access(h);
548 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549 sycl::range<1>(nthreads_per_block)),
550 [=] (sycl::nd_item<1> item)
551 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
554 auto const tid = item.get_global_id(0);
555 auto engine = engine_acc.load(tid);
556 RandomEngine rand_eng{&engine};
557 for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
558 if constexpr (detail::is_big_kernel<L>()) {
559 (*pf)(T(i),rand_eng);
564 engine_acc.store(engine, tid);
568 }
catch (sycl::exception
const& ex) {
569 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
573template <
typename L,
int dim>
579 L
const* pf = skdp.template get<0>();
582 const BoxIndexerND<dim> indexer(box);
583 const auto ec = Gpu::ExecutionConfig(box.numPts());
584 const auto nthreads_per_block = ec.numThreads.x;
586 auto& q = Gpu::Device::streamQueue();
587 auto& engdescr = *(getRandEngineDescriptor());
589 q.submit([&] (sycl::handler& h) {
590 auto engine_acc = engdescr.get_access(h);
591 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
592 sycl::range<1>(nthreads_per_block)),
593 [=] (sycl::nd_item<1> item)
594 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
597 auto const tid = item.get_global_id(0);
598 auto engine = engine_acc.load(tid);
599 RandomEngine rand_eng{&engine};
600 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
601 icell < indexer.numPts(); icell += stride) {
602 auto iv = indexer.intVect(icell);
603 if constexpr (detail::is_big_kernel<L>()) {
604 detail::call_f_intvect_engine(*pf,iv,rand_eng);
606 detail::call_f_intvect_engine(f,iv,rand_eng);
609 engine_acc.store(engine, tid);
613 }
catch (sycl::exception
const& ex) {
614 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
618template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
619void ParallelForRNG (BoxND<dim>
const& box, T ncomp, L
const& f)
noexcept
624 L
const* pf = skdp.template get<0>();
627 const BoxIndexerND<dim> indexer(box);
628 const auto ec = Gpu::ExecutionConfig(box.numPts());
629 const auto nthreads_per_block = ec.numThreads.x;
631 auto& q = Gpu::Device::streamQueue();
632 auto& engdescr = *(getRandEngineDescriptor());
634 q.submit([&] (sycl::handler& h) {
635 auto engine_acc = engdescr.get_access(h);
636 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
637 sycl::range<1>(nthreads_per_block)),
638 [=] (sycl::nd_item<1> item)
639 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
642 auto const tid = item.get_global_id(0);
643 auto engine = engine_acc.load(tid);
644 RandomEngine rand_eng{&engine};
645 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
646 icell < indexer.numPts(); icell += stride) {
647 auto iv = indexer.intVect(icell);
648 if constexpr (detail::is_big_kernel<L>()) {
649 detail::call_f_intvect_ncomp_engine(*pf,iv,ncomp,rand_eng);
651 detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
654 engine_acc.store(engine, tid);
658 }
catch (sycl::exception
const& ex) {
659 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
663template <
int MT,
typename L1,
typename L2,
int dim>
664void ParallelFor (Gpu::KernelInfo
const& , BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1
const& f1, L2
const& f2)
noexcept
669 L1
const* pf1 = skdp.template get<0>();
670 L2
const* pf2 = skdp.template get<1>();
673 const BoxIndexerND<dim> indexer1(box1);
674 const BoxIndexerND<dim> indexer2(box2);
675 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
676 const auto nthreads_per_block = ec.numThreads.x;
677 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
678 auto& q = Gpu::Device::streamQueue();
680 q.submit([&] (sycl::handler& h) {
681 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
682 sycl::range<1>(nthreads_per_block)),
683 [=] (sycl::nd_item<1> item)
684 [[sycl::reqd_work_group_size(MT)]]
687 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
688 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
689 icell < ncells; icell += stride) {
690 if (icell < indexer1.numPts()) {
691 auto iv = indexer1.intVect(icell);
692 if constexpr (detail::is_big_kernel<L1,L2>()) {
693 detail::call_f_intvect(*pf1,iv);
695 detail::call_f_intvect(f1,iv);
698 if (icell < indexer2.numPts()) {
699 auto iv = indexer2.intVect(icell);
700 if constexpr (detail::is_big_kernel<L1,L2>()) {
701 detail::call_f_intvect(*pf2,iv);
703 detail::call_f_intvect(f2,iv);
709 }
catch (sycl::exception
const& ex) {
710 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
714template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
716 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
717 L1
const& f1, L2
const& f2, L3
const& f3)
noexcept
721 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3,
Gpu::gpuStream());
722 L1
const* pf1 = skdp.template get<0>();
723 L2
const* pf2 = skdp.template get<1>();
724 L3
const* pf3 = skdp.template get<2>();
727 const BoxIndexerND<dim> indexer1(box1);
728 const BoxIndexerND<dim> indexer2(box2);
729 const BoxIndexerND<dim> indexer3(box3);
730 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
731 const auto nthreads_per_block = ec.numThreads.x;
732 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
733 auto& q = Gpu::Device::streamQueue();
735 q.submit([&] (sycl::handler& h) {
736 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
737 sycl::range<1>(nthreads_per_block)),
738 [=] (sycl::nd_item<1> item)
739 [[sycl::reqd_work_group_size(MT)]]
742 auto const ncells =
amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
743 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
744 icell < ncells; icell += stride) {
745 if (icell < indexer1.numPts()) {
746 auto iv = indexer1.intVect(icell);
747 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
748 detail::call_f_intvect(*pf1,iv);
750 detail::call_f_intvect(f1,iv);
753 if (icell < indexer2.numPts()) {
754 auto iv = indexer2.intVect(icell);
755 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
756 detail::call_f_intvect(*pf2,iv);
758 detail::call_f_intvect(f2,iv);
761 if (icell < indexer3.numPts()) {
762 auto iv = indexer3.intVect(icell);
763 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
764 detail::call_f_intvect(*pf3,iv);
766 detail::call_f_intvect(f3,iv);
772 }
catch (sycl::exception
const& ex) {
773 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
777template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
778 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
779 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
781 BoxND<dim>
const& box1, T1 ncomp1, L1
const& f1,
782 BoxND<dim>
const& box2, T2 ncomp2, L2
const& f2)
noexcept
787 L1
const* pf1 = skdp.template get<0>();
788 L2
const* pf2 = skdp.template get<1>();
791 const BoxIndexerND<dim> indexer1(box1);
792 const BoxIndexerND<dim> indexer2(box2);
793 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
794 const auto nthreads_per_block = ec.numThreads.x;
795 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
796 auto& q = Gpu::Device::streamQueue();
798 q.submit([&] (sycl::handler& h) {
799 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
800 sycl::range<1>(nthreads_per_block)),
801 [=] (sycl::nd_item<1> item)
802 [[sycl::reqd_work_group_size(MT)]]
805 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
806 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
807 icell < ncells; icell += stride) {
808 if (icell < indexer1.numPts()) {
809 auto iv = indexer1.intVect(icell);
810 if constexpr (detail::is_big_kernel<L1,L2>()) {
811 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
813 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
816 if (icell < indexer2.numPts()) {
817 auto iv = indexer2.intVect(icell);
818 if constexpr (detail::is_big_kernel<L1,L2>()) {
819 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
821 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
827 }
catch (sycl::exception
const& ex) {
828 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
832template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
833 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
834 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
835 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
837 BoxND<dim>
const& box1, T1 ncomp1, L1
const& f1,
838 BoxND<dim>
const& box2, T2 ncomp2, L2
const& f2,
839 BoxND<dim>
const& box3, T3 ncomp3, L3
const& f3)
noexcept
843 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3,
Gpu::gpuStream());
844 L1
const* pf1 = skdp.template get<0>();
845 L2
const* pf2 = skdp.template get<1>();
846 L3
const* pf3 = skdp.template get<2>();
849 const BoxIndexerND<dim> indexer1(box1);
850 const BoxIndexerND<dim> indexer2(box2);
851 const BoxIndexerND<dim> indexer3(box3);
852 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
853 const auto nthreads_per_block = ec.numThreads.x;
854 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
855 auto& q = Gpu::Device::streamQueue();
857 q.submit([&] (sycl::handler& h) {
858 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
859 sycl::range<1>(nthreads_per_block)),
860 [=] (sycl::nd_item<1> item)
861 [[sycl::reqd_work_group_size(MT)]]
864 auto const ncells =
amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
865 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
866 icell < ncells; icell += stride) {
867 if (icell < indexer1.numPts()) {
868 auto iv = indexer1.intVect(icell);
869 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
870 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
872 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
875 if (icell < indexer2.numPts()) {
876 auto iv = indexer2.intVect(icell);
877 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
878 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
880 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
883 if (icell < indexer3.numPts()) {
884 auto iv = indexer3.intVect(icell);
885 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
886 detail::call_f_intvect_ncomp(*pf3,iv,ncomp3);
888 detail::call_f_intvect_ncomp(f3,iv,ncomp3);
894 }
catch (sycl::exception
const& ex) {
895 amrex::Abort(std::string(
"ParallelFor: ")+ex.what()+
"!!!!!");
909template <
int MT,
typename L>
917template <
int MT,
typename L>
925void launch (
int nblocks,
int nthreads_per_block, std::size_t shared_mem_bytes,
935 launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
938template<
int MT,
typename T,
typename L, std::enable_if_t<std::is_
integral_v<T>,
int> FOO = 0>
939void launch (T
const& n, L
const& f)
noexcept
941 static_assert(
sizeof(T) >= 2);
943 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
944 for (
auto const& ec : nec) {
945 const T start_idx = T(ec.start_idx);
946 const T nleft = n - start_idx;
950 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
959template<
int MT,
int dim,
typename L>
962 if (box.isEmpty()) {
return; }
963 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
965 const auto type = box.ixType();
966 for (
auto const& ec : nec) {
967 const auto start_idx = std::uint64_t(ec.start_idx);
970 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
971 if (icell < indexer.
numPts()) {
972 auto iv = indexer.
intVect(icell);
984template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
985std::enable_if_t<MaybeDeviceRunnable<L>::value>
988 static_assert(
sizeof(T) >= 2);
990 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
991 for (
auto const& ec : nec) {
992 const T start_idx = T(ec.start_idx);
993 const T nleft = n - start_idx;
997 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
999 detail::call_f_scalar_handler(f, tid+start_idx,
1001 (std::uint64_t)MT)));
1012template <
int MT,
typename L,
int dim>
1013std::enable_if_t<MaybeDeviceRunnable<L>::value>
1018 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1019 for (
auto const& ec : nec) {
1020 const auto start_idx = std::uint64_t(ec.start_idx);
1023 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1024 if (icell < indexer.
numPts()) {
1025 auto iv = indexer.
intVect(icell);
1026 detail::call_f_intvect_handler(f, iv,
1028 (std::uint64_t)MT)));
1039template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1040std::enable_if_t<MaybeDeviceRunnable<L>::value>
1045 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1046 for (
auto const& ec : nec) {
1047 const auto start_idx = std::uint64_t(ec.start_idx);
1050 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1051 if (icell < indexer.
numPts()) {
1052 auto iv = indexer.
intVect(icell);
1053 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
1055 (std::uint64_t)MT)));
1067template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1068std::enable_if_t<MaybeDeviceRunnable<L>::value>
1078 Long tid =
Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1080 for (
Long i = tid, stride =
Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i <
Long(n); i += stride) {
1093template <
typename L,
int dim>
1094std::enable_if_t<MaybeDeviceRunnable<L>::value>
1105 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1107 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.
numPts(); icell += stride) {
1108 auto iv = indexer.
intVect(icell);
1109 detail::call_f_intvect_engine(f, iv, engine);
1121template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1122std::enable_if_t<MaybeDeviceRunnable<L>::value>
1133 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1135 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.
numPts(); icell += stride) {
1136 auto iv = indexer.
intVect(icell);
1137 detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
1148template <
int MT,
typename L1,
typename L2,
int dim>
1149std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1156 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1159 auto const ncells = std::max(indexer1.
numPts(), indexer2.
numPts());
1160 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1161 icell < ncells; icell += stride) {
1162 if (icell < indexer1.
numPts()) {
1163 auto iv = indexer1.
intVect(icell);
1164 detail::call_f_intvect(f1, iv);
1166 if (icell < indexer2.
numPts()) {
1167 auto iv = indexer2.
intVect(icell);
1168 detail::call_f_intvect(f2, iv);
1179template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1180std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1183 L1&& f1, L2&& f2, L3&& f3)
noexcept
1189 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1192 auto const ncells = std::max({indexer1.
numPts(), indexer2.
numPts(), indexer3.
numPts()});
1193 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1194 icell < ncells; icell += stride) {
1195 if (icell < indexer1.
numPts()) {
1196 auto iv = indexer1.
intVect(icell);
1197 detail::call_f_intvect(f1, iv);
1199 if (icell < indexer2.
numPts()) {
1200 auto iv = indexer2.
intVect(icell);
1201 detail::call_f_intvect(f2, iv);
1203 if (icell < indexer3.
numPts()) {
1204 auto iv = indexer3.
intVect(icell);
1205 detail::call_f_intvect(f3, iv);
1216template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1217 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1218 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1219std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1222 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
noexcept
1227 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1230 auto const ncells = std::max(indexer1.
numPts(), indexer2.
numPts());
1231 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1232 icell < ncells; icell += stride) {
1233 if (icell < indexer1.
numPts()) {
1234 auto iv = indexer1.
intVect(icell);
1235 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1237 if (icell < indexer2.
numPts()) {
1238 auto iv = indexer2.
intVect(icell);
1239 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1250template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1251 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1252 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1253 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1254std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1258 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
noexcept
1264 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1267 auto const ncells = std::max({indexer1.
numPts(), indexer2.
numPts(), indexer3.
numPts()});
1268 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1269 icell < ncells; icell += stride) {
1270 if (icell < indexer1.
numPts()) {
1271 auto iv = indexer1.
intVect(icell);
1272 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1274 if (icell < indexer2.
numPts()) {
1275 auto iv = indexer2.
intVect(icell);
1276 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1278 if (icell < indexer3.
numPts()) {
1279 auto iv = indexer3.
intVect(icell);
1280 detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1289template <
typename L>
1295template<
typename T,
typename L>
1296void launch (T
const& n, L&& f)
noexcept
1298 launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1305template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1306std::enable_if_t<MaybeDeviceRunnable<L>::value>
1309 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1316template <
typename L,
int dim>
1317std::enable_if_t<MaybeDeviceRunnable<L>::value>
1320 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1327template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1328std::enable_if_t<MaybeDeviceRunnable<L>::value>
1331 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1338template <
typename L1,
typename L2,
int dim>
1339std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1343 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1344 std::forward<L2>(f2));
1351template <
typename L1,
typename L2,
typename L3,
int dim>
1352std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1355 L1&& f1, L2&& f2, L3&& f3)
noexcept
1357 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1358 std::forward<L2>(f2), std::forward<L3>(f3));
1365template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1366 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1367 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1368std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1371 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
noexcept
1373 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1374 box2, ncomp2, std::forward<L2>(f2));
1381template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1382 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1383 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1384 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1385std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1389 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
noexcept
1391 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1392 box2, ncomp2, std::forward<L2>(f2),
1393 box3, ncomp3, std::forward<L3>(f3));
1396template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1397void For (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
1399 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1402template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1403void For (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
1405 ParallelFor<MT>(info, n,std::forward<L>(f));
1408template <
typename L,
int dim>
1409void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
1411 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1414template <
int MT,
typename L,
int dim>
1415void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
1417 ParallelFor<MT>(info, box,std::forward<L>(f));
1420template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1421void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1423 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1426template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1427void For (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1429 ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1432template <
typename L1,
typename L2,
int dim>
1433void For (Gpu::KernelInfo
const& info,
1434 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1436 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1439template <
int MT,
typename L1,
typename L2,
int dim>
1440void For (Gpu::KernelInfo
const& info,
1441 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1443 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1446template <
typename L1,
typename L2,
typename L3,
int dim>
1447void For (Gpu::KernelInfo
const& info,
1448 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1449 L1&& f1, L2&& f2, L3&& f3)
noexcept
1451 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1454template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1455void For (Gpu::KernelInfo
const& info,
1456 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1457 L1&& f1, L2&& f2, L3&& f3)
noexcept
1459 ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1462template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1463 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1464 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1465void For (Gpu::KernelInfo
const& info,
1466 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1467 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1469 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1472template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1473 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1474 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1475void For (Gpu::KernelInfo
const& info,
1476 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1477 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1479 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1482template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1483 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1484 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1485 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1486void For (Gpu::KernelInfo
const& info,
1487 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1488 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1489 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1491 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1492 box1,ncomp1,std::forward<L1>(f1),
1493 box2,ncomp2,std::forward<L2>(f2),
1494 box3,ncomp3,std::forward<L3>(f3));
1497template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1498 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1499 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1500 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1501void For (Gpu::KernelInfo
const& info,
1502 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1503 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1504 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1506 ParallelFor<MT>(info,
1507 box1,ncomp1,std::forward<L1>(f1),
1508 box2,ncomp2,std::forward<L2>(f2),
1509 box3,ncomp3,std::forward<L3>(f3));
1516template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1519 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n, std::forward<L>(f));
1526template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1529 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1536template <
typename L,
int dim>
1539 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, box, std::forward<L>(f));
1546template <
int MT,
typename L,
int dim>
1547void ParallelFor (BoxND<dim>
const& box, L&& f)
noexcept
1549 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1556template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1559 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1566template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1567void ParallelFor (BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1569 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1576template <
typename L1,
typename L2,
int dim>
1577void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1579 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1586template <
int MT,
typename L1,
typename L2,
int dim>
1587void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1589 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1596template <
typename L1,
typename L2,
typename L3,
int dim>
1597void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1598 L1&& f1, L2&& f2, L3&& f3)
noexcept
1600 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1607template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1608void ParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1609 L1&& f1, L2&& f2, L3&& f3)
noexcept
1611 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1618template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1619 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1620 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1621void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1622 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1624 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1631template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1632 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1633 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1634void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1635 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1637 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1644template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1645 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1646 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1647 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1648void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1649 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1650 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1652 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1653 box1,ncomp1,std::forward<L1>(f1),
1654 box2,ncomp2,std::forward<L2>(f2),
1655 box3,ncomp3,std::forward<L3>(f3));
1662template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1663 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1664 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1665 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1666void ParallelFor (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1667 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1668 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1670 ParallelFor<MT>(Gpu::KernelInfo{},
1671 box1,ncomp1,std::forward<L1>(f1),
1672 box2,ncomp2,std::forward<L2>(f2),
1673 box3,ncomp3,std::forward<L3>(f3));
1676template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1679 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n,std::forward<L>(f));
1682template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1683void For (T n, L&& f)
noexcept
1685 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1688template <
typename L,
int dim>
1691 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, box,std::forward<L>(f));
1694template <
int MT,
typename L,
int dim>
1695void For (BoxND<dim>
const& box, L&& f)
noexcept
1697 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1700template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1703 ParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1706template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1707void For (BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
1709 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1712template <
typename L1,
typename L2,
int dim>
1713void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1715 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1718template <
int MT,
typename L1,
typename L2,
int dim>
1719void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
1721 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1724template <
typename L1,
typename L2,
typename L3,
int dim>
1725void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1726 L1&& f1, L2&& f2, L3&& f3)
noexcept
1728 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1731template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1732void For (BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
1733 L1&& f1, L2&& f2, L3&& f3)
noexcept
1735 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1738template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1739 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1740 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1741void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1742 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1744 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1747template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1748 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1749 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1750void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1751 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
1753 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1756template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1757 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1758 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1759 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1760void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1761 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1762 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1764 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1765 box1,ncomp1,std::forward<L1>(f1),
1766 box2,ncomp2,std::forward<L2>(f2),
1767 box3,ncomp3,std::forward<L3>(f3));
1770template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1771 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1772 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1773 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1774void For (BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
1775 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
1776 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
1778 ParallelFor<MT>(Gpu::KernelInfo{},
1779 box1,ncomp1,std::forward<L1>(f1),
1780 box2,ncomp2,std::forward<L2>(f2),
1781 box3,ncomp3,std::forward<L3>(f3));
1784template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1785std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1789 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1791#ifdef AMREX_USE_SYCL
1792 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1795 for (T i = 0; i < n; ++i) { f(i); }
1800template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1801std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1805 ParallelFor<MT>(info,n,std::forward<L>(f));
1807#ifdef AMREX_USE_SYCL
1808 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1811 for (T i = 0; i < n; ++i) { f(i); }
1816template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1817std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1820 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(
Gpu::KernelInfo{}, n, std::forward<L>(f));
1823template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1824std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1830template <
typename L,
int dim>
1831std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1835 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1837#ifdef AMREX_USE_SYCL
1838 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1845template <
int MT,
typename L,
int dim>
1846std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1850 ParallelFor<MT>(info, box,std::forward<L>(f));
1852#ifdef AMREX_USE_SYCL
1853 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1860template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1861std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1865 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1867#ifdef AMREX_USE_SYCL
1868 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1875template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
1876std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1880 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1882#ifdef AMREX_USE_SYCL
1883 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1890template <
typename L1,
typename L2,
int dim>
1891std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1896 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1898#ifdef AMREX_USE_SYCL
1899 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1907template <
int MT,
typename L1,
typename L2,
int dim>
1908std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1913 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1915#ifdef AMREX_USE_SYCL
1916 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1924template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
1925std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1928 L1&& f1, L2&& f2, L3&& f3)
noexcept
1931 ParallelFor<MT>(info,box1,box2,box3,
1932 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1934#ifdef AMREX_USE_SYCL
1935 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1944template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1945 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1946 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1947std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1950 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
noexcept
1953 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1955#ifdef AMREX_USE_SYCL
1956 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1964template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
1965 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1966 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1967std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1970 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
noexcept
1973 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1975#ifdef AMREX_USE_SYCL
1976 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1984template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
1985 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1986 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1987 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1988std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1992 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
noexcept
1995 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1996 box1,ncomp1,std::forward<L1>(f1),
1997 box2,ncomp2,std::forward<L2>(f2),
1998 box3,ncomp3,std::forward<L3>(f3));
2000#ifdef AMREX_USE_SYCL
2001 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
2010template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
2011 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2012 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2013 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2014std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
2018 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
noexcept
2021 ParallelFor<MT>(info,
2022 box1,ncomp1,std::forward<L1>(f1),
2023 box2,ncomp2,std::forward<L2>(f2),
2024 box3,ncomp3,std::forward<L3>(f3));
2026#ifdef AMREX_USE_SYCL
2027 amrex::Abort(
"amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
2036template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2037void HostDeviceFor (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
2039 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
2042template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2043void HostDeviceFor (Gpu::KernelInfo
const& info, T n, L&& f)
noexcept
2045 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
2048template <
typename L,
int dim>
2049void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
2051 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
2054template <
int MT,
typename L,
int dim>
2055void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, L&& f)
noexcept
2057 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
2060template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2061void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
2063 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
2066template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2067void HostDeviceFor (Gpu::KernelInfo
const& info, BoxND<dim>
const& box, T ncomp, L&& f)
noexcept
2069 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
2072template <
typename L1,
typename L2,
int dim>
2074 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2076 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2079template <
int MT,
typename L1,
typename L2,
int dim>
2081 BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2083 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2086template <
typename L1,
typename L2,
typename L3,
int dim>
2088 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
2089 L1&& f1, L2&& f2, L3&& f3)
noexcept
2091 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
2092 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2095template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
2097 BoxND<dim>
const& box1, BoxND<dim>
const& box2, BoxND<dim>
const& box3,
2098 L1&& f1, L2&& f2, L3&& f3)
noexcept
2100 HostDeviceParallelFor<MT>(info, box1,box2,box3,
2101 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2104template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
2105 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2106 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2108 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2109 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2111 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2114template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
2115 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2116 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2118 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2119 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2121 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2124template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
2125 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2126 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2127 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2129 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2130 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2131 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2133 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
2134 box1,ncomp1,std::forward<L1>(f1),
2135 box2,ncomp2,std::forward<L2>(f2),
2136 box3,ncomp3,std::forward<L3>(f3));
2139template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
2140 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2141 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2142 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2144 BoxND<dim>
const& box1, T1 ncomp1, L1&& f1,
2145 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2146 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2148 HostDeviceParallelFor<MT>(info,
2149 box1,ncomp1,std::forward<L1>(f1),
2150 box2,ncomp2,std::forward<L2>(f2),
2151 box3,ncomp3,std::forward<L3>(f3));
2154template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2157 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
2160template <
int MT,
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2163 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
2166template <
typename L,
int dim>
2169 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
2172template <
int MT,
typename L,
int dim>
2175 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
2178template <
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2181 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2184template <
int MT,
typename T,
typename L,
int dim,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
2187 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2190template <
typename L1,
typename L2,
int dim>
2191void HostDeviceParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2193 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2196template <
int MT,
typename L1,
typename L2,
int dim>
2197void HostDeviceParallelFor (BoxND<dim>
const& box1, BoxND<dim>
const& box2, L1&& f1, L2&& f2)
noexcept
2199 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2202template <
typename L1,
typename L2,
typename L3,
int dim>
2204 L1&& f1, L2&& f2, L3&& f3)
noexcept
2206 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
2207 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2210template <
int MT,
typename L1,
typename L2,
typename L3,
int dim>
2212 L1&& f1, L2&& f2, L3&& f3)
noexcept
2214 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
2215 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2218template <
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
2219 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2220 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2222 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2224 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2227template <
int MT,
typename T1,
typename T2,
typename L1,
typename L2,
int dim,
2228 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2229 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2231 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2)
noexcept
2233 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2236template <
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
2237 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2238 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2239 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2241 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2242 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2244 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
2245 box1,ncomp1,std::forward<L1>(f1),
2246 box2,ncomp2,std::forward<L2>(f2),
2247 box3,ncomp3,std::forward<L3>(f3));
2250template <
int MT,
typename T1,
typename T2,
typename T3,
typename L1,
typename L2,
typename L3,
int dim,
2251 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2252 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2253 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2255 BoxND<dim>
const& box2, T2 ncomp2, L2&& f2,
2256 BoxND<dim>
const& box3, T3 ncomp3, L3&& f3)
noexcept
2258 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
2259 box1,ncomp1,std::forward<L1>(f1),
2260 box2,ncomp2,std::forward<L2>(f2),
2261 box3,ncomp3,std::forward<L3>(f3));
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:36
#define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:38
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:200
static constexpr int warp_size
Definition AMReX_GpuDevice.H:197
Definition AMReX_GpuKernelInfo.H:8
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:127
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:92
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
randState_t * getRandState()
Definition AMReX_RandomEngine.H:65
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24
void launch(T const &n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:122
void HostDeviceFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:879
void HostDeviceParallelFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:766
bool isEmpty(T n) noexcept
Definition AMReX_GpuRange.H:14
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:44
void single_task(L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1319
curandState_t randState_t
Definition AMReX_RandomEngine.H:58
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
AMREX_ATTRIBUTE_FLATTEN_FOR void ParallelForRNG(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1231
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1230
Definition AMReX_Box.H:2152
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2169
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2193
Definition AMReX_GpuLaunch.H:119
Definition AMReX_GpuTypes.H:86
Definition AMReX_RandomEngine.H:72