#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
#include <AMReX_Config.H>

namespace amrex {

namespace detail {
// Dispatch helpers: each overload participates in overload resolution only
// when the user's functor f is callable with the corresponding signature.

// Selected when f takes just the linear index.
template <typename F, typename N>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
    noexcept -> decltype(f(0))
{
    f(i);
}

// Selected when f also accepts a Gpu::Handler (e.g., for reductions).
template <typename F, typename N>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
    noexcept -> decltype(f(0,Gpu::Handler{}))
{
    f(i, handler);
}
// 1D and 2D boxes: pad the missing j and/or k indices with zeros so that a
// three-index functor works for any dimensionality.
template <typename F, std::size_t...Ns, class...Args>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
    noexcept -> decltype(f(0, 0, 0, args...))
{
    return f(iv[0], 0, 0, args...);
}

template <typename F, std::size_t...Ns, class...Args>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
    noexcept -> decltype(f(0, 0, 0, args...))
{
    return f(iv[0], iv[1], 0, args...);
}

// Selected when f takes the IntVectND itself.
template <typename F, int dim, std::size_t...Ns, class...Args>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
    noexcept -> decltype(f(iv, args...))
{
    return f(iv, args...);
}

// Selected when f takes dim separate integer indices.
template <typename F, int dim, std::size_t...Ns, class...Args>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
    noexcept -> decltype(f(iv[Ns]..., args...))
{
    return f(iv[Ns]..., args...);
}
template <typename F, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect (F const& f, IntVectND<dim> iv)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
{
    return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
}

template <typename F, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
{
    return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
}

// Handler-less functor: the Gpu::Handler argument is accepted and dropped.
template <typename F, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
{
    return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
}

template <typename F, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
{
    return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
}
// Component loops: run f once per component n in [0, ncomp).
template <typename F, typename T, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }
}

template <typename F, typename T, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
    }
}

template <typename F, typename T, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }
}

template <typename F, typename T, int dim>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
    }
}

} // namespace detail
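
// The overload set above is what lets the launchers below accept lambdas of
// several shapes. A sketch of two equivalent 3D calls (`bx` is a Box and `a`
// a hypothetical Array4<Real>); both resolve through call_f_intvect_inner:
//
//   amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
//   { a(i,j,k) = 0.0; });
//
//   amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (amrex::IntVect const& iv) noexcept
//   { a(iv[0],iv[1],iv[2]) = 0.0; });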
#ifdef AMREX_USE_SYCL

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.single_task([=] () { f(); });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
    }
}
template <typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    const std::size_t shared_mem_numull =
        (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            {
                f(Gpu::Handler{&item});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    const std::size_t shared_mem_numull =
        (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                f(Gpu::Handler{&item});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template<int MT, typename T, typename L>
void launch (T const& n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
                    f(i);
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            // Reductions need block-shared scratch space for the functor.
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
                                                          item.get_local_range(0));
                        detail::call_f_scalar_handler(f, T(i),
                            Gpu::Handler{&item,
                                         shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_handler(f, iv,
                            Gpu::Handler{&item,
                                         shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                            Gpu::Handler{&item,
                                         shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::ExecutionConfig(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * std::min(std::size_t(ec.numBlocks.x), std::size_t(Gpu::Device::maxBlocksPerLaunch()));
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::size_t i = tid, stride = item.get_global_range(0);
                     i < std::size_t(n); i += stride) {
                    f(T(i), rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        Gpu::streamSynchronize(); // protect the shared RNG state pool
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * std::min(std::size_t(ec.numBlocks.x), std::size_t(Gpu::Device::maxBlocksPerLaunch()));
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_engine(f,iv,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        Gpu::streamSynchronize();
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * std::min(std::size_t(ec.numBlocks.x), std::size_t(Gpu::Device::maxBlocksPerLaunch()));
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        Gpu::streamSynchronize();
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L1, typename L2, int dim>
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect(f3,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect_ncomp(f3,iv,ncomp3);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
#else
// CUDA and HIP implementations

template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
    [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    launch<MT>(nblocks, 0, stream, f);
}

template <typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes, stream,
    [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
{
    launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}
template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            // This cannot overflow because of the static_assert above.
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) { f(tid+start_idx); }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    const BoxIndexerND<dim> indexer(box);
    const auto type = box.ixType();
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                f(BoxND<dim>(iv,iv,type));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) {
                detail::call_f_scalar_handler(f, tid+start_idx,
                    Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_handler(f, iv,
                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(n);
    AMREX_LAUNCH_KERNEL_NOBOUND(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                                ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
            f(T(i), engine);
        }
    });
    Gpu::streamSynchronize(); // protect the shared RNG state pool
    AMREX_GPU_ERROR_CHECK();
}
template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL_NOBOUND(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                                ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_engine(f, iv, engine);
        }
    });
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL_NOBOUND(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                                ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
        }
    });
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect(f3, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect_ncomp(f3, iv, ncomp3);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

#endif
// Wrappers common to the SYCL and CUDA/HIP implementations above.

template <typename L>
void single_task (L&& f) noexcept
{
    single_task(Gpu::gpuStream(), std::forward<L>(f));
}

template<typename T, typename L>
void launch (T const& n, L&& f) noexcept
{
    launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
                                       std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
                                       std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}
// `For` is a thin alias family for `ParallelFor` with the same overloads.

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<MT>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3,
                                       std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(info, box1, box2, box3,
                    std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(info,
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
// Convenience overloads without a Gpu::KernelInfo argument.

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (T n, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2,
                                       std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                       std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                    std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (T n, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2,
                                       std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                       std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                    std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
// Host/device versions: run on the GPU inside a launch region, otherwise
// fall back to a serial CPU loop.

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        AMREX_PRAGMA_SIMD
        for (T i = 0; i < n; ++i) { f(i); }
#endif
    }
}
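
// A minimal usage sketch: the lambda must compile for both host and device,
// so it is marked AMREX_GPU_HOST_DEVICE; inside a launch region it runs on
// the GPU, otherwise through the serial loop above (`N`, `p` illustrative):
//
//   amrex::HostDeviceParallelFor(Gpu::KernelInfo{}, N,
//       [=] AMREX_GPU_HOST_DEVICE (int i) noexcept { p[i] = 0.0; });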
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, n, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        AMREX_PRAGMA_SIMD
        for (T i = 0; i < n; ++i) { f(i); }
#endif
    }
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}
template <typename L, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, std::forward<L>(f));
#endif
    }
}

template <int MT, typename L, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, std::forward<L>(f));
#endif
    }
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f));
#endif
    }
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f));
#endif
    }
}
template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
    }
}
template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                       L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, box3,
                        std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, std::forward<L3>(f3));
#endif
    }
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
    }
}
template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                           box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2),
                                           box3, ncomp3, std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
    }
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info,
                        box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2),
                        box3, ncomp3, std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
    }
}
// HostDeviceFor: alias family for HostDeviceParallelFor.

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
}
template <typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2,
                                                 std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3,
                                                 std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, box2, box3,
                              std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}
// Convenience overloads without a Gpu::KernelInfo argument.

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void HostDeviceFor (BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void HostDeviceFor (BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}
template <typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2,
                            L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2,
                                                 std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2,
                            L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, box2,
                              std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                                 std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                              std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3,
          typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}

} // namespace amrex

#endif