#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
#define AMREX_GPU_LAUNCH_FUNCTS_G_H_

#include <AMReX_Config.H>

namespace amrex {

namespace detail {
template <typename F, typename N>
auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
    noexcept -> decltype(f(0))
{
    f(i);
}

template <typename F, typename N>
auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
    noexcept -> decltype(f(0,Gpu::Handler{}))
{
    f(i,handler);
}
template <typename F, std::size_t...Ns, class...Args>
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
    noexcept -> decltype(f(0, 0, 0, args...))
{
    f(iv[0], 0, 0, args...);
}

template <typename F, std::size_t...Ns, class...Args>
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
    noexcept -> decltype(f(0, 0, 0, args...))
{
    f(iv[0], iv[1], 0, args...);
}
template <typename F, int dim, std::size_t...Ns, class...Args>
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
    noexcept -> decltype(f(iv, args...))
{
    f(iv, args...);
}
template <typename F, int dim, std::size_t...Ns, class...Args>
auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
    noexcept -> decltype(f(iv[Ns]..., args...))
{
    f(iv[Ns]..., args...);
}

template <typename F, int dim>
auto call_f_intvect (F const& f, IntVectND<dim> iv)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
{
    call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
}
template <typename F, int dim>
auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
{
    call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
}

template <typename F, int dim>
auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
{
    call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
}

template <typename F, int dim>
auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
{
    call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
}
template <typename F, typename T, int dim>
auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }
}

template <typename F, typename T, int dim>
auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
    }
}

template <typename F, typename T, int dim>
auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }
}

template <typename F, typename T, int dim>
auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
    noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
{
    for (T n = 0; n < ncomp; ++n) {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
    }
}

} // namespace detail
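// The overload set above is the dispatch layer that lets the public
// ParallelFor APIs accept several lambda shapes. A usage sketch (caller-side,
// illustrative names only; "a" is a hypothetical Array4): for a 3D box,
// either of these lambdas is accepted, because call_f_intvect resolves to
// whichever call expression compiles.
//
//     amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) {
//         a(i,j,k) = 0.0;
//     });
//
//     amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (amrex::IntVect const& iv) {
//         a(iv) = 0.0;
//     });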
#ifdef AMREX_USE_SYCL
// ===== SYCL code path =====

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.single_task([=] () { f(); });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
    }
}
template <typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
        / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            {
                f(Gpu::Handler{&item}); // NOTE: lambda body reconstructed; original line lost in extraction
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
        / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                f(Gpu::Handler{&item});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L>
void launch (T const& n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
                    f(i);
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h); // NOTE: accessor size reconstructed
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
                                                          item.get_local_range(0));
                        detail::call_f_scalar_handler(f, T(i),
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h); // NOTE: accessor size reconstructed
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_handler(f, iv,
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h); // NOTE: accessor size reconstructed
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::ExecutionConfig(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); // NOTE: block-count cap reconstructed
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
                    f(T(i), rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
    Gpu::streamSynchronize(); // the RNG state table must not be used by multiple streams at once
}
template <typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); // NOTE: block-count cap reconstructed
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_engine(f,iv,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
    Gpu::streamSynchronize();
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block)
        * std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); // NOTE: block-count cap reconstructed
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
    Gpu::streamSynchronize();
}
template <int MT, typename L1, typename L2, int dim>
void ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box1, BoxND<dim> const& box2,
                  L1&& f1, L2&& f2) noexcept
{
    if (box1.isEmpty() && box2.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (box1.isEmpty() && box2.isEmpty() && box3.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect(f3,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (box1.isEmpty() && box2.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (Gpu::KernelInfo const&,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (box1.isEmpty() && box2.isEmpty() && box3.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            {
                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect_ncomp(f3,iv,ncomp3);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
#else
// ===== CUDA/HIP code path =====

template <int MT, typename L>
void launch (/* ... */) noexcept;   // signature and body lost in extraction

template <int MT, typename L>
void launch (/* ... */) noexcept;   // signature and body lost in extraction

template <typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    // body lost in extraction (kernel launch via AMREX_LAUNCH_KERNEL_NOBOUND)
}

template <typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
{
    launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}
template <int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
        [=] AMREX_GPU_DEVICE () noexcept {
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) { f(tid + start_idx); }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
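// Why the static_assert and the chunk loop: a single kernel launch covers at
// most nblocks*MT threads, and for very large n that product could exceed the
// grid-size limit or overflow a narrow index type. makeNExecutionConfigs
// therefore splits the range into chunks, each launched with its own
// start_idx offset. Sketch with illustrative numbers ("p" hypothetical):
//
//     amrex::Long big_n = amrex::Long(3) << 30;   // ~3.2e9 > INT_MAX
//     amrex::launch<256>(big_n, [=] AMREX_GPU_DEVICE (amrex::Long i) noexcept {
//         p[i] = 0;
//     });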
template <int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    const BoxIndexerND<dim> indexer(box);
    const auto type = box.ixType();
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                f(BoxND<dim>(iv,iv,type)); // NOTE: call reconstructed from the captured index type
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
        [=] AMREX_GPU_DEVICE () noexcept {
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) {
                detail::call_f_scalar_handler(f, tid+start_idx,
                    Gpu::Handler(amrex::min(std::uint64_t(nleft-tid)+std::uint64_t(threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_handler(f, iv,
                    Gpu::Handler(amrex::min(indexer.numPts()-icell+std::uint64_t(threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                    Gpu::Handler(amrex::min(indexer.numPts()-icell+std::uint64_t(threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(n);
    AMREX_LAUNCH_KERNEL_NOBOUND(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                                ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
            f(T(i), engine);
        }
    });
    Gpu::streamSynchronize(); // the RNG state table must not be used by multiple streams at once
    AMREX_GPU_ERROR_CHECK();
}
template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL_NOBOUND(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                                ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_engine(f, iv, engine);
        }
    });
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}
template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL_NOBOUND(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                                ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
             icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
        }
    });
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box1, BoxND<dim> const& box2,
             L1&& f1, L2&& f2) noexcept
{
    if (box1.isEmpty() && box2.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (box1.isEmpty() && box2.isEmpty() && box3.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect(f3, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (box1.isEmpty() && box2.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (box1.isEmpty() && box2.isEmpty() && box3.isEmpty()) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), // NOTE: launch-macro arguments reconstructed
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect_ncomp(f3, iv, ncomp3);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}
#endif

template <typename L>
void single_task (L&& f) noexcept
{
    single_task(Gpu::gpuStream(), std::forward<L>(f));
}

template <typename T, typename L>
void launch (T const& n, L&& f) noexcept
{
    launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
}
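// Usage sketch: with the wrapper above, callers that do not care about the
// block size can omit it; the two calls below are equivalent.
//
//     amrex::launch(n, f);                          // default block size
//     amrex::launch<AMREX_GPU_MAX_THREADS>(n, f);   // explicit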
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
                                       std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
                                       std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<MT>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3,
                                       std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(info, box1, box2, box3,
                    std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(info,
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (T n, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}
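// Usage sketch: the <MT> overloads pin the work-group/block size, which is
// useful when a kernel's register or shared-memory footprint requires a
// smaller block than AMREX_GPU_MAX_THREADS ("a" and "scale" illustrative):
//
//     amrex::ParallelFor<128>(bx, ncomp,
//     [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept {
//         a(i,j,k,n) *= scale;
//     });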
template <typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2,
                                       std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                       std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                    std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (T n, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2,
                                       std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                       std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                    std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        AMREX_PRAGMA_SIMD
        for (T i = 0; i < n; ++i) { f(i); }
#endif
    }
}
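// Usage sketch: HostDeviceParallelFor runs on the device only when
// Gpu::inLaunchRegion() is true; otherwise it runs the same lambda serially
// (with SIMD hints) on the host. The lambda must therefore be callable on
// both sides, i.e. marked AMREX_GPU_HOST_DEVICE ("p" illustrative):
//
//     amrex::HostDeviceParallelFor(n,
//     [=] AMREX_GPU_HOST_DEVICE (int i) noexcept { p[i] = 2*p[i]; });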
template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, n, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        AMREX_PRAGMA_SIMD
        for (T i = 0; i < n; ++i) { f(i); }
#endif
    }
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (T n, L&& f) noexcept // NOTE: signature reconstructed from the delegating call
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (T n, L&& f) noexcept // NOTE: signature and body reconstructed
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}
template <typename L, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, std::forward<L>(f)); // NOTE: CPU fallback reconstructed
#endif
    }
}

template <int MT, typename L, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, std::forward<L>(f)); // NOTE: CPU fallback reconstructed
#endif
    }
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f)); // NOTE: CPU fallback reconstructed
#endif
    }
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f)); // NOTE: CPU fallback reconstructed
#endif
    }
}
template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                       L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, box3,
                        std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, std::forward<L3>(f3));
#endif
    }
}
template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
    }
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
    }
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                           box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2),
                                           box3, ncomp3, std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
    }
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info,
                        box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2),
                        box3, ncomp3, std::forward<L3>(f3));
    } else {
#ifdef AMREX_USE_SYCL
        amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
#else
        // NOTE: CPU fallback reconstructed
        LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
        LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
        LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
    }
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
}
template <typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2,
                                                 std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, box2, std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3,
                                                 std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, box2, box3,
                              std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(info, box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2,
                                                 std::forward<L1>(f1), std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, box2,
                              std::forward<L1>(f1), std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, box2, box3,
                                                 std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, box2, box3,
                              std::forward<L1>(f1), std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}

} // namespace amrex

#endif // AMREX_GPU_LAUNCH_FUNCTS_G_H_