Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuLaunchFunctsG.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3#include <AMReX_Config.H>
4
5namespace amrex {
6
8namespace detail {
9
10 // call_f_scalar_handler
11
12 template <typename F, typename N>
14 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
15 noexcept -> decltype(f(0))
16 {
17 return f(i);
18 }
19
20 template <typename F, typename N>
22 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
23 noexcept -> decltype(f(0,Gpu::Handler{}))
24 {
25 return f(i, handler);
26 }
27
28 // call_f_intvect_inner
29
30 template <typename F, std::size_t...Ns, class...Args>
32 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
33 noexcept -> decltype(f(0, 0, 0, args...))
34 {
35 return f(iv[0], 0, 0, args...);
36 }
37
38 template <typename F, std::size_t...Ns, class...Args>
40 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
41 noexcept -> decltype(f(0, 0, 0, args...))
42 {
43 return f(iv[0], iv[1], 0, args...);
44 }
45
46 template <typename F, int dim, std::size_t...Ns, class...Args>
48 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
49 noexcept -> decltype(f(iv, args...))
50 {
51 return f(iv, args...);
52 }
53
54 template <typename F, int dim, std::size_t...Ns, class...Args>
56 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
57 noexcept -> decltype(f(iv[Ns]..., args...))
58 {
59 return f(iv[Ns]..., args...);
60 }
61
62 // call_f_intvect
63
64 template <typename F, int dim>
66 auto call_f_intvect (F const& f, IntVectND<dim> iv)
67 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
68 {
69 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
70 }
71
72 // call_f_intvect_engine
73
74 template <typename F, int dim>
76 auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
77 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
78 {
79 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
80 }
81
82 // call_f_intvect_handler
83
84 template <typename F, int dim>
86 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
87 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
88 {
89 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
90 }
91
92 template <typename F, int dim>
94 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
95 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
96 {
97 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
98 }
99
100 // call_f_intvect_ncomp
101
102 template <typename F, typename T, int dim>
104 auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
105 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
106 {
107 for (T n = 0; n < ncomp; ++n) {
108 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
109 }
110 }
111
112 // call_f_intvect_ncomp_engine
113
114 template <typename F, typename T, int dim>
116 auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
117 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
118 {
119 for (T n = 0; n < ncomp; ++n) {
120 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
121 }
122 }
123
124 // call_f_intvect_ncomp_handler
125
126 template <typename F, typename T, int dim>
128 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
129 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
130 {
131 for (T n = 0; n < ncomp; ++n) {
132 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
133 }
134 }
135
136 template <typename F, typename T, int dim>
138 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
139 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
140 {
141 for (T n = 0; n < ncomp; ++n) {
142 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
143 }
144 }
145
146}
148
149#ifdef AMREX_USE_SYCL
150
151template <typename L>
152void single_task (gpuStream_t stream, L const& f)
153{
154 detail::SyclKernelDevPtr<L> skdp(f, stream);
155 L const* pf = skdp.template get<0>();
157
158 auto& q = *(stream.queue);
159 try {
160 q.submit([&] (sycl::handler& h) {
161 if constexpr (detail::is_big_kernel<L>()) {
162 h.single_task(*pf);
163 } else {
164 h.single_task(f);
165 }
166 });
167 } catch (sycl::exception const& ex) {
168 amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
169 }
170}
171
172template<typename L>
173void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
174 gpuStream_t stream, L const& f)
175{
176 detail::SyclKernelDevPtr<L> skdp(f, stream);
177 L const* pf = skdp.template get<0>();
179
180 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
181 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
182 / sizeof(unsigned long long);
183 auto& q = *(stream.queue);
184 try {
185 q.submit([&] (sycl::handler& h) {
186 sycl::local_accessor<unsigned long long>
187 shared_data(sycl::range<1>(shared_mem_numull), h);
188 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
189 sycl::range<1>(nthreads_per_block)),
190 [=] (sycl::nd_item<1> item)
191 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
192 {
193 if constexpr (detail::is_big_kernel<L>()) {
194 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
195 } else {
196 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
197 }
198 });
199 });
200 } catch (sycl::exception const& ex) {
201 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
202 }
203}
204
205template<typename L>
206void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f)
207{
208 detail::SyclKernelDevPtr<L> skdp(f, stream);
209 L const* pf = skdp.template get<0>();
211
212 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
213 auto& q = *(stream.queue);
214 try {
215 q.submit([&] (sycl::handler& h) {
216 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
217 sycl::range<1>(nthreads_per_block)),
218 [=] (sycl::nd_item<1> item)
219 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
220 {
221 if constexpr (detail::is_big_kernel<L>()) {
222 (*pf)(item);
223 } else {
224 f(item);
225 }
226 });
227 });
228 } catch (sycl::exception const& ex) {
229 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
230 }
231}
232
233template <int MT, typename L>
234void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
235 L const& f)
236{
237 detail::SyclKernelDevPtr<L> skdp(f, stream);
238 L const* pf = skdp.template get<0>();
240
241 const auto nthreads_total = MT * std::size_t(nblocks);
242 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
243 / sizeof(unsigned long long);
244 auto& q = *(stream.queue);
245 try {
246 q.submit([&] (sycl::handler& h) {
247 sycl::local_accessor<unsigned long long>
248 shared_data(sycl::range<1>(shared_mem_numull), h);
249 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
250 sycl::range<1>(MT)),
251 [=] (sycl::nd_item<1> item)
252 [[sycl::reqd_work_group_size(MT)]]
253 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
254 {
255 if constexpr (detail::is_big_kernel<L>()) {
256 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
257 } else {
258 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
259 }
260 });
261 });
262 } catch (sycl::exception const& ex) {
263 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
264 }
265}
266
267template <int MT, typename L>
268void launch (int nblocks, gpuStream_t stream, L const& f)
269{
270 detail::SyclKernelDevPtr<L> skdp(f, stream);
271 L const* pf = skdp.template get<0>();
273
274 const auto nthreads_total = MT * std::size_t(nblocks);
275 auto& q = *(stream.queue);
276 try {
277 q.submit([&] (sycl::handler& h) {
278 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
279 sycl::range<1>(MT)),
280 [=] (sycl::nd_item<1> item)
281 [[sycl::reqd_work_group_size(MT)]]
282 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
283 {
284 if constexpr (detail::is_big_kernel<L>()) {
285 (*pf)(item);
286 } else {
287 f(item);
288 }
289 });
290 });
291 } catch (sycl::exception const& ex) {
292 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
293 }
294}
295
296template<int MT, typename T, typename L>
297void launch (T const& n, L const& f)
298{
299 if (amrex::isEmpty(n)) { return; }
300
301 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
302 L const* pf = skdp.template get<0>();
304
305 const auto ec = Gpu::makeExecutionConfig<MT>(n);
306 const auto nthreads_per_block = ec.numThreads.x;
307 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
308 auto& q = Gpu::Device::streamQueue();
309 try {
310 q.submit([&] (sycl::handler& h) {
311 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
312 sycl::range<1>(nthreads_per_block)),
313 [=] (sycl::nd_item<1> item)
314 [[sycl::reqd_work_group_size(MT)]]
315 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
316 {
317 for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
318 if constexpr (detail::is_big_kernel<L>()) {
319 (*pf)(i);
320 } else {
321 f(i);
322 }
323 }
324 });
325 });
326 } catch (sycl::exception const& ex) {
327 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
328 }
329}
330
331template <int MT, std::integral T, typename L>
332void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f)
333{
334 if (amrex::isEmpty(n)) { return; }
335
336 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
337 L const* pf = skdp.template get<0>();
339
340 const auto ec = Gpu::makeExecutionConfig<MT>(n);
341 const auto nthreads_per_block = ec.numThreads.x;
342 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
343 auto& q = Gpu::Device::streamQueue();
344 try {
345 if (info.hasReduction()) {
346 q.submit([&] (sycl::handler& h) {
347 sycl::local_accessor<unsigned long long>
348 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
349 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
350 sycl::range<1>(nthreads_per_block)),
351 [=] (sycl::nd_item<1> item)
352 [[sycl::reqd_work_group_size(MT)]]
353 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
354 {
355 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
356 i < std::size_t(n); i += stride) {
357 int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
358 item.get_local_range(0));
359 if constexpr (detail::is_big_kernel<L>()) {
360 detail::call_f_scalar_handler(*pf, T(i),
361 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
362 n_active_threads});
363 } else {
364 detail::call_f_scalar_handler(f, T(i),
365 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
366 n_active_threads});
367 }
368 }
369 });
370 });
371 } else {
372 q.submit([&] (sycl::handler& h) {
373 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
374 sycl::range<1>(nthreads_per_block)),
375 [=] (sycl::nd_item<1> item)
376 [[sycl::reqd_work_group_size(MT)]]
377 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
378 {
379 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
380 i < std::size_t(n); i += stride) {
381 if constexpr (detail::is_big_kernel<L>()) {
382 detail::call_f_scalar_handler(*pf, T(i), Gpu::Handler{&item});
383 } else {
384 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
385 }
386 }
387 });
388 });
389 }
390 } catch (sycl::exception const& ex) {
391 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
392 }
393}
394
395template <int MT, typename L, int dim>
396void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f)
397{
398 if (amrex::isEmpty(box)) { return; }
399
400 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
401 L const* pf = skdp.template get<0>();
403
404 const BoxIndexerND<dim> indexer(box);
405 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
406 const auto nthreads_per_block = ec.numThreads.x;
407 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
408 auto& q = Gpu::Device::streamQueue();
409 try {
410 if (info.hasReduction()) {
411 q.submit([&] (sycl::handler& h) {
412 sycl::local_accessor<unsigned long long>
413 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
414 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
415 sycl::range<1>(nthreads_per_block)),
416 [=] (sycl::nd_item<1> item)
417 [[sycl::reqd_work_group_size(MT)]]
418 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
419 {
420 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
421 icell < indexer.numPts(); icell += stride) {
422 auto iv = indexer.intVect(icell);
423 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
424 std::uint64_t(item.get_local_range(0)));
425 if constexpr (detail::is_big_kernel<L>()) {
426 detail::call_f_intvect_handler(*pf,
427 iv, Gpu::Handler{&item,
428 shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
429 n_active_threads});
430 } else {
431 detail::call_f_intvect_handler(f,
432 iv, Gpu::Handler{&item,
433 shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
434 n_active_threads});
435 }
436 }
437 });
438 });
439 } else {
440 q.submit([&] (sycl::handler& h) {
441 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
442 sycl::range<1>(nthreads_per_block)),
443 [=] (sycl::nd_item<1> item)
444 [[sycl::reqd_work_group_size(MT)]]
445 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
446 {
447 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
448 icell < indexer.numPts(); icell += stride) {
449 auto iv = indexer.intVect(icell);
450 if constexpr (detail::is_big_kernel<L>()) {
451 detail::call_f_intvect_handler(*pf,iv,Gpu::Handler{&item});
452 } else {
453 detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
454 }
455 }
456 });
457 });
458 }
459 } catch (sycl::exception const& ex) {
460 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
461 }
462}
463
464template <int MT, std::integral T, typename L, int dim>
465void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f)
466{
467 if (amrex::isEmpty(box)) { return; }
468
469 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
470 L const* pf = skdp.template get<0>();
472
473 const BoxIndexerND<dim> indexer(box);
474 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
475 const auto nthreads_per_block = ec.numThreads.x;
476 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
477 auto& q = Gpu::Device::streamQueue();
478 try {
479 if (info.hasReduction()) {
480 q.submit([&] (sycl::handler& h) {
481 sycl::local_accessor<unsigned long long>
482 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
483 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
484 sycl::range<1>(nthreads_per_block)),
485 [=] (sycl::nd_item<1> item)
486 [[sycl::reqd_work_group_size(MT)]]
487 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
488 {
489 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
490 icell < indexer.numPts(); icell += stride) {
491 auto iv = indexer.intVect(icell);
492 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
493 std::uint64_t(item.get_local_range(0)));
494 if constexpr (detail::is_big_kernel<L>()) {
495 detail::call_f_intvect_ncomp_handler(*pf, iv, ncomp,
496 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
497 n_active_threads});
498 } else {
499 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
500 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
501 n_active_threads});
502 }
503 }
504 });
505 });
506 } else {
507 q.submit([&] (sycl::handler& h) {
508 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
509 sycl::range<1>(nthreads_per_block)),
510 [=] (sycl::nd_item<1> item)
511 [[sycl::reqd_work_group_size(MT)]]
512 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
513 {
514 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
515 icell < indexer.numPts(); icell += stride) {
516 auto iv = indexer.intVect(icell);
517 if constexpr (detail::is_big_kernel<L>()) {
518 detail::call_f_intvect_ncomp_handler(*pf,iv,ncomp,Gpu::Handler{&item});
519 } else {
520 detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
521 }
522 }
523 });
524 });
525 }
526 } catch (sycl::exception const& ex) {
527 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
528 }
529}
530
531template <std::integral T, typename L>
532void ParallelForRNG (T n, L const& f)
533{
534 if (amrex::isEmpty(n)) { return; }
535
536 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
537 L const* pf = skdp.template get<0>();
539
540 const auto ec = Gpu::ExecutionConfig(n);
541 const auto nthreads_per_block = ec.numThreads.x;
542 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
543 auto& q = Gpu::Device::streamQueue();
544 auto& engdescr = *(getRandEngineDescriptor());
545 try {
546 q.submit([&] (sycl::handler& h) {
547 auto engine_acc = engdescr.get_access(h);
548 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549 sycl::range<1>(nthreads_per_block)),
550 [=] (sycl::nd_item<1> item)
551 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
552 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
553 {
554 auto const tid = item.get_global_id(0);
555 auto engine = engine_acc.load(tid);
556 RandomEngine rand_eng{&engine};
557 for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
558 if constexpr (detail::is_big_kernel<L>()) {
559 (*pf)(T(i),rand_eng);
560 } else {
561 f(T(i),rand_eng);
562 }
563 }
564 engine_acc.store(engine, tid);
565 });
566 });
567 q.wait_and_throw(); // because next launch might be on a different queue
568 } catch (sycl::exception const& ex) {
569 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
570 }
571}
572
573template <typename L, int dim>
574void ParallelForRNG (BoxND<dim> const& box, L const& f)
575{
576 if (amrex::isEmpty(box)) { return; }
577
578 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
579 L const* pf = skdp.template get<0>();
581
582 const BoxIndexerND<dim> indexer(box);
583 const auto ec = Gpu::ExecutionConfig(box.numPts());
584 const auto nthreads_per_block = ec.numThreads.x;
585 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
586 auto& q = Gpu::Device::streamQueue();
587 auto& engdescr = *(getRandEngineDescriptor());
588 try {
589 q.submit([&] (sycl::handler& h) {
590 auto engine_acc = engdescr.get_access(h);
591 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
592 sycl::range<1>(nthreads_per_block)),
593 [=] (sycl::nd_item<1> item)
594 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
595 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
596 {
597 auto const tid = item.get_global_id(0);
598 auto engine = engine_acc.load(tid);
599 RandomEngine rand_eng{&engine};
600 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
601 icell < indexer.numPts(); icell += stride) {
602 auto iv = indexer.intVect(icell);
603 if constexpr (detail::is_big_kernel<L>()) {
604 detail::call_f_intvect_engine(*pf,iv,rand_eng);
605 } else {
606 detail::call_f_intvect_engine(f,iv,rand_eng);
607 }
608 }
609 engine_acc.store(engine, tid);
610 });
611 });
612 q.wait_and_throw(); // because next launch might be on a different queue
613 } catch (sycl::exception const& ex) {
614 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
615 }
616}
617
618template <std::integral T, typename L, int dim>
619void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f)
620{
621 if (amrex::isEmpty(box)) { return; }
622
623 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
624 L const* pf = skdp.template get<0>();
626
627 const BoxIndexerND<dim> indexer(box);
628 const auto ec = Gpu::ExecutionConfig(box.numPts());
629 const auto nthreads_per_block = ec.numThreads.x;
630 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
631 auto& q = Gpu::Device::streamQueue();
632 auto& engdescr = *(getRandEngineDescriptor());
633 try {
634 q.submit([&] (sycl::handler& h) {
635 auto engine_acc = engdescr.get_access(h);
636 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
637 sycl::range<1>(nthreads_per_block)),
638 [=] (sycl::nd_item<1> item)
639 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
640 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
641 {
642 auto const tid = item.get_global_id(0);
643 auto engine = engine_acc.load(tid);
644 RandomEngine rand_eng{&engine};
645 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
646 icell < indexer.numPts(); icell += stride) {
647 auto iv = indexer.intVect(icell);
648 if constexpr (detail::is_big_kernel<L>()) {
649 detail::call_f_intvect_ncomp_engine(*pf,iv,ncomp,rand_eng);
650 } else {
651 detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
652 }
653 }
654 engine_acc.store(engine, tid);
655 });
656 });
657 q.wait_and_throw(); // because next launch might be on a different queue
658 } catch (sycl::exception const& ex) {
659 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
660 }
661}
662
663template <int MT, typename L1, typename L2, int dim>
664void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1 const& f1, L2 const& f2)
665{
666 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
667
668 detail::SyclKernelDevPtr<L1,L2> skdp(f1, f2, Gpu::gpuStream());
669 L1 const* pf1 = skdp.template get<0>();
670 L2 const* pf2 = skdp.template get<1>();
671 amrex::ignore_unused(pf1,pf2);
672
673 const BoxIndexerND<dim> indexer1(box1);
674 const BoxIndexerND<dim> indexer2(box2);
675 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
676 const auto nthreads_per_block = ec.numThreads.x;
677 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
678 auto& q = Gpu::Device::streamQueue();
679 try {
680 q.submit([&] (sycl::handler& h) {
681 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
682 sycl::range<1>(nthreads_per_block)),
683 [=] (sycl::nd_item<1> item)
684 [[sycl::reqd_work_group_size(MT)]]
685 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
686 {
687 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
688 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
689 icell < ncells; icell += stride) {
690 if (icell < indexer1.numPts()) {
691 auto iv = indexer1.intVect(icell);
692 if constexpr (detail::is_big_kernel<L1,L2>()) {
693 detail::call_f_intvect(*pf1,iv);
694 } else {
695 detail::call_f_intvect(f1,iv);
696 }
697 }
698 if (icell < indexer2.numPts()) {
699 auto iv = indexer2.intVect(icell);
700 if constexpr (detail::is_big_kernel<L1,L2>()) {
701 detail::call_f_intvect(*pf2,iv);
702 } else {
703 detail::call_f_intvect(f2,iv);
704 }
705 }
706 }
707 });
708 });
709 } catch (sycl::exception const& ex) {
710 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
711 }
712}
713
714template <int MT, typename L1, typename L2, typename L3, int dim>
715void ParallelFor (Gpu::KernelInfo const& /*info*/,
716 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
717 L1 const& f1, L2 const& f2, L3 const& f3)
718{
719 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
720
721 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3, Gpu::gpuStream());
722 L1 const* pf1 = skdp.template get<0>();
723 L2 const* pf2 = skdp.template get<1>();
724 L3 const* pf3 = skdp.template get<2>();
725 amrex::ignore_unused(pf1,pf2,pf3);
726
727 const BoxIndexerND<dim> indexer1(box1);
728 const BoxIndexerND<dim> indexer2(box2);
729 const BoxIndexerND<dim> indexer3(box3);
730 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
731 const auto nthreads_per_block = ec.numThreads.x;
732 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
733 auto& q = Gpu::Device::streamQueue();
734 try {
735 q.submit([&] (sycl::handler& h) {
736 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
737 sycl::range<1>(nthreads_per_block)),
738 [=] (sycl::nd_item<1> item)
739 [[sycl::reqd_work_group_size(MT)]]
740 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
741 {
742 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
743 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
744 icell < ncells; icell += stride) {
745 if (icell < indexer1.numPts()) {
746 auto iv = indexer1.intVect(icell);
747 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
748 detail::call_f_intvect(*pf1,iv);
749 } else {
750 detail::call_f_intvect(f1,iv);
751 }
752 }
753 if (icell < indexer2.numPts()) {
754 auto iv = indexer2.intVect(icell);
755 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
756 detail::call_f_intvect(*pf2,iv);
757 } else {
758 detail::call_f_intvect(f2,iv);
759 }
760 }
761 if (icell < indexer3.numPts()) {
762 auto iv = indexer3.intVect(icell);
763 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
764 detail::call_f_intvect(*pf3,iv);
765 } else {
766 detail::call_f_intvect(f3,iv);
767 }
768 }
769 }
770 });
771 });
772 } catch (sycl::exception const& ex) {
773 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
774 }
775}
776
777template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
778void ParallelFor (Gpu::KernelInfo const& /*info*/,
779 BoxND<dim> const& box1, T1 ncomp1, L1 const& f1,
780 BoxND<dim> const& box2, T2 ncomp2, L2 const& f2)
781{
782 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
783
784 detail::SyclKernelDevPtr<L1,L2> skdp(f1, f2, Gpu::gpuStream());
785 L1 const* pf1 = skdp.template get<0>();
786 L2 const* pf2 = skdp.template get<1>();
787 amrex::ignore_unused(pf1,pf2);
788
789 const BoxIndexerND<dim> indexer1(box1);
790 const BoxIndexerND<dim> indexer2(box2);
791 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
792 const auto nthreads_per_block = ec.numThreads.x;
793 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
794 auto& q = Gpu::Device::streamQueue();
795 try {
796 q.submit([&] (sycl::handler& h) {
797 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
798 sycl::range<1>(nthreads_per_block)),
799 [=] (sycl::nd_item<1> item)
800 [[sycl::reqd_work_group_size(MT)]]
801 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
802 {
803 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
804 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
805 icell < ncells; icell += stride) {
806 if (icell < indexer1.numPts()) {
807 auto iv = indexer1.intVect(icell);
808 if constexpr (detail::is_big_kernel<L1,L2>()) {
809 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
810 } else {
811 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
812 }
813 }
814 if (icell < indexer2.numPts()) {
815 auto iv = indexer2.intVect(icell);
816 if constexpr (detail::is_big_kernel<L1,L2>()) {
817 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
818 } else {
819 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
820 }
821 }
822 }
823 });
824 });
825 } catch (sycl::exception const& ex) {
826 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
827 }
828}
829
830template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
831void ParallelFor (Gpu::KernelInfo const& /*info*/,
832 BoxND<dim> const& box1, T1 ncomp1, L1 const& f1,
833 BoxND<dim> const& box2, T2 ncomp2, L2 const& f2,
834 BoxND<dim> const& box3, T3 ncomp3, L3 const& f3)
835{
836 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
837
838 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3, Gpu::gpuStream());
839 L1 const* pf1 = skdp.template get<0>();
840 L2 const* pf2 = skdp.template get<1>();
841 L3 const* pf3 = skdp.template get<2>();
842 amrex::ignore_unused(pf1,pf2,pf3);
843
844 const BoxIndexerND<dim> indexer1(box1);
845 const BoxIndexerND<dim> indexer2(box2);
846 const BoxIndexerND<dim> indexer3(box3);
847 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
848 const auto nthreads_per_block = ec.numThreads.x;
849 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
850 auto& q = Gpu::Device::streamQueue();
851 try {
852 q.submit([&] (sycl::handler& h) {
853 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
854 sycl::range<1>(nthreads_per_block)),
855 [=] (sycl::nd_item<1> item)
856 [[sycl::reqd_work_group_size(MT)]]
857 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
858 {
859 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
860 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
861 icell < ncells; icell += stride) {
862 if (icell < indexer1.numPts()) {
863 auto iv = indexer1.intVect(icell);
864 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
865 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
866 } else {
867 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
868 }
869 }
870 if (icell < indexer2.numPts()) {
871 auto iv = indexer2.intVect(icell);
872 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
873 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
874 } else {
875 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
876 }
877 }
878 if (icell < indexer3.numPts()) {
879 auto iv = indexer3.intVect(icell);
880 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
881 detail::call_f_intvect_ncomp(*pf3,iv,ncomp3);
882 } else {
883 detail::call_f_intvect_ncomp(f3,iv,ncomp3);
884 }
885 }
886 }
887 });
888 });
889 } catch (sycl::exception const& ex) {
890 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
891 }
892}
893
894#else
895// CUDA or HIP
896
897template <typename L>
898void single_task (gpuStream_t stream, L const& f)
899{
900 AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, f);
902}
903
904template <int MT, typename L>
905void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
906 L const& f)
907{
908 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, f);
910}
911
912template <int MT, typename L>
913void launch (int nblocks, gpuStream_t stream, L const& f)
914{
915 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, f);
917}
918
919template<typename L>
920void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
921 gpuStream_t stream, L const& f)
922{
923 AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes, stream, f);
925}
926
927template<typename L>
928void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
929{
930 launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
931}
932
933template<int MT, std::integral T, typename L>
934void launch (T const& n, L const& f)
935{
936 static_assert(sizeof(T) >= 2);
937 if (amrex::isEmpty(n)) { return; }
938 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
939 for (auto const& ec : nec) {
940 const T start_idx = T(ec.start_idx);
941 const T nleft = n - start_idx;
942 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
943 [=] AMREX_GPU_DEVICE () noexcept {
944 // This will not overflow, even though nblocks*MT might.
945 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
946 if (tid < nleft) {
947 f(tid+start_idx);
948 }
949 });
950 }
952}
953
954template<int MT, int dim, typename L>
955void launch (BoxND<dim> const& box, L const& f)
956{
957 if (box.isEmpty()) { return; }
958 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
959 const BoxIndexerND<dim> indexer(box);
960 const auto type = box.ixType();
961 for (auto const& ec : nec) {
962 const auto start_idx = std::uint64_t(ec.start_idx);
963 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
964 [=] AMREX_GPU_DEVICE () noexcept {
965 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
966 if (icell < indexer.numPts()) {
967 auto iv = indexer.intVect(icell);
968 f(BoxND<dim>(iv,iv,type));
969 }
970 });
971 }
973}
974
979template <int MT, std::integral T, typename L>
980requires (MaybeDeviceRunnable<L>::value)
981void
982ParallelFor (Gpu::KernelInfo const&, T n, L const& f)
983{
984 static_assert(sizeof(T) >= 2);
985 if (amrex::isEmpty(n)) { return; }
986 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
987 for (auto const& ec : nec) {
988 const T start_idx = T(ec.start_idx);
989 const T nleft = n - start_idx;
990 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
991 [=] AMREX_GPU_DEVICE () noexcept {
992 // This will not overflow, even though nblocks*MT might.
993 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
994 if (tid < nleft) {
995 detail::call_f_scalar_handler(f, tid+start_idx,
996 Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
997 (std::uint64_t)MT)));
998 }
999 });
1000 }
1002}
1003
1008template <int MT, typename L, int dim>
1009requires (MaybeDeviceRunnable<L>::value)
1010void
1011ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f)
1012{
1013 if (amrex::isEmpty(box)) { return; }
1014 const BoxIndexerND<dim> indexer(box);
1015 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1016 for (auto const& ec : nec) {
1017 const auto start_idx = std::uint64_t(ec.start_idx);
1018 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
1019 [=] AMREX_GPU_DEVICE () noexcept {
1020 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1021 if (icell < indexer.numPts()) {
1022 auto iv = indexer.intVect(icell);
1023 detail::call_f_intvect_handler(f, iv,
1024 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
1025 (std::uint64_t)MT)));
1026 }
1027 });
1028 }
1030}
1031
1036template <int MT, std::integral T, typename L, int dim>
1037requires (MaybeDeviceRunnable<L>::value)
1038void
1039ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
1040{
1041 if (amrex::isEmpty(box)) { return; }
1042 const BoxIndexerND<dim> indexer(box);
1043 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1044 for (auto const& ec : nec) {
1045 const auto start_idx = std::uint64_t(ec.start_idx);
1046 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
1047 [=] AMREX_GPU_DEVICE () noexcept {
1048 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1049 if (icell < indexer.numPts()) {
1050 auto iv = indexer.intVect(icell);
1051 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
1052 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
1053 (std::uint64_t)MT)));
1054 }
1055 });
1056 }
1058}
1059
1065template <std::integral T, typename L>
1066requires (MaybeDeviceRunnable<L>::value)
1067void
1068ParallelForRNG (T n, L const& f)
1069{
1070 if (amrex::isEmpty(n)) { return; }
1071 randState_t* rand_state = getRandState();
1072 const auto ec = Gpu::ExecutionConfig(n);
1073 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
1075 ec.numThreads, 0, Gpu::gpuStream(),
1076 [=] AMREX_GPU_DEVICE () noexcept {
1077 Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1078 RandomEngine engine{&(rand_state[tid])};
1079 for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
1080 f(T(i),engine);
1081 }
1082 });
1083 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
1085}
1086
1092template <typename L, int dim>
1093requires (MaybeDeviceRunnable<L>::value)
1094void
1095ParallelForRNG (BoxND<dim> const& box, L const& f)
1096{
1097 if (amrex::isEmpty(box)) { return; }
1098 randState_t* rand_state = getRandState();
1099 const BoxIndexerND<dim> indexer(box);
1100 const auto ec = Gpu::ExecutionConfig(box.numPts());
1101 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
1103 ec.numThreads, 0, Gpu::gpuStream(),
1104 [=] AMREX_GPU_DEVICE () noexcept {
1105 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1106 RandomEngine engine{&(rand_state[tid])};
1107 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
1108 auto iv = indexer.intVect(icell);
1109 detail::call_f_intvect_engine(f, iv, engine);
1110 }
1111 });
1112 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
1114}
1115
1121template <std::integral T, typename L, int dim>
1122requires (MaybeDeviceRunnable<L>::value)
1123void
1124ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f)
1125{
1126 if (amrex::isEmpty(box)) { return; }
1127 randState_t* rand_state = getRandState();
1128 const BoxIndexerND<dim> indexer(box);
1129 const auto ec = Gpu::ExecutionConfig(box.numPts());
1130 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
1132 ec.numThreads, 0, Gpu::gpuStream(),
1133 [=] AMREX_GPU_DEVICE () noexcept {
1134 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1135 RandomEngine engine{&(rand_state[tid])};
1136 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
1137 auto iv = indexer.intVect(icell);
1138 detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
1139 }
1140 });
1141 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
1143}
1144
1149template <int MT, typename L1, typename L2, int dim>
1150requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1151void
1153 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2)
1154{
1155 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
1156 const BoxIndexerND<dim> indexer1(box1);
1157 const BoxIndexerND<dim> indexer2(box2);
1158 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1159 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1160 [=] AMREX_GPU_DEVICE () noexcept {
1161 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1162 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1163 icell < ncells; icell += stride) {
1164 if (icell < indexer1.numPts()) {
1165 auto iv = indexer1.intVect(icell);
1166 detail::call_f_intvect(f1, iv);
1167 }
1168 if (icell < indexer2.numPts()) {
1169 auto iv = indexer2.intVect(icell);
1170 detail::call_f_intvect(f2, iv);
1171 }
1172 }
1173 });
1175}
1176
1181template <int MT, typename L1, typename L2, typename L3, int dim>
1182requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1183 MaybeDeviceRunnable<L3>::value)
1184void
1186 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1187 L1&& f1, L2&& f2, L3&& f3)
1188{
1189 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1190 const BoxIndexerND<dim> indexer1(box1);
1191 const BoxIndexerND<dim> indexer2(box2);
1192 const BoxIndexerND<dim> indexer3(box3);
1193 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1194 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1195 [=] AMREX_GPU_DEVICE () noexcept {
1196 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1197 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1198 icell < ncells; icell += stride) {
1199 if (icell < indexer1.numPts()) {
1200 auto iv = indexer1.intVect(icell);
1201 detail::call_f_intvect(f1, iv);
1202 }
1203 if (icell < indexer2.numPts()) {
1204 auto iv = indexer2.intVect(icell);
1205 detail::call_f_intvect(f2, iv);
1206 }
1207 if (icell < indexer3.numPts()) {
1208 auto iv = indexer3.intVect(icell);
1209 detail::call_f_intvect(f3, iv);
1210 }
1211 }
1212 });
1214}
1215
1220template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1221requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1222void
1224 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1225 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
1226{
1227 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
1228 const BoxIndexerND<dim> indexer1(box1);
1229 const BoxIndexerND<dim> indexer2(box2);
1230 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1231 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1232 [=] AMREX_GPU_DEVICE () noexcept {
1233 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1234 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1235 icell < ncells; icell += stride) {
1236 if (icell < indexer1.numPts()) {
1237 auto iv = indexer1.intVect(icell);
1238 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1239 }
1240 if (icell < indexer2.numPts()) {
1241 auto iv = indexer2.intVect(icell);
1242 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1243 }
1244 }
1245 });
1247}
1248
1253template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1254requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1255 MaybeDeviceRunnable<L3>::value)
1256void
1258 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1259 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1260 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
1261{
1262 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1263 const BoxIndexerND<dim> indexer1(box1);
1264 const BoxIndexerND<dim> indexer2(box2);
1265 const BoxIndexerND<dim> indexer3(box3);
1266 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1267 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1268 [=] AMREX_GPU_DEVICE () noexcept {
1269 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1270 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1271 icell < ncells; icell += stride) {
1272 if (icell < indexer1.numPts()) {
1273 auto iv = indexer1.intVect(icell);
1274 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1275 }
1276 if (icell < indexer2.numPts()) {
1277 auto iv = indexer2.intVect(icell);
1278 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1279 }
1280 if (icell < indexer3.numPts()) {
1281 auto iv = indexer3.intVect(icell);
1282 detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1283 }
1284 }
1285 });
1287}
1288
1289#endif
1290
1291template <typename L>
1292void single_task (L&& f) noexcept
1293{
1294 single_task(Gpu::gpuStream(), std::forward<L>(f));
1295}
1296
1297template<typename T, typename L>
1298void launch (T const& n, L&& f) noexcept
1299{
1300 launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1301}
1302
1307template <std::integral T, typename L>
1308requires (MaybeDeviceRunnable<L>::value)
1309void
1310ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1311{
1312 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1313}
1314
1319template <typename L, int dim>
1320requires (MaybeDeviceRunnable<L>::value)
1321void
1322ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1323{
1324 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1325}
1326
1331template <std::integral T, typename L, int dim>
1332requires (MaybeDeviceRunnable<L>::value)
1333void
1334ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1335{
1336 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1337}
1338
1343template <typename L1, typename L2, int dim>
1344requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1345void
1347 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1348{
1349 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1350 std::forward<L2>(f2));
1351}
1352
1357template <typename L1, typename L2, typename L3, int dim>
1358requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1359 MaybeDeviceRunnable<L3>::value)
1360void
1362 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1363 L1&& f1, L2&& f2, L3&& f3) noexcept
1364{
1365 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1366 std::forward<L2>(f2), std::forward<L3>(f3));
1367}
1368
1373template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1374requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
1375void
1377 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1378 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1379{
1380 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1381 box2, ncomp2, std::forward<L2>(f2));
1382}
1383
1388template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1389requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
1390 MaybeDeviceRunnable<L3>::value)
1391void
1393 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1394 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1395 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1396{
1397 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1398 box2, ncomp2, std::forward<L2>(f2),
1399 box3, ncomp3, std::forward<L3>(f3));
1400}
1401
1402template <std::integral T, typename L>
1403void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1404{
1405 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1406}
1407
1408template <int MT, std::integral T, typename L>
1409void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1410{
1411 ParallelFor<MT>(info, n,std::forward<L>(f));
1412}
1413
1414template <typename L, int dim>
1415void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1416{
1417 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1418}
1419
1420template <int MT, typename L, int dim>
1421void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1422{
1423 ParallelFor<MT>(info, box,std::forward<L>(f));
1424}
1425
1426template <std::integral T, typename L, int dim>
1427void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1428{
1429 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1430}
1431
1432template <int MT, std::integral T, typename L, int dim>
1433void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1434{
1435 ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1436}
1437
1438template <typename L1, typename L2, int dim>
1439void For (Gpu::KernelInfo const& info,
1440 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1441{
1442 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1443}
1444
1445template <int MT, typename L1, typename L2, int dim>
1446void For (Gpu::KernelInfo const& info,
1447 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1448{
1449 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1450}
1451
1452template <typename L1, typename L2, typename L3, int dim>
1453void For (Gpu::KernelInfo const& info,
1454 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1455 L1&& f1, L2&& f2, L3&& f3) noexcept
1456{
1457 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1458}
1459
1460template <int MT, typename L1, typename L2, typename L3, int dim>
1461void For (Gpu::KernelInfo const& info,
1462 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1463 L1&& f1, L2&& f2, L3&& f3) noexcept
1464{
1465 ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1466}
1467
1468template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1469void For (Gpu::KernelInfo const& info,
1470 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1471 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1472{
1473 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1474}
1475
1476template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1477void For (Gpu::KernelInfo const& info,
1478 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1479 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1480{
1481 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1482}
1483
1484template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1485void For (Gpu::KernelInfo const& info,
1486 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1487 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1488 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1489{
1490 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1491 box1,ncomp1,std::forward<L1>(f1),
1492 box2,ncomp2,std::forward<L2>(f2),
1493 box3,ncomp3,std::forward<L3>(f3));
1494}
1495
1496template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1497void For (Gpu::KernelInfo const& info,
1498 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1499 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1500 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1501{
1502 ParallelFor<MT>(info,
1503 box1,ncomp1,std::forward<L1>(f1),
1504 box2,ncomp2,std::forward<L2>(f2),
1505 box3,ncomp3,std::forward<L3>(f3));
1506}
1507
1512template <std::integral T, typename L>
1513void ParallelFor (T n, L&& f) noexcept
1514{
1515 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1516}
1517
1522template <int MT, std::integral T, typename L>
1523void ParallelFor (T n, L&& f) noexcept
1524{
1525 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1526}
1527
1532template <typename L, int dim>
1533void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1534{
1535 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1536}
1537
1542template <int MT, typename L, int dim>
1543void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1544{
1545 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1546}
1547
1552template <std::integral T, typename L, int dim>
1553void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1554{
1555 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1556}
1557
1562template <int MT, std::integral T, typename L, int dim>
1563void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1564{
1565 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1566}
1567
1572template <typename L1, typename L2, int dim>
1573void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1574{
1575 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1576}
1577
1582template <int MT, typename L1, typename L2, int dim>
1583void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1584{
1585 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1586}
1587
1592template <typename L1, typename L2, typename L3, int dim>
1593void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1594 L1&& f1, L2&& f2, L3&& f3) noexcept
1595{
1596 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1597}
1598
1603template <int MT, typename L1, typename L2, typename L3, int dim>
1604void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1605 L1&& f1, L2&& f2, L3&& f3) noexcept
1606{
1607 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1608}
1609
1614template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1615void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1616 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1617{
1618 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1619}
1620
1625template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1626void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1627 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1628{
1629 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1630}
1631
1636template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1637void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1638 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1639 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1640{
1641 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1642 box1,ncomp1,std::forward<L1>(f1),
1643 box2,ncomp2,std::forward<L2>(f2),
1644 box3,ncomp3,std::forward<L3>(f3));
1645}
1646
1651template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1652void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1653 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1654 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1655{
1656 ParallelFor<MT>(Gpu::KernelInfo{},
1657 box1,ncomp1,std::forward<L1>(f1),
1658 box2,ncomp2,std::forward<L2>(f2),
1659 box3,ncomp3,std::forward<L3>(f3));
1660}
1661
1662template <std::integral T, typename L>
1663void For (T n, L&& f) noexcept
1664{
1665 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1666}
1667
1668template <int MT, std::integral T, typename L>
1669void For (T n, L&& f) noexcept
1670{
1671 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1672}
1673
1674template <typename L, int dim>
1675void For (BoxND<dim> const& box, L&& f) noexcept
1676{
1677 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1678}
1679
1680template <int MT, typename L, int dim>
1681void For (BoxND<dim> const& box, L&& f) noexcept
1682{
1683 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1684}
1685
1686template <std::integral T, typename L, int dim>
1687void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1688{
1689 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1690}
1691
1692template <int MT, std::integral T, typename L, int dim>
1693void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1694{
1695 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1696}
1697
1698template <typename L1, typename L2, int dim>
1699void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1700{
1701 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1702}
1703
1704template <int MT, typename L1, typename L2, int dim>
1705void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1706{
1707 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1708}
1709
1710template <typename L1, typename L2, typename L3, int dim>
1711void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1712 L1&& f1, L2&& f2, L3&& f3) noexcept
1713{
1714 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1715}
1716
1717template <int MT, typename L1, typename L2, typename L3, int dim>
1718void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1719 L1&& f1, L2&& f2, L3&& f3) noexcept
1720{
1721 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1722}
1723
1724template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1725void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1726 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1727{
1728 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1729}
1730
1731template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1732void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1733 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1734{
1735 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1736}
1737
1738template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1739void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1740 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1741 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1742{
1743 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1744 box1,ncomp1,std::forward<L1>(f1),
1745 box2,ncomp2,std::forward<L2>(f2),
1746 box3,ncomp3,std::forward<L3>(f3));
1747}
1748
1749template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1750void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1751 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1752 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1753{
1754 ParallelFor<MT>(Gpu::KernelInfo{},
1755 box1,ncomp1,std::forward<L1>(f1),
1756 box2,ncomp2,std::forward<L2>(f2),
1757 box3,ncomp3,std::forward<L3>(f3));
1758}
1759
1760template <std::integral T, typename L>
1761requires (MaybeHostDeviceRunnable<L>::value)
1762void
1764{
1765 if (Gpu::inLaunchRegion()) {
1766 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1767 } else {
1768#ifdef AMREX_USE_SYCL
1769 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1770#else
1772 for (T i = 0; i < n; ++i) { f(i); }
1773#endif
1774 }
1775}
1776
1777template <int MT, std::integral T, typename L>
1778requires (MaybeHostDeviceRunnable<L>::value)
1779void
1781{
1782 if (Gpu::inLaunchRegion()) {
1783 ParallelFor<MT>(info,n,std::forward<L>(f));
1784 } else {
1785#ifdef AMREX_USE_SYCL
1786 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1787#else
1789 for (T i = 0; i < n; ++i) { f(i); }
1790#endif
1791 }
1792}
1793
1794template <std::integral T, typename L>
1795requires (MaybeHostDeviceRunnable<L>::value)
1796void
1797HostDeviceParallelFor (T n, L&& f) noexcept
1798{
1799 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1800}
1801
1802template <int MT, std::integral T, typename L>
1803requires (MaybeHostDeviceRunnable<L>::value)
1804void
1805HostDeviceParallelFor (T n, L&& f) noexcept
1806{
1807 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1808}
1809
1810template <typename L, int dim>
1811requires (MaybeHostDeviceRunnable<L>::value)
1812void
1813HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f)
1814{
1815 if (Gpu::inLaunchRegion()) {
1816 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1817 } else {
1818#ifdef AMREX_USE_SYCL
1819 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1820#else
1821 LoopConcurrentOnCpu(box,std::forward<L>(f));
1822#endif
1823 }
1824}
1825
1826template <int MT, typename L, int dim>
1827requires (MaybeHostDeviceRunnable<L>::value)
1828void
1829HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f)
1830{
1831 if (Gpu::inLaunchRegion()) {
1832 ParallelFor<MT>(info, box,std::forward<L>(f));
1833 } else {
1834#ifdef AMREX_USE_SYCL
1835 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1836#else
1837 LoopConcurrentOnCpu(box,std::forward<L>(f));
1838#endif
1839 }
1840}
1841
1842template <std::integral T, typename L, int dim>
1843requires (MaybeHostDeviceRunnable<L>::value)
1844void
1845HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f)
1846{
1847 if (Gpu::inLaunchRegion()) {
1848 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1849 } else {
1850#ifdef AMREX_USE_SYCL
1851 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1852#else
1853 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1854#endif
1855 }
1856}
1857
1858template <int MT, std::integral T, typename L, int dim>
1859requires (MaybeHostDeviceRunnable<L>::value)
1860void
1861HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f)
1862{
1863 if (Gpu::inLaunchRegion()) {
1864 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1865 } else {
1866#ifdef AMREX_USE_SYCL
1867 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1868#else
1869 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1870#endif
1871 }
1872}
1873
1874template <typename L1, typename L2, int dim>
1875requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1876void
1878 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2)
1879{
1880 if (Gpu::inLaunchRegion()) {
1881 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1882 } else {
1883#ifdef AMREX_USE_SYCL
1884 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1885#else
1886 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1887 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1888#endif
1889 }
1890}
1891
1892template <int MT, typename L1, typename L2, int dim>
1893requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1894void
1896 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2)
1897{
1898 if (Gpu::inLaunchRegion()) {
1899 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1900 } else {
1901#ifdef AMREX_USE_SYCL
1902 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1903#else
1904 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1905 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1906#endif
1907 }
1908}
1909
1910template <int MT, typename L1, typename L2, typename L3, int dim>
1911requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value &&
1912 MaybeHostDeviceRunnable<L3>::value)
1913void
1915 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1916 L1&& f1, L2&& f2, L3&& f3)
1917{
1918 if (Gpu::inLaunchRegion()) {
1919 ParallelFor<MT>(info,box1,box2,box3,
1920 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1921 } else {
1922#ifdef AMREX_USE_SYCL
1923 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1924#else
1925 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1926 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1927 LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1928#endif
1929 }
1930}
1931
1932template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1933requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1934void
1936 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1937 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
1938{
1939 if (Gpu::inLaunchRegion()) {
1940 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1941 } else {
1942#ifdef AMREX_USE_SYCL
1943 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1944#else
1945 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1946 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1947#endif
1948 }
1949}
1950
1951template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
1952requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
1953void
1955 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1956 BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
1957{
1958 if (Gpu::inLaunchRegion()) {
1959 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1960 } else {
1961#ifdef AMREX_USE_SYCL
1962 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1963#else
1964 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1965 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1966#endif
1967 }
1968}
1969
1970template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1971requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value &&
1972 MaybeHostDeviceRunnable<L3>::value)
1973void
1975 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1976 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1977 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
1978{
1979 if (Gpu::inLaunchRegion()) {
1980 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1981 box1,ncomp1,std::forward<L1>(f1),
1982 box2,ncomp2,std::forward<L2>(f2),
1983 box3,ncomp3,std::forward<L3>(f3));
1984 } else {
1985#ifdef AMREX_USE_SYCL
1986 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1987#else
1988 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1989 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1990 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1991#endif
1992 }
1993}
1994
1995template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
1996requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value &&
1997 MaybeHostDeviceRunnable<L3>::value)
1998void
2000 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2001 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2002 BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
2003{
2004 if (Gpu::inLaunchRegion()) {
2005 ParallelFor<MT>(info,
2006 box1,ncomp1,std::forward<L1>(f1),
2007 box2,ncomp2,std::forward<L2>(f2),
2008 box3,ncomp3,std::forward<L3>(f3));
2009 } else {
2010#ifdef AMREX_USE_SYCL
2011 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
2012#else
2013 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
2014 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
2015 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
2016#endif
2017 }
2018}
2019
2020template <std::integral T, typename L>
2021void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
2022{
2023 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
2024}
2025
2026template <int MT, std::integral T, typename L>
2027void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
2028{
2029 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
2030}
2031
2032template <typename L, int dim>
2033void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
2034{
2035 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
2036}
2037
2038template <int MT, typename L, int dim>
2039void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
2040{
2041 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
2042}
2043
2044template <std::integral T, typename L, int dim>
2045void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
2046{
2047 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
2048}
2049
2050template <int MT, std::integral T, typename L, int dim>
2051void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
2052{
2053 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
2054}
2055
2056template <typename L1, typename L2, int dim>
2057void HostDeviceFor (Gpu::KernelInfo const& info,
2058 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2059{
2060 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2061}
2062
2063template <int MT, typename L1, typename L2, int dim>
2064void HostDeviceFor (Gpu::KernelInfo const& info,
2065 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2066{
2067 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2068}
2069
2070template <typename L1, typename L2, typename L3, int dim>
2071void HostDeviceFor (Gpu::KernelInfo const& info,
2072 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2073 L1&& f1, L2&& f2, L3&& f3) noexcept
2074{
2075 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
2076 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2077}
2078
2079template <int MT, typename L1, typename L2, typename L3, int dim>
2080void HostDeviceFor (Gpu::KernelInfo const& info,
2081 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2082 L1&& f1, L2&& f2, L3&& f3) noexcept
2083{
2084 HostDeviceParallelFor<MT>(info, box1,box2,box3,
2085 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2086}
2087
2088template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
2089void HostDeviceFor (Gpu::KernelInfo const& info,
2090 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2091 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2092{
2093 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2094}
2095
2096template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
2097void HostDeviceFor (Gpu::KernelInfo const& info,
2098 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2099 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2100{
2101 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2102}
2103
2104template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
2105void HostDeviceFor (Gpu::KernelInfo const& info,
2106 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2107 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2108 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2109{
2110 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
2111 box1,ncomp1,std::forward<L1>(f1),
2112 box2,ncomp2,std::forward<L2>(f2),
2113 box3,ncomp3,std::forward<L3>(f3));
2114}
2115
2116template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
2117void HostDeviceFor (Gpu::KernelInfo const& info,
2118 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2119 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2120 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2121{
2122 HostDeviceParallelFor<MT>(info,
2123 box1,ncomp1,std::forward<L1>(f1),
2124 box2,ncomp2,std::forward<L2>(f2),
2125 box3,ncomp3,std::forward<L3>(f3));
2126}
2127
2128template <std::integral T, typename L>
2129void HostDeviceParallelFor (T n, L&& f) noexcept
2130{
2131 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
2132}
2133
2134template <int MT, std::integral T, typename L>
2135void HostDeviceParallelFor (T n, L&& f) noexcept
2136{
2137 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
2138}
2139
2140template <typename L, int dim>
2141void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
2142{
2143 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
2144}
2145
2146template <int MT, typename L, int dim>
2147void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
2148{
2149 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
2150}
2151
2152template <std::integral T, typename L, int dim>
2153void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
2154{
2155 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2156}
2157
2158template <int MT, std::integral T, typename L, int dim>
2159void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
2160{
2161 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2162}
2163
2164template <typename L1, typename L2, int dim>
2165void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2166{
2167 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2168}
2169
2170template <int MT, typename L1, typename L2, int dim>
2171void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2172{
2173 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2174}
2175
2176template <typename L1, typename L2, typename L3, int dim>
2177void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2178 L1&& f1, L2&& f2, L3&& f3) noexcept
2179{
2180 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
2181 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2182}
2183
2184template <int MT, typename L1, typename L2, typename L3, int dim>
2185void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2186 L1&& f1, L2&& f2, L3&& f3) noexcept
2187{
2188 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
2189 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2190}
2191
2192template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>
2193void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2194 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2195{
2196 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2197}
2198
2199template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
2200void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2201 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2202{
2203 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2204}
2205
2206template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
2207void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2208 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2209 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2210{
2211 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
2212 box1,ncomp1,std::forward<L1>(f1),
2213 box2,ncomp2,std::forward<L2>(f2),
2214 box3,ncomp3,std::forward<L3>(f3));
2215}
2216
2217template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
2218void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2219 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2220 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2221{
2222 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
2223 box1,ncomp1,std::forward<L1>(f1),
2224 box2,ncomp2,std::forward<L2>(f2),
2225 box3,ncomp3,std::forward<L3>(f3));
2226}
2227
2228}
2229
2230#endif
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37
#define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:39
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
__host__ __device__ bool isEmpty() const noexcept
Checks if it is an empty BoxND.
Definition AMReX_Box.H:208
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:136
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:239
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
Definition AMReX_GpuKernelInfo.H:8
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:128
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:88
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
void HostDeviceParallelFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:726
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79
randState_t * getRandState()
Definition AMReX_RandomEngine.H:65
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202
void HostDeviceFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:829
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:25
void launch(T const &n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:122
bool isEmpty(T n) noexcept
Definition AMReX_GpuRange.H:15
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:45
void single_task(L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1239
AMREX_ATTRIBUTE_FLATTEN_FOR void ParallelForRNG(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1151
curandState_t randState_t
Definition AMReX_RandomEngine.H:58
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1334
Definition AMReX_Box.H:2170
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2187
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2211
Definition AMReX_GpuLaunch.H:120
Definition AMReX_GpuTypes.H:86
Definition AMReX_RandomEngine.H:72