Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuLaunchFunctsG.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3#include <AMReX_Config.H>
4
5namespace amrex {
6
// Dispatch helpers that adapt a user lambda to the several call
// signatures ParallelFor supports. Overload selection is done via the
// trailing decltype (expression SFINAE): the overload whose decltype
// expression is well-formed for F wins. (Doxygen stripped the
// AMREX_GPU_* attribute lines that precede each function here.)
8namespace detail {
9
10 // call_f_scalar_handler
11
// Lambda takes only the linear index i; the Gpu::Handler is ignored.
12 template <typename F, typename N>
14 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
15 noexcept -> decltype(f(0))
16 {
17 return f(i);
18 }
19
// Lambda takes (index, Gpu::Handler) -- used by reduction kernels.
20 template <typename F, typename N>
22 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
23 noexcept -> decltype(f(0,Gpu::Handler{}))
24 {
25 return f(i, handler);
26 }
27
28 // call_f_intvect_inner
29
// 1-D cell passed to a 3-index lambda f(i,j,k,...): j and k pad as 0.
30 template <typename F, std::size_t...Ns, class...Args>
32 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
33 noexcept -> decltype(f(0, 0, 0, args...))
34 {
35 return f(iv[0], 0, 0, args...);
36 }
37
// 2-D cell passed to a 3-index lambda f(i,j,k,...): k pads as 0.
38 template <typename F, std::size_t...Ns, class...Args>
40 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
41 noexcept -> decltype(f(0, 0, 0, args...))
42 {
43 return f(iv[0], iv[1], 0, args...);
44 }
45
// Lambda takes the whole IntVectND directly.
46 template <typename F, int dim, std::size_t...Ns, class...Args>
48 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
49 noexcept -> decltype(f(iv, args...))
50 {
51 return f(iv, args...);
52 }
53
// Lambda takes dim separate integer indices: expand iv[0..dim-1].
54 template <typename F, int dim, std::size_t...Ns, class...Args>
56 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
57 noexcept -> decltype(f(iv[Ns]..., args...))
58 {
59 return f(iv[Ns]..., args...);
60 }
61
62 // call_f_intvect
63
// Invoke f on one cell; index_sequence<dim> drives the expansion above.
64 template <typename F, int dim>
66 auto call_f_intvect (F const& f, IntVectND<dim> iv)
67 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
68 {
69 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
70 }
71
72 // call_f_intvect_engine
73
// Cell variant that forwards a per-thread RandomEngine to f.
74 template <typename F, int dim>
76 auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
77 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
78 {
79 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
80 }
81
82 // call_f_intvect_handler
83
// Lambda does not accept a Gpu::Handler: drop it.
84 template <typename F, int dim>
86 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
87 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
88 {
89 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
90 }
91
// Lambda accepts a Gpu::Handler: forward it after the cell indices.
92 template <typename F, int dim>
94 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
95 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
96 {
97 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
98 }
99
100 // call_f_intvect_ncomp
101
// Apply f to every component n in [0, ncomp) at cell iv. The decltype
// trailing return exists for SFINAE; the loop itself returns nothing.
102 template <typename F, typename T, int dim>
104 auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
105 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
106 {
107 for (T n = 0; n < ncomp; ++n) {
108 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
109 }
110 }
111
112 // call_f_intvect_ncomp_engine
113
// Component loop with a RandomEngine forwarded on every call.
114 template <typename F, typename T, int dim>
116 auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
117 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
118 {
119 for (T n = 0; n < ncomp; ++n) {
120 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
121 }
122 }
123
124 // call_f_intvect_ncomp_handler
125
// Component loop; lambda does not take a Gpu::Handler, so it is dropped.
126 template <typename F, typename T, int dim>
128 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
129 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
130 {
131 for (T n = 0; n < ncomp; ++n) {
132 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
133 }
134 }
135
// Component loop; lambda takes a Gpu::Handler as its last argument.
136 template <typename F, typename T, int dim>
138 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
139 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
140 {
141 for (T n = 0; n < ncomp; ++n) {
142 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
143 }
144 }
145
146}
148
149#ifdef AMREX_USE_SYCL
150
// Submit f as a one-work-item SYCL kernel on the given stream's queue.
// Big lambdas (detail::is_big_kernel) are invoked through pf obtained
// from SyclKernelDevPtr -- presumably a device-side staged copy of f;
// confirm against SyclKernelDevPtr's declaration. A sycl::exception
// from submission aborts the run with the failing call's name.
151template <typename L>
152void single_task (gpuStream_t stream, L const& f) noexcept
153{
154 detail::SyclKernelDevPtr<L> skdp(f, stream);
155 L const* pf = skdp.template get<0>();
157
158 auto& q = *(stream.queue);
159 try {
160 q.submit([&] (sycl::handler& h) {
161 if constexpr (detail::is_big_kernel<L>()) {
162 h.single_task(*pf);
163 } else {
164 h.single_task(f);
165 }
166 });
167 } catch (sycl::exception const& ex) {
168 amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
169 }
170}
171
// Launch f as a 1-D nd_range kernel with runtime-chosen block count,
// threads per block, and dynamic local (shared) memory.
// shared_mem_bytes is rounded UP to whole unsigned long longs for the
// local_accessor allocation. f is called with a Gpu::Handler holding
// the nd_item and the local-memory base pointer.
172template<typename L>
173void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
174 gpuStream_t stream, L const& f) noexcept
175{
176 detail::SyclKernelDevPtr<L> skdp(f, stream);
177 L const* pf = skdp.template get<0>();
179
// Global range = blocks * threads; SYCL takes total items, not blocks.
180 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
181 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
182 / sizeof(unsigned long long);
183 auto& q = *(stream.queue);
184 try {
185 q.submit([&] (sycl::handler& h) {
186 sycl::local_accessor<unsigned long long>
187 shared_data(sycl::range<1>(shared_mem_numull), h);
188 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
189 sycl::range<1>(nthreads_per_block)),
190 [=] (sycl::nd_item<1> item)
191 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
192 {
193 if constexpr (detail::is_big_kernel<L>()) {
194 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
195 } else {
196 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
197 }
198 });
199 });
200 } catch (sycl::exception const& ex) {
201 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
202 }
203}
204
// Launch f as a 1-D nd_range kernel with runtime block count and
// threads per block, no local memory. Unlike the shared_mem overload,
// f here receives the raw sycl::nd_item<1> rather than a Gpu::Handler.
205template<typename L>
206void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
207{
208 detail::SyclKernelDevPtr<L> skdp(f, stream);
209 L const* pf = skdp.template get<0>();
211
212 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
213 auto& q = *(stream.queue);
214 try {
215 q.submit([&] (sycl::handler& h) {
216 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
217 sycl::range<1>(nthreads_per_block)),
218 [=] (sycl::nd_item<1> item)
219 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
220 {
221 if constexpr (detail::is_big_kernel<L>()) {
222 (*pf)(item);
223 } else {
224 f(item);
225 }
226 });
227 });
228 } catch (sycl::exception const& ex) {
229 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
230 }
231}
232
// Launch with a compile-time work-group size MT (enforced via
// [[sycl::reqd_work_group_size(MT)]]) plus dynamic local memory.
// f receives a Gpu::Handler with the nd_item and local-memory pointer.
233template <int MT, typename L>
234void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
235 L const& f) noexcept
236{
237 detail::SyclKernelDevPtr<L> skdp(f, stream);
238 L const* pf = skdp.template get<0>();
240
241 const auto nthreads_total = MT * std::size_t(nblocks);
// Round byte count up to whole unsigned long longs for the accessor.
242 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
243 / sizeof(unsigned long long);
244 auto& q = *(stream.queue);
245 try {
246 q.submit([&] (sycl::handler& h) {
247 sycl::local_accessor<unsigned long long>
248 shared_data(sycl::range<1>(shared_mem_numull), h);
249 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
250 sycl::range<1>(MT)),
251 [=] (sycl::nd_item<1> item)
252 [[sycl::reqd_work_group_size(MT)]]
253 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
254 {
255 if constexpr (detail::is_big_kernel<L>()) {
256 (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
257 } else {
258 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
259 }
260 });
261 });
262 } catch (sycl::exception const& ex) {
263 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
264 }
265}
266
// Launch with compile-time work-group size MT and no local memory;
// f receives the raw sycl::nd_item<1>.
267template <int MT, typename L>
268void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
269{
270 detail::SyclKernelDevPtr<L> skdp(f, stream);
271 L const* pf = skdp.template get<0>();
273
274 const auto nthreads_total = MT * std::size_t(nblocks);
275 auto& q = *(stream.queue);
276 try {
277 q.submit([&] (sycl::handler& h) {
278 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
279 sycl::range<1>(MT)),
280 [=] (sycl::nd_item<1> item)
281 [[sycl::reqd_work_group_size(MT)]]
282 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
283 {
284 if constexpr (detail::is_big_kernel<L>()) {
285 (*pf)(item);
286 } else {
287 f(item);
288 }
289 });
290 });
291 } catch (sycl::exception const& ex) {
292 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
293 }
294}
295
// Launch f over the range described by n (size chosen by
// Gpu::makeExecutionConfig<MT>), iterating via Gpu::Range so each
// work-item strides through its share of indices. No-op for empty n.
296template<int MT, typename T, typename L>
297void launch (T const& n, L const& f) noexcept
298{
299 if (amrex::isEmpty(n)) { return; }
300
301 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
302 L const* pf = skdp.template get<0>();
304
305 const auto ec = Gpu::makeExecutionConfig<MT>(n);
306 const auto nthreads_per_block = ec.numThreads.x;
307 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
308 auto& q = Gpu::Device::streamQueue();
309 try {
310 q.submit([&] (sycl::handler& h) {
311 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
312 sycl::range<1>(nthreads_per_block)),
313 [=] (sycl::nd_item<1> item)
314 [[sycl::reqd_work_group_size(MT)]]
315 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
316 {
// Gpu::Range partitions [0,n) among global ids with the global stride.
317 for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
318 if constexpr (detail::is_big_kernel<L>()) {
319 (*pf)(i);
320 } else {
321 f(i);
322 }
323 }
324 });
325 });
326 } catch (sycl::exception const& ex) {
327 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
328 }
329}
330
// ParallelFor over the integer range [0, n), striding by the global
// range. Two submission paths: when info.hasReduction(), a warp_size-
// sized local buffer is allocated and the Gpu::Handler also carries
// n_active_threads (how many threads of this group still have work on
// the final iteration) so block reductions can mask inactive lanes.
// call_f_scalar_handler adapts to lambdas with or without a Handler.
331template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
332void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
333{
334 if (amrex::isEmpty(n)) { return; }
335
336 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
337 L const* pf = skdp.template get<0>();
339
340 const auto ec = Gpu::makeExecutionConfig<MT>(n);
341 const auto nthreads_per_block = ec.numThreads.x;
342 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
343 auto& q = Gpu::Device::streamQueue();
344 try {
345 if (info.hasReduction()) {
346 q.submit([&] (sycl::handler& h) {
347 sycl::local_accessor<unsigned long long>
348 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
349 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
350 sycl::range<1>(nthreads_per_block)),
351 [=] (sycl::nd_item<1> item)
352 [[sycl::reqd_work_group_size(MT)]]
353 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
354 {
355 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
356 i < std::size_t(n); i += stride) {
// Count of threads in this group that are inside [0,n) this iteration.
357 int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
358 item.get_local_range(0));
359 if constexpr (detail::is_big_kernel<L>()) {
360 detail::call_f_scalar_handler(*pf, T(i),
361 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
362 n_active_threads});
363 } else {
364 detail::call_f_scalar_handler(f, T(i),
365 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
366 n_active_threads});
367 }
368 }
369 });
370 });
371 } else {
372 q.submit([&] (sycl::handler& h) {
373 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
374 sycl::range<1>(nthreads_per_block)),
375 [=] (sycl::nd_item<1> item)
376 [[sycl::reqd_work_group_size(MT)]]
377 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
378 {
379 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
380 i < std::size_t(n); i += stride) {
381 if constexpr (detail::is_big_kernel<L>()) {
382 detail::call_f_scalar_handler(*pf, T(i), Gpu::Handler{&item});
383 } else {
384 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
385 }
386 }
387 });
388 });
389 }
390 } catch (sycl::exception const& ex) {
391 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
392 }
393}
394
// ParallelFor over every cell of a BoxND. Cells are enumerated by a
// flat std::uint64_t index and mapped back to an IntVectND by
// BoxIndexerND. Same two-path structure as the integer-range overload:
// reduction path adds local memory and n_active_threads to the Handler.
395template <int MT, typename L, int dim>
396void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
397{
398 if (amrex::isEmpty(box)) { return; }
399
400 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
401 L const* pf = skdp.template get<0>();
403
404 const BoxIndexerND<dim> indexer(box);
405 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
406 const auto nthreads_per_block = ec.numThreads.x;
407 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
408 auto& q = Gpu::Device::streamQueue();
409 try {
410 if (info.hasReduction()) {
411 q.submit([&] (sycl::handler& h) {
412 sycl::local_accessor<unsigned long long>
413 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
414 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
415 sycl::range<1>(nthreads_per_block)),
416 [=] (sycl::nd_item<1> item)
417 [[sycl::reqd_work_group_size(MT)]]
418 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
419 {
420 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
421 icell < indexer.numPts(); icell += stride) {
422 auto iv = indexer.intVect(icell);
// Threads of this group still holding a valid cell this iteration.
423 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
424 std::uint64_t(item.get_local_range(0)));
425 if constexpr (detail::is_big_kernel<L>()) {
426 detail::call_f_intvect_handler(*pf,
427 iv, Gpu::Handler{&item,
428 shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
429 n_active_threads});
430 } else {
431 detail::call_f_intvect_handler(f,
432 iv, Gpu::Handler{&item,
433 shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
434 n_active_threads});
435 }
436 }
437 });
438 });
439 } else {
440 q.submit([&] (sycl::handler& h) {
441 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
442 sycl::range<1>(nthreads_per_block)),
443 [=] (sycl::nd_item<1> item)
444 [[sycl::reqd_work_group_size(MT)]]
445 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
446 {
447 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
448 icell < indexer.numPts(); icell += stride) {
449 auto iv = indexer.intVect(icell);
450 if constexpr (detail::is_big_kernel<L>()) {
451 detail::call_f_intvect_handler(*pf,iv,Gpu::Handler{&item});
452 } else {
453 detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
454 }
455 }
456 });
457 });
458 }
459 } catch (sycl::exception const& ex) {
460 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
461 }
462}
463
// ParallelFor over every cell of a BoxND and every component in
// [0, ncomp): each work-item runs the full component loop for its cell
// (see detail::call_f_intvect_ncomp_handler). Reduction path as above.
464template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
465void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
466{
467 if (amrex::isEmpty(box)) { return; }
468
469 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
470 L const* pf = skdp.template get<0>();
472
473 const BoxIndexerND<dim> indexer(box);
474 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
475 const auto nthreads_per_block = ec.numThreads.x;
476 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
477 auto& q = Gpu::Device::streamQueue();
478 try {
479 if (info.hasReduction()) {
480 q.submit([&] (sycl::handler& h) {
481 sycl::local_accessor<unsigned long long>
482 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
483 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
484 sycl::range<1>(nthreads_per_block)),
485 [=] (sycl::nd_item<1> item)
486 [[sycl::reqd_work_group_size(MT)]]
487 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
488 {
489 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
490 icell < indexer.numPts(); icell += stride) {
491 auto iv = indexer.intVect(icell);
// Threads of this group still holding a valid cell this iteration.
492 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
493 std::uint64_t(item.get_local_range(0)));
494 if constexpr (detail::is_big_kernel<L>()) {
495 detail::call_f_intvect_ncomp_handler(*pf, iv, ncomp,
496 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
497 n_active_threads});
498 } else {
499 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
500 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
501 n_active_threads});
502 }
503 }
504 });
505 });
506 } else {
507 q.submit([&] (sycl::handler& h) {
508 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
509 sycl::range<1>(nthreads_per_block)),
510 [=] (sycl::nd_item<1> item)
511 [[sycl::reqd_work_group_size(MT)]]
512 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
513 {
514 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
515 icell < indexer.numPts(); icell += stride) {
516 auto iv = indexer.intVect(icell);
517 if constexpr (detail::is_big_kernel<L>()) {
518 detail::call_f_intvect_ncomp_handler(*pf,iv,ncomp,Gpu::Handler{&item});
519 } else {
520 detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
521 }
522 }
523 });
524 });
525 }
526 } catch (sycl::exception const& ex) {
527 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
528 }
529}
530
// ParallelFor over [0, n) where f also receives a RandomEngine. Each
// global thread id loads its engine state once, reuses it across the
// strided loop, and stores it back -- hence the launch is capped at
// maxBlocksPerLaunch so tids stay within the engine table, and the
// queue is drained (wait_and_throw) before returning.
// NOTE(review): the Abort message says "ParallelFor: " although this is
// ParallelForRNG -- consider matching the function name.
531template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
532void ParallelForRNG (T n, L const& f) noexcept
533{
534 if (amrex::isEmpty(n)) { return; }
535
536 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
537 L const* pf = skdp.template get<0>();
539
540 const auto ec = Gpu::ExecutionConfig(n);
541 const auto nthreads_per_block = ec.numThreads.x;
542 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
543 auto& q = Gpu::Device::streamQueue();
544 auto& engdescr = *(getRandEngineDescriptor());
545 try {
546 q.submit([&] (sycl::handler& h) {
547 auto engine_acc = engdescr.get_access(h);
548 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549 sycl::range<1>(nthreads_per_block)),
550 [=] (sycl::nd_item<1> item)
551 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
552 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
553 {
554 auto const tid = item.get_global_id(0);
555 auto engine = engine_acc.load(tid);
556 RandomEngine rand_eng{&engine};
557 for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
558 if constexpr (detail::is_big_kernel<L>()) {
559 (*pf)(T(i),rand_eng);
560 } else {
561 f(T(i),rand_eng);
562 }
563 }
// Persist the advanced engine state for the next RNG launch.
564 engine_acc.store(engine, tid);
565 });
566 });
567 q.wait_and_throw(); // because next launch might be on a different queue
568 } catch (sycl::exception const& ex) {
569 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
570 }
571}
572
// ParallelFor over a BoxND where f also receives a RandomEngine.
// Engine state is loaded/stored once per thread around the strided
// cell loop; block count is capped by maxBlocksPerLaunch so tids fit
// the engine table, and the queue is drained before returning.
// NOTE(review): Abort message says "ParallelFor: " for ParallelForRNG.
573template <typename L, int dim>
574void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
575{
576 if (amrex::isEmpty(box)) { return; }
577
578 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
579 L const* pf = skdp.template get<0>();
581
582 const BoxIndexerND<dim> indexer(box);
583 const auto ec = Gpu::ExecutionConfig(box.numPts());
584 const auto nthreads_per_block = ec.numThreads.x;
585 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
586 auto& q = Gpu::Device::streamQueue();
587 auto& engdescr = *(getRandEngineDescriptor());
588 try {
589 q.submit([&] (sycl::handler& h) {
590 auto engine_acc = engdescr.get_access(h);
591 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
592 sycl::range<1>(nthreads_per_block)),
593 [=] (sycl::nd_item<1> item)
594 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
595 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
596 {
597 auto const tid = item.get_global_id(0);
598 auto engine = engine_acc.load(tid);
599 RandomEngine rand_eng{&engine};
600 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
601 icell < indexer.numPts(); icell += stride) {
602 auto iv = indexer.intVect(icell);
603 if constexpr (detail::is_big_kernel<L>()) {
604 detail::call_f_intvect_engine(*pf,iv,rand_eng);
605 } else {
606 detail::call_f_intvect_engine(f,iv,rand_eng);
607 }
608 }
// Persist the advanced engine state for the next RNG launch.
609 engine_acc.store(engine, tid);
610 });
611 });
612 q.wait_and_throw(); // because next launch might be on a different queue
613 } catch (sycl::exception const& ex) {
614 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
615 }
616}
617
// ParallelFor over a BoxND and ncomp components with a RandomEngine.
// Same engine load/loop/store pattern and maxBlocksPerLaunch cap as
// the other ParallelForRNG overloads; queue drained before return.
// NOTE(review): Abort message says "ParallelFor: " for ParallelForRNG.
618template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
619void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
620{
621 if (amrex::isEmpty(box)) { return; }
622
623 detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
624 L const* pf = skdp.template get<0>();
626
627 const BoxIndexerND<dim> indexer(box);
628 const auto ec = Gpu::ExecutionConfig(box.numPts());
629 const auto nthreads_per_block = ec.numThreads.x;
630 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
631 auto& q = Gpu::Device::streamQueue();
632 auto& engdescr = *(getRandEngineDescriptor());
633 try {
634 q.submit([&] (sycl::handler& h) {
635 auto engine_acc = engdescr.get_access(h);
636 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
637 sycl::range<1>(nthreads_per_block)),
638 [=] (sycl::nd_item<1> item)
639 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
640 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
641 {
642 auto const tid = item.get_global_id(0);
643 auto engine = engine_acc.load(tid);
644 RandomEngine rand_eng{&engine};
645 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
646 icell < indexer.numPts(); icell += stride) {
647 auto iv = indexer.intVect(icell);
648 if constexpr (detail::is_big_kernel<L>()) {
649 detail::call_f_intvect_ncomp_engine(*pf,iv,ncomp,rand_eng);
650 } else {
651 detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
652 }
653 }
// Persist the advanced engine state for the next RNG launch.
654 engine_acc.store(engine, tid);
655 });
656 });
657 q.wait_and_throw(); // because next launch might be on a different queue
658 } catch (sycl::exception const& ex) {
659 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
660 }
661}
662
// Fused ParallelFor over two boxes in one kernel launch: the grid is
// sized for the larger box; each cell index is applied to whichever
// boxes still contain it. KernelInfo is accepted but unused here
// (no reduction path for the fused variants).
663template <int MT, typename L1, typename L2, int dim>
664void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1 const& f1, L2 const& f2) noexcept
665{
666 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
667
668 detail::SyclKernelDevPtr<L1,L2> skdp(f1, f2, Gpu::gpuStream());
669 L1 const* pf1 = skdp.template get<0>();
670 L2 const* pf2 = skdp.template get<1>();
671 amrex::ignore_unused(pf1,pf2);
672
673 const BoxIndexerND<dim> indexer1(box1);
674 const BoxIndexerND<dim> indexer2(box2);
675 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
676 const auto nthreads_per_block = ec.numThreads.x;
677 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
678 auto& q = Gpu::Device::streamQueue();
679 try {
680 q.submit([&] (sycl::handler& h) {
681 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
682 sycl::range<1>(nthreads_per_block)),
683 [=] (sycl::nd_item<1> item)
684 [[sycl::reqd_work_group_size(MT)]]
685 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
686 {
687 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
688 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
689 icell < ncells; icell += stride) {
690 if (icell < indexer1.numPts()) {
691 auto iv = indexer1.intVect(icell);
692 if constexpr (detail::is_big_kernel<L1,L2>()) {
693 detail::call_f_intvect(*pf1,iv);
694 } else {
695 detail::call_f_intvect(f1,iv);
696 }
697 }
698 if (icell < indexer2.numPts()) {
699 auto iv = indexer2.intVect(icell);
700 if constexpr (detail::is_big_kernel<L1,L2>()) {
701 detail::call_f_intvect(*pf2,iv);
702 } else {
703 detail::call_f_intvect(f2,iv);
704 }
705 }
706 }
707 });
708 });
709 } catch (sycl::exception const& ex) {
710 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
711 }
712}
713
// Fused ParallelFor over three boxes in one launch; grid sized for the
// largest box, with per-box bounds guards. KernelInfo accepted but
// unused (no reduction path for fused variants).
714template <int MT, typename L1, typename L2, typename L3, int dim>
715void ParallelFor (Gpu::KernelInfo const& /*info*/,
716 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
717 L1 const& f1, L2 const& f2, L3 const& f3) noexcept
718{
719 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
720
721 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3, Gpu::gpuStream());
722 L1 const* pf1 = skdp.template get<0>();
723 L2 const* pf2 = skdp.template get<1>();
724 L3 const* pf3 = skdp.template get<2>();
725 amrex::ignore_unused(pf1,pf2,pf3);
726
727 const BoxIndexerND<dim> indexer1(box1);
728 const BoxIndexerND<dim> indexer2(box2);
729 const BoxIndexerND<dim> indexer3(box3);
730 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
731 const auto nthreads_per_block = ec.numThreads.x;
732 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
733 auto& q = Gpu::Device::streamQueue();
734 try {
735 q.submit([&] (sycl::handler& h) {
736 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
737 sycl::range<1>(nthreads_per_block)),
738 [=] (sycl::nd_item<1> item)
739 [[sycl::reqd_work_group_size(MT)]]
740 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
741 {
742 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
743 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
744 icell < ncells; icell += stride) {
745 if (icell < indexer1.numPts()) {
746 auto iv = indexer1.intVect(icell);
747 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
748 detail::call_f_intvect(*pf1,iv);
749 } else {
750 detail::call_f_intvect(f1,iv);
751 }
752 }
753 if (icell < indexer2.numPts()) {
754 auto iv = indexer2.intVect(icell);
755 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
756 detail::call_f_intvect(*pf2,iv);
757 } else {
758 detail::call_f_intvect(f2,iv);
759 }
760 }
761 if (icell < indexer3.numPts()) {
762 auto iv = indexer3.intVect(icell);
763 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
764 detail::call_f_intvect(*pf3,iv);
765 } else {
766 detail::call_f_intvect(f3,iv);
767 }
768 }
769 }
770 });
771 });
772 } catch (sycl::exception const& ex) {
773 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
774 }
775}
776
// Fused ParallelFor over two boxes with independent component counts
// (ncomp1 for f1, ncomp2 for f2) in one launch. Grid sized for the
// larger box; each guarded cell runs the full component loop.
777template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
778 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
779 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
780void ParallelFor (Gpu::KernelInfo const& /*info*/,
781 BoxND<dim> const& box1, T1 ncomp1, L1 const& f1,
782 BoxND<dim> const& box2, T2 ncomp2, L2 const& f2) noexcept
783{
784 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
785
786 detail::SyclKernelDevPtr<L1,L2> skdp(f1, f2, Gpu::gpuStream());
787 L1 const* pf1 = skdp.template get<0>();
788 L2 const* pf2 = skdp.template get<1>();
789 amrex::ignore_unused(pf1,pf2);
790
791 const BoxIndexerND<dim> indexer1(box1);
792 const BoxIndexerND<dim> indexer2(box2);
793 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
794 const auto nthreads_per_block = ec.numThreads.x;
795 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
796 auto& q = Gpu::Device::streamQueue();
797 try {
798 q.submit([&] (sycl::handler& h) {
799 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
800 sycl::range<1>(nthreads_per_block)),
801 [=] (sycl::nd_item<1> item)
802 [[sycl::reqd_work_group_size(MT)]]
803 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
804 {
805 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
806 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
807 icell < ncells; icell += stride) {
808 if (icell < indexer1.numPts()) {
809 auto iv = indexer1.intVect(icell);
810 if constexpr (detail::is_big_kernel<L1,L2>()) {
811 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
812 } else {
813 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
814 }
815 }
816 if (icell < indexer2.numPts()) {
817 auto iv = indexer2.intVect(icell);
818 if constexpr (detail::is_big_kernel<L1,L2>()) {
819 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
820 } else {
821 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
822 }
823 }
824 }
825 });
826 });
827 } catch (sycl::exception const& ex) {
828 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
829 }
830}
831
// Fused ParallelFor over three boxes, each with its own component
// count, in a single launch. Grid sized for the largest box; per-box
// bounds guards select which lambdas run for a given flat cell index.
832template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
833 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
834 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
835 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
836void ParallelFor (Gpu::KernelInfo const& /*info*/,
837 BoxND<dim> const& box1, T1 ncomp1, L1 const& f1,
838 BoxND<dim> const& box2, T2 ncomp2, L2 const& f2,
839 BoxND<dim> const& box3, T3 ncomp3, L3 const& f3) noexcept
840{
841 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
842
843 detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3, Gpu::gpuStream());
844 L1 const* pf1 = skdp.template get<0>();
845 L2 const* pf2 = skdp.template get<1>();
846 L3 const* pf3 = skdp.template get<2>();
847 amrex::ignore_unused(pf1,pf2,pf3);
848
849 const BoxIndexerND<dim> indexer1(box1);
850 const BoxIndexerND<dim> indexer2(box2);
851 const BoxIndexerND<dim> indexer3(box3);
852 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
853 const auto nthreads_per_block = ec.numThreads.x;
854 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
855 auto& q = Gpu::Device::streamQueue();
856 try {
857 q.submit([&] (sycl::handler& h) {
858 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
859 sycl::range<1>(nthreads_per_block)),
860 [=] (sycl::nd_item<1> item)
861 [[sycl::reqd_work_group_size(MT)]]
862 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
863 {
864 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
865 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
866 icell < ncells; icell += stride) {
867 if (icell < indexer1.numPts()) {
868 auto iv = indexer1.intVect(icell);
869 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
870 detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);
871 } else {
872 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
873 }
874 }
875 if (icell < indexer2.numPts()) {
876 auto iv = indexer2.intVect(icell);
877 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
878 detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);
879 } else {
880 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
881 }
882 }
883 if (icell < indexer3.numPts()) {
884 auto iv = indexer3.intVect(icell);
885 if constexpr (detail::is_big_kernel<L1,L2,L3>()) {
886 detail::call_f_intvect_ncomp(*pf3,iv,ncomp3);
887 } else {
888 detail::call_f_intvect_ncomp(f3,iv,ncomp3);
889 }
890 }
891 }
892 });
893 });
894 } catch (sycl::exception const& ex) {
895 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
896 }
897}
898
899#else
900// CUDA or HIP
901
902template <typename L>
903void single_task (gpuStream_t stream, L const& f) noexcept
904{
905 AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, f);
907}
908
909template <int MT, typename L>
910void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
911 L const& f) noexcept
912{
913 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, f);
915}
916
917template <int MT, typename L>
918void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
919{
920 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, f);
922}
923
924template<typename L>
925void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
926 gpuStream_t stream, L const& f) noexcept
927{
928 AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes, stream, f);
930}
931
932template<typename L>
933void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
934{
935 launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
936}
937
938template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
939void launch (T const& n, L const& f) noexcept
940{
941 static_assert(sizeof(T) >= 2);
942 if (amrex::isEmpty(n)) { return; }
943 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
944 for (auto const& ec : nec) {
945 const T start_idx = T(ec.start_idx);
946 const T nleft = n - start_idx;
947 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
948 [=] AMREX_GPU_DEVICE () noexcept {
949 // This will not overflow, even though nblocks*MT might.
950 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
951 if (tid < nleft) {
952 f(tid+start_idx);
953 }
954 });
955 }
957}
958
959template<int MT, int dim, typename L>
960void launch (BoxND<dim> const& box, L const& f) noexcept
961{
962 if (box.isEmpty()) { return; }
963 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
964 const BoxIndexerND<dim> indexer(box);
965 const auto type = box.ixType();
966 for (auto const& ec : nec) {
967 const auto start_idx = std::uint64_t(ec.start_idx);
968 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
969 [=] AMREX_GPU_DEVICE () noexcept {
970 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
971 if (icell < indexer.numPts()) {
972 auto iv = indexer.intVect(icell);
973 f(BoxND<dim>(iv,iv,type));
974 }
975 });
976 }
978}
979
984template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
985std::enable_if_t<MaybeDeviceRunnable<L>::value>
986ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
987{
988 static_assert(sizeof(T) >= 2);
989 if (amrex::isEmpty(n)) { return; }
990 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
991 for (auto const& ec : nec) {
992 const T start_idx = T(ec.start_idx);
993 const T nleft = n - start_idx;
994 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
995 [=] AMREX_GPU_DEVICE () noexcept {
996 // This will not overflow, even though nblocks*MT might.
997 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
998 if (tid < nleft) {
999 detail::call_f_scalar_handler(f, tid+start_idx,
1000 Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
1001 (std::uint64_t)MT)));
1002 }
1003 });
1004 }
1006}
1007
1012template <int MT, typename L, int dim>
1013std::enable_if_t<MaybeDeviceRunnable<L>::value>
1014ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
1015{
1016 if (amrex::isEmpty(box)) { return; }
1017 const BoxIndexerND<dim> indexer(box);
1018 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1019 for (auto const& ec : nec) {
1020 const auto start_idx = std::uint64_t(ec.start_idx);
1021 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
1022 [=] AMREX_GPU_DEVICE () noexcept {
1023 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1024 if (icell < indexer.numPts()) {
1025 auto iv = indexer.intVect(icell);
1026 detail::call_f_intvect_handler(f, iv,
1027 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
1028 (std::uint64_t)MT)));
1029 }
1030 });
1031 }
1033}
1034
1039template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1040std::enable_if_t<MaybeDeviceRunnable<L>::value>
1041ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
1042{
1043 if (amrex::isEmpty(box)) { return; }
1044 const BoxIndexerND<dim> indexer(box);
1045 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
1046 for (auto const& ec : nec) {
1047 const auto start_idx = std::uint64_t(ec.start_idx);
1048 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
1049 [=] AMREX_GPU_DEVICE () noexcept {
1050 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
1051 if (icell < indexer.numPts()) {
1052 auto iv = indexer.intVect(icell);
1053 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
1054 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
1055 (std::uint64_t)MT)));
1056 }
1057 });
1058 }
1060}
1061
1067template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1068std::enable_if_t<MaybeDeviceRunnable<L>::value>
1069ParallelForRNG (T n, L const& f) noexcept
1070{
1071 if (amrex::isEmpty(n)) { return; }
1072 randState_t* rand_state = getRandState();
1073 const auto ec = Gpu::ExecutionConfig(n);
1074 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
1076 ec.numThreads, 0, Gpu::gpuStream(),
1077 [=] AMREX_GPU_DEVICE () noexcept {
1078 Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1079 RandomEngine engine{&(rand_state[tid])};
1080 for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
1081 f(T(i),engine);
1082 }
1083 });
1084 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
1086}
1087
1093template <typename L, int dim>
1094std::enable_if_t<MaybeDeviceRunnable<L>::value>
1095ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
1096{
1097 if (amrex::isEmpty(box)) { return; }
1098 randState_t* rand_state = getRandState();
1099 const BoxIndexerND<dim> indexer(box);
1100 const auto ec = Gpu::ExecutionConfig(box.numPts());
1101 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
1103 ec.numThreads, 0, Gpu::gpuStream(),
1104 [=] AMREX_GPU_DEVICE () noexcept {
1105 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1106 RandomEngine engine{&(rand_state[tid])};
1107 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
1108 auto iv = indexer.intVect(icell);
1109 detail::call_f_intvect_engine(f, iv, engine);
1110 }
1111 });
1112 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
1114}
1115
1121template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1122std::enable_if_t<MaybeDeviceRunnable<L>::value>
1123ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
1124{
1125 if (amrex::isEmpty(box)) { return; }
1126 randState_t* rand_state = getRandState();
1127 const BoxIndexerND<dim> indexer(box);
1128 const auto ec = Gpu::ExecutionConfig(box.numPts());
1129 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
1131 ec.numThreads, 0, Gpu::gpuStream(),
1132 [=] AMREX_GPU_DEVICE () noexcept {
1133 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
1134 RandomEngine engine{&(rand_state[tid])};
1135 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
1136 auto iv = indexer.intVect(icell);
1137 detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
1138 }
1139 });
1140 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
1142}
1143
1148template <int MT, typename L1, typename L2, int dim>
1149std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1151 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1152{
1153 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
1154 const BoxIndexerND<dim> indexer1(box1);
1155 const BoxIndexerND<dim> indexer2(box2);
1156 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1157 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1158 [=] AMREX_GPU_DEVICE () noexcept {
1159 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1160 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1161 icell < ncells; icell += stride) {
1162 if (icell < indexer1.numPts()) {
1163 auto iv = indexer1.intVect(icell);
1164 detail::call_f_intvect(f1, iv);
1165 }
1166 if (icell < indexer2.numPts()) {
1167 auto iv = indexer2.intVect(icell);
1168 detail::call_f_intvect(f2, iv);
1169 }
1170 }
1171 });
1173}
1174
1179template <int MT, typename L1, typename L2, typename L3, int dim>
1180std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1182 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1183 L1&& f1, L2&& f2, L3&& f3) noexcept
1184{
1185 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1186 const BoxIndexerND<dim> indexer1(box1);
1187 const BoxIndexerND<dim> indexer2(box2);
1188 const BoxIndexerND<dim> indexer3(box3);
1189 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1190 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1191 [=] AMREX_GPU_DEVICE () noexcept {
1192 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1193 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1194 icell < ncells; icell += stride) {
1195 if (icell < indexer1.numPts()) {
1196 auto iv = indexer1.intVect(icell);
1197 detail::call_f_intvect(f1, iv);
1198 }
1199 if (icell < indexer2.numPts()) {
1200 auto iv = indexer2.intVect(icell);
1201 detail::call_f_intvect(f2, iv);
1202 }
1203 if (icell < indexer3.numPts()) {
1204 auto iv = indexer3.intVect(icell);
1205 detail::call_f_intvect(f3, iv);
1206 }
1207 }
1208 });
1210}
1211
1216template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1217 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1218 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1219std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1221 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1222 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1223{
1224 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
1225 const BoxIndexerND<dim> indexer1(box1);
1226 const BoxIndexerND<dim> indexer2(box2);
1227 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1228 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1229 [=] AMREX_GPU_DEVICE () noexcept {
1230 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1231 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1232 icell < ncells; icell += stride) {
1233 if (icell < indexer1.numPts()) {
1234 auto iv = indexer1.intVect(icell);
1235 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1236 }
1237 if (icell < indexer2.numPts()) {
1238 auto iv = indexer2.intVect(icell);
1239 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1240 }
1241 }
1242 });
1244}
1245
1250template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1251 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1252 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1253 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1254std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1256 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1257 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1258 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1259{
1260 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1261 const BoxIndexerND<dim> indexer1(box1);
1262 const BoxIndexerND<dim> indexer2(box2);
1263 const BoxIndexerND<dim> indexer3(box3);
1264 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1265 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1266 [=] AMREX_GPU_DEVICE () noexcept {
1267 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1268 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1269 icell < ncells; icell += stride) {
1270 if (icell < indexer1.numPts()) {
1271 auto iv = indexer1.intVect(icell);
1272 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1273 }
1274 if (icell < indexer2.numPts()) {
1275 auto iv = indexer2.intVect(icell);
1276 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1277 }
1278 if (icell < indexer3.numPts()) {
1279 auto iv = indexer3.intVect(icell);
1280 detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1281 }
1282 }
1283 });
1285}
1286
1287#endif
1288
1289template <typename L>
1290void single_task (L&& f) noexcept
1291{
1292 single_task(Gpu::gpuStream(), std::forward<L>(f));
1293}
1294
1295template<typename T, typename L>
1296void launch (T const& n, L&& f) noexcept
1297{
1298 launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1299}
1300
1305template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1306std::enable_if_t<MaybeDeviceRunnable<L>::value>
1307ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1308{
1309 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1310}
1311
1316template <typename L, int dim>
1317std::enable_if_t<MaybeDeviceRunnable<L>::value>
1318ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1319{
1320 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1321}
1322
1327template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1328std::enable_if_t<MaybeDeviceRunnable<L>::value>
1329ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1330{
1331 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1332}
1333
1338template <typename L1, typename L2, int dim>
1339std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1341 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1342{
1343 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1344 std::forward<L2>(f2));
1345}
1346
1351template <typename L1, typename L2, typename L3, int dim>
1352std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1354 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1355 L1&& f1, L2&& f2, L3&& f3) noexcept
1356{
1357 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1358 std::forward<L2>(f2), std::forward<L3>(f3));
1359}
1360
1365template <typename T1, typename T2, typename L1, typename L2, int dim,
1366 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1367 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1368std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1370 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1371 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1372{
1373 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1374 box2, ncomp2, std::forward<L2>(f2));
1375}
1376
1381template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1382 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1383 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1384 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1385std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1387 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1388 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1389 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1390{
1391 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1392 box2, ncomp2, std::forward<L2>(f2),
1393 box3, ncomp3, std::forward<L3>(f3));
1394}
1395
1396template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1397void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1398{
1399 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1400}
1401
1402template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1403void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1404{
1405 ParallelFor<MT>(info, n,std::forward<L>(f));
1406}
1407
1408template <typename L, int dim>
1409void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1410{
1411 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1412}
1413
1414template <int MT, typename L, int dim>
1415void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1416{
1417 ParallelFor<MT>(info, box,std::forward<L>(f));
1418}
1419
1420template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1421void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1422{
1423 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1424}
1425
1426template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1427void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1428{
1429 ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1430}
1431
1432template <typename L1, typename L2, int dim>
1433void For (Gpu::KernelInfo const& info,
1434 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1435{
1436 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1437}
1438
1439template <int MT, typename L1, typename L2, int dim>
1440void For (Gpu::KernelInfo const& info,
1441 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1442{
1443 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1444}
1445
1446template <typename L1, typename L2, typename L3, int dim>
1447void For (Gpu::KernelInfo const& info,
1448 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1449 L1&& f1, L2&& f2, L3&& f3) noexcept
1450{
1451 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1452}
1453
1454template <int MT, typename L1, typename L2, typename L3, int dim>
1455void For (Gpu::KernelInfo const& info,
1456 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1457 L1&& f1, L2&& f2, L3&& f3) noexcept
1458{
1459 ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1460}
1461
1462template <typename T1, typename T2, typename L1, typename L2, int dim,
1463 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1464 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1465void For (Gpu::KernelInfo const& info,
1466 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1467 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1468{
1469 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1470}
1471
1472template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1473 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1474 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1475void For (Gpu::KernelInfo const& info,
1476 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1477 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1478{
1479 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1480}
1481
1482template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1483 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1484 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1485 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1486void For (Gpu::KernelInfo const& info,
1487 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1488 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1489 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1490{
1491 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1492 box1,ncomp1,std::forward<L1>(f1),
1493 box2,ncomp2,std::forward<L2>(f2),
1494 box3,ncomp3,std::forward<L3>(f3));
1495}
1496
1497template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1498 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1499 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1500 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1501void For (Gpu::KernelInfo const& info,
1502 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1503 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1504 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1505{
1506 ParallelFor<MT>(info,
1507 box1,ncomp1,std::forward<L1>(f1),
1508 box2,ncomp2,std::forward<L2>(f2),
1509 box3,ncomp3,std::forward<L3>(f3));
1510}
1511
1516template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1517void ParallelFor (T n, L&& f) noexcept
1518{
1519 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1520}
1521
1526template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1527void ParallelFor (T n, L&& f) noexcept
1528{
1529 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1530}
1531
1536template <typename L, int dim>
1537void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1538{
1539 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1540}
1541
1546template <int MT, typename L, int dim>
1547void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1548{
1549 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1550}
1551
1556template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1557void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1558{
1559 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1560}
1561
1566template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1567void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1568{
1569 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1570}
1571
1576template <typename L1, typename L2, int dim>
1577void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1578{
1579 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1580}
1581
1586template <int MT, typename L1, typename L2, int dim>
1587void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1588{
1589 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1590}
1591
1596template <typename L1, typename L2, typename L3, int dim>
1597void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1598 L1&& f1, L2&& f2, L3&& f3) noexcept
1599{
1600 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1601}
1602
1607template <int MT, typename L1, typename L2, typename L3, int dim>
1608void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1609 L1&& f1, L2&& f2, L3&& f3) noexcept
1610{
1611 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1612}
1613
1618template <typename T1, typename T2, typename L1, typename L2, int dim,
1619 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1620 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1621void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1622 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1623{
1624 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1625}
1626
1631template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1632 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1633 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1634void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1635 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1636{
1637 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1638}
1639
1644template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1645 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1646 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1647 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1648void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1649 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1650 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1651{
1652 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1653 box1,ncomp1,std::forward<L1>(f1),
1654 box2,ncomp2,std::forward<L2>(f2),
1655 box3,ncomp3,std::forward<L3>(f3));
1656}
1657
1662template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1663 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1664 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1665 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1666void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1667 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1668 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1669{
1670 ParallelFor<MT>(Gpu::KernelInfo{},
1671 box1,ncomp1,std::forward<L1>(f1),
1672 box2,ncomp2,std::forward<L2>(f2),
1673 box3,ncomp3,std::forward<L3>(f3));
1674}
1675
1676template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1677void For (T n, L&& f) noexcept
1678{
1679 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1680}
1681
1682template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1683void For (T n, L&& f) noexcept
1684{
1685 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1686}
1687
1688template <typename L, int dim>
1689void For (BoxND<dim> const& box, L&& f) noexcept
1690{
1691 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1692}
1693
1694template <int MT, typename L, int dim>
1695void For (BoxND<dim> const& box, L&& f) noexcept
1696{
1697 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1698}
1699
1700template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1701void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1702{
1703 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1704}
1705
1706template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1707void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1708{
1709 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1710}
1711
1712template <typename L1, typename L2, int dim>
1713void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1714{
1715 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1716}
1717
1718template <int MT, typename L1, typename L2, int dim>
1719void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1720{
1721 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1722}
1723
1724template <typename L1, typename L2, typename L3, int dim>
1725void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1726 L1&& f1, L2&& f2, L3&& f3) noexcept
1727{
1728 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1729}
1730
1731template <int MT, typename L1, typename L2, typename L3, int dim>
1732void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1733 L1&& f1, L2&& f2, L3&& f3) noexcept
1734{
1735 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1736}
1737
1738template <typename T1, typename T2, typename L1, typename L2, int dim,
1739 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1740 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1741void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1742 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1743{
1744 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1745}
1746
1747template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1748 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1749 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1750void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1751 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1752{
1753 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1754}
1755
1756template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1757 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1758 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1759 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1760void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1761 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1762 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1763{
1764 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1765 box1,ncomp1,std::forward<L1>(f1),
1766 box2,ncomp2,std::forward<L2>(f2),
1767 box3,ncomp3,std::forward<L3>(f3));
1768}
1769
1770template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1771 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1772 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1773 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1774void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1775 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1776 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1777{
1778 ParallelFor<MT>(Gpu::KernelInfo{},
1779 box1,ncomp1,std::forward<L1>(f1),
1780 box2,ncomp2,std::forward<L2>(f2),
1781 box3,ncomp3,std::forward<L3>(f3));
1782}
1783
1784template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1785std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1786HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1787{
1788 if (Gpu::inLaunchRegion()) {
1789 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1790 } else {
1791#ifdef AMREX_USE_SYCL
1792 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1793#else
1795 for (T i = 0; i < n; ++i) { f(i); }
1796#endif
1797 }
1798}
1799
1800template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1801std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1802HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1803{
1804 if (Gpu::inLaunchRegion()) {
1805 ParallelFor<MT>(info,n,std::forward<L>(f));
1806 } else {
1807#ifdef AMREX_USE_SYCL
1808 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1809#else
1811 for (T i = 0; i < n; ++i) { f(i); }
1812#endif
1813 }
1814}
1815
1816template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1817std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1818HostDeviceParallelFor (T n, L&& f) noexcept
1819{
1820 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1821}
1822
1823template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1824std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1825HostDeviceParallelFor (T n, L&& f) noexcept
1826{
1827 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1828}
1829
1830template <typename L, int dim>
1831std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1832HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1833{
1834 if (Gpu::inLaunchRegion()) {
1835 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1836 } else {
1837#ifdef AMREX_USE_SYCL
1838 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1839#else
1840 LoopConcurrentOnCpu(box,std::forward<L>(f));
1841#endif
1842 }
1843}
1844
1845template <int MT, typename L, int dim>
1846std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1847HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1848{
1849 if (Gpu::inLaunchRegion()) {
1850 ParallelFor<MT>(info, box,std::forward<L>(f));
1851 } else {
1852#ifdef AMREX_USE_SYCL
1853 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1854#else
1855 LoopConcurrentOnCpu(box,std::forward<L>(f));
1856#endif
1857 }
1858}
1859
1860template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1861std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1862HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1863{
1864 if (Gpu::inLaunchRegion()) {
1865 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1866 } else {
1867#ifdef AMREX_USE_SYCL
1868 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1869#else
1870 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1871#endif
1872 }
1873}
1874
1875template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1876std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1877HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1878{
1879 if (Gpu::inLaunchRegion()) {
1880 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1881 } else {
1882#ifdef AMREX_USE_SYCL
1883 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1884#else
1885 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1886#endif
1887 }
1888}
1889
1890template <typename L1, typename L2, int dim>
1891std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1893 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1894{
1895 if (Gpu::inLaunchRegion()) {
1896 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1897 } else {
1898#ifdef AMREX_USE_SYCL
1899 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1900#else
1901 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1902 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1903#endif
1904 }
1905}
1906
1907template <int MT, typename L1, typename L2, int dim>
1908std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1910 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1911{
1912 if (Gpu::inLaunchRegion()) {
1913 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1914 } else {
1915#ifdef AMREX_USE_SYCL
1916 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1917#else
1918 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1919 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1920#endif
1921 }
1922}
1923
1924template <int MT, typename L1, typename L2, typename L3, int dim>
1925std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1927 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1928 L1&& f1, L2&& f2, L3&& f3) noexcept
1929{
1930 if (Gpu::inLaunchRegion()) {
1931 ParallelFor<MT>(info,box1,box2,box3,
1932 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1933 } else {
1934#ifdef AMREX_USE_SYCL
1935 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1936#else
1937 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1938 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1939 LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1940#endif
1941 }
1942}
1943
1944template <typename T1, typename T2, typename L1, typename L2, int dim,
1945 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1946 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1947std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1949 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1950 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1951{
1952 if (Gpu::inLaunchRegion()) {
1953 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1954 } else {
1955#ifdef AMREX_USE_SYCL
1956 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1957#else
1958 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1959 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1960#endif
1961 }
1962}
1963
1964template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1965 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1966 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1967std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1969 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1970 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1971{
1972 if (Gpu::inLaunchRegion()) {
1973 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1974 } else {
1975#ifdef AMREX_USE_SYCL
1976 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1977#else
1978 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1979 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1980#endif
1981 }
1982}
1983
1984template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1985 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1986 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1987 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1988std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1990 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1991 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1992 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1993{
1994 if (Gpu::inLaunchRegion()) {
1995 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1996 box1,ncomp1,std::forward<L1>(f1),
1997 box2,ncomp2,std::forward<L2>(f2),
1998 box3,ncomp3,std::forward<L3>(f3));
1999 } else {
2000#ifdef AMREX_USE_SYCL
2001 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
2002#else
2003 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
2004 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
2005 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
2006#endif
2007 }
2008}
2009
2010template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2011 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2012 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2013 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2014std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
2016 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2017 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2018 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2019{
2020 if (Gpu::inLaunchRegion()) {
2021 ParallelFor<MT>(info,
2022 box1,ncomp1,std::forward<L1>(f1),
2023 box2,ncomp2,std::forward<L2>(f2),
2024 box3,ncomp3,std::forward<L3>(f3));
2025 } else {
2026#ifdef AMREX_USE_SYCL
2027 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
2028#else
2029 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
2030 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
2031 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
2032#endif
2033 }
2034}
2035
2036template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
2037void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
2038{
2039 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
2040}
2041
2042template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
2043void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
2044{
2045 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
2046}
2047
2048template <typename L, int dim>
2049void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
2050{
2051 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
2052}
2053
2054template <int MT, typename L, int dim>
2055void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
2056{
2057 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
2058}
2059
2060template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
2061void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
2062{
2063 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
2064}
2065
2066template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
2067void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
2068{
2069 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
2070}
2071
2072template <typename L1, typename L2, int dim>
2073void HostDeviceFor (Gpu::KernelInfo const& info,
2074 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2075{
2076 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2077}
2078
2079template <int MT, typename L1, typename L2, int dim>
2080void HostDeviceFor (Gpu::KernelInfo const& info,
2081 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2082{
2083 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2084}
2085
2086template <typename L1, typename L2, typename L3, int dim>
2087void HostDeviceFor (Gpu::KernelInfo const& info,
2088 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2089 L1&& f1, L2&& f2, L3&& f3) noexcept
2090{
2091 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
2092 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2093}
2094
2095template <int MT, typename L1, typename L2, typename L3, int dim>
2096void HostDeviceFor (Gpu::KernelInfo const& info,
2097 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2098 L1&& f1, L2&& f2, L3&& f3) noexcept
2099{
2100 HostDeviceParallelFor<MT>(info, box1,box2,box3,
2101 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2102}
2103
2104template <typename T1, typename T2, typename L1, typename L2, int dim,
2105 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2106 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2107void HostDeviceFor (Gpu::KernelInfo const& info,
2108 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2109 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2110{
2111 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2112}
2113
2114template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
2115 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2116 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2117void HostDeviceFor (Gpu::KernelInfo const& info,
2118 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2119 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2120{
2121 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2122}
2123
2124template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2125 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2126 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2127 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2128void HostDeviceFor (Gpu::KernelInfo const& info,
2129 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2130 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2131 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2132{
2133 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
2134 box1,ncomp1,std::forward<L1>(f1),
2135 box2,ncomp2,std::forward<L2>(f2),
2136 box3,ncomp3,std::forward<L3>(f3));
2137}
2138
2139template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2140 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2141 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2142 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2143void HostDeviceFor (Gpu::KernelInfo const& info,
2144 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2145 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2146 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2147{
2148 HostDeviceParallelFor<MT>(info,
2149 box1,ncomp1,std::forward<L1>(f1),
2150 box2,ncomp2,std::forward<L2>(f2),
2151 box3,ncomp3,std::forward<L3>(f3));
2152}
2153
2154template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
2155void HostDeviceParallelFor (T n, L&& f) noexcept
2156{
2157 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
2158}
2159
2160template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
2161void HostDeviceParallelFor (T n, L&& f) noexcept
2162{
2163 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
2164}
2165
2166template <typename L, int dim>
2167void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
2168{
2169 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
2170}
2171
2172template <int MT, typename L, int dim>
2173void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
2174{
2175 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
2176}
2177
2178template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
2179void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
2180{
2181 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2182}
2183
2184template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
2185void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
2186{
2187 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2188}
2189
2190template <typename L1, typename L2, int dim>
2191void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2192{
2193 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2194}
2195
2196template <int MT, typename L1, typename L2, int dim>
2197void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2198{
2199 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2200}
2201
2202template <typename L1, typename L2, typename L3, int dim>
2203void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2204 L1&& f1, L2&& f2, L3&& f3) noexcept
2205{
2206 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
2207 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2208}
2209
2210template <int MT, typename L1, typename L2, typename L3, int dim>
2211void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2212 L1&& f1, L2&& f2, L3&& f3) noexcept
2213{
2214 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
2215 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2216}
2217
2218template <typename T1, typename T2, typename L1, typename L2, int dim,
2219 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2220 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2221void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2222 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2223{
2224 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2225}
2226
2227template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
2228 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2229 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2230void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2231 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2232{
2233 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2234}
2235
2236template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2237 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2238 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2239 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2240void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2241 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2242 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2243{
2244 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
2245 box1,ncomp1,std::forward<L1>(f1),
2246 box2,ncomp2,std::forward<L2>(f2),
2247 box3,ncomp3,std::forward<L3>(f3));
2248}
2249
2250template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2251 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2252 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2253 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2254void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2255 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2256 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2257{
2258 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
2259 box1,ncomp1,std::forward<L1>(f1),
2260 box2,ncomp2,std::forward<L2>(f2),
2261 box3,ncomp3,std::forward<L3>(f3));
2262}
2263
2264}
2265
2266#endif
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:36
#define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:38
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:200
static constexpr int warp_size
Definition AMReX_GpuDevice.H:197
Definition AMReX_GpuKernelInfo.H:8
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:127
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:92
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
randState_t * getRandState()
Definition AMReX_RandomEngine.H:65
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24
void launch(T const &n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:122
void HostDeviceFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:879
void HostDeviceParallelFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:766
bool isEmpty(T n) noexcept
Definition AMReX_GpuRange.H:14
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:44
void single_task(L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1319
curandState_t randState_t
Definition AMReX_RandomEngine.H:58
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
AMREX_ATTRIBUTE_FLATTEN_FOR void ParallelForRNG(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1231
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1230
Definition AMReX_Box.H:2152
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2169
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2193
Definition AMReX_GpuLaunch.H:119
Definition AMReX_GpuTypes.H:86
Definition AMReX_RandomEngine.H:72