Block-Structured AMR Software Framework
 
AMReX_GpuLaunchFunctsG.H
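This header implements the GPU back ends of AMReX's kernel-launch interface: launch, single_task, ParallelFor, ParallelForRNG, For, and the HostDeviceParallelFor variants, with a SYCL implementation and a CUDA/HIP implementation selected by AMREX_USE_SYCL. As a minimal usage sketch of the public entry points defined below (the names box, arr, n, and dptr are hypothetical and not part of this file):

    amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        arr(i,j,k) = 0.0;
    });
    amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
    {
        dptr[i] *= 2.0;
    });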
#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
#include <AMReX_Config.H>

namespace amrex {

namespace detail {

    // call_f_scalar_handler

    template <typename F, typename N>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
        noexcept -> decltype(f(0))
    {
        f(i);
    }

    template <typename F, typename N>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
        noexcept -> decltype(f(0,Gpu::Handler{}))
    {
        f(i, handler);
    }

    // call_f_intvect_inner

    template <typename F, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        f(iv[0], 0, 0, args...);
    }

    template <typename F, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        f(iv[0], iv[1], 0, args...);
    }

    template <typename F, int dim, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
        noexcept -> decltype(f(iv, args...))
    {
        f(iv, args...);
    }

    template <typename F, int dim, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
        noexcept -> decltype(f(iv[Ns]..., args...))
    {
        f(iv[Ns]..., args...);
    }

    // call_f_intvect

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect (F const& f, IntVectND<dim> iv)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    // call_f_intvect_engine

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
    }

    // call_f_intvect_handler

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
    }

    // call_f_intvect_ncomp

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
        }
    }

    // call_f_intvect_ncomp_engine

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
        }
    }

    // call_f_intvect_ncomp_handler

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
        }
    }

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
        }
    }

} // namespace detail

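// Note on the helpers above: each detail::call_f_* overload is selected by SFINAE
// on its trailing decltype, so a user kernel may take either plain integer indices
// (for dim 1 and 2 the missing trailing indices are passed as 0) or the whole
// IntVectND<dim>; the ncomp/engine/handler variants append the extra argument the
// same way. Hypothetical illustration (both lambdas are valid for a 3D box):
//     ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) { /* ... */ });
//     ParallelFor(box, [=] AMREX_GPU_DEVICE (IntVectND<3> iv) { /* ... */ });
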
#ifdef AMREX_USE_SYCL

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.single_task([=] () { f(); });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
    }
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
        / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(item);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
             L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
        / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(item);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template<int MT, typename T, typename L>
void launch (T const& n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
                    f(i);
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
                                                          item.get_local_range(0));
                        detail::call_f_scalar_handler(f, T(i),
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_handler(f, iv,
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::ExecutionConfig(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
                    f(T(i),rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_engine(f,iv,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L1, typename L2, int dim>
void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect(f3,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect_ncomp(f3,iv,ncomp3);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

#else
// CUDA or HIP

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    // Note: the launch line below reconstructs a call lost in extraction; the
    // arguments assume one block with a single thread.
    AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream,
                        [=] AMREX_GPU_DEVICE () noexcept {f();});
    AMREX_GPU_ERROR_CHECK();
}

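// Usage sketch for single_task (hypothetical device pointer d_flag; not part of
// this header): run a lambda exactly once on the device, e.g.
//     amrex::single_task([=] AMREX_GPU_DEVICE () noexcept { *d_flag = 1; });
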
template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
             L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,
                                stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
{
    launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}

template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            // This will not overflow, even though nblocks*MT might.
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) {
                f(tid+start_idx);
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    const BoxIndexerND<dim> indexer(box);
    const auto type = box.ixType();
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                f(BoxND<dim>(iv,iv,type));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            // This will not overflow, even though nblocks*MT might.
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) {
                detail::call_f_scalar_handler(f, tid+start_idx,
                    Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_handler(f, iv,
                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(n);
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
            f(T(i),engine);
        }
    });
    Gpu::streamSynchronize(); // To avoid multiple streams using RNG
    AMREX_GPU_ERROR_CHECK();
}

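// Usage sketch for ParallelForRNG (hypothetical device array p; not part of this
// header). Each GPU thread reuses a persistent engine from rand_state, which is
// why the launch ends with a streamSynchronize:
//     amrex::ParallelForRNG(n, [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& eng) noexcept
//     {
//         p[i] = amrex::Random(eng);
//     });
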
template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_engine(f, iv, engine);
        }
    });
    Gpu::streamSynchronize(); // To avoid multiple streams using RNG
    AMREX_GPU_ERROR_CHECK();
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
        }
    });
    Gpu::streamSynchronize(); // To avoid multiple streams using RNG
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

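// Note: the multi-box ParallelFor overloads fuse several kernels into one launch
// sized by the largest box; every thread checks its flattened cell index against
// each box separately, so the boxes need not have the same shape or size.
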
template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1, iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2, iv);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect(f3, iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect_ncomp(f3, iv, ncomp3);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

#endif

template <typename L>
void single_task (L&& f) noexcept
{
    single_task(Gpu::gpuStream(), std::forward<L>(f));
}

template<typename T, typename L>
void launch (T const& n, L&& f) noexcept
{
    launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
                                       std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
                                       std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<MT>(info, n,std::forward<L>(f));
}

template <typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
}

template <int MT, typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(info, box,std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1,ncomp1,std::forward<L1>(f1),
                                       box2,ncomp2,std::forward<L2>(f2),
                                       box3,ncomp3,std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral_v<T1>>,
          typename M2=std::enable_if_t<std::is_integral_v<T2>>,
          typename M3=std::enable_if_t<std::is_integral_v<T3>> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(info,
                    box1,ncomp1,std::forward<L1>(f1),
                    box2,ncomp2,std::forward<L2>(f2),
                    box3,ncomp3,std::forward<L3>(f3));
}

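// Note: the overloads below default MT to AMREX_GPU_MAX_THREADS, so a typical
// call site (hypothetical kernel) is simply
//     amrex::ParallelFor(box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept { /* ... */ });
// with an explicit block size, e.g. ParallelFor<256>(...), only when needed.
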
1329template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1330void ParallelFor (T n, L&& f) noexcept
1331{
1332 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1333}
1334
1339template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1340void ParallelFor (T n, L&& f) noexcept
1341{
1342 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1343}
1344
1349template <typename L, int dim>
1350void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1351{
1352 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1353}
1354
1359template <int MT, typename L, int dim>
1360void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1361{
1362 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1363}
1364
1369template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1370void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1371{
1372 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1373}
1374
1379template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1380void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1381{
1382 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1383}
1384
1389template <typename L1, typename L2, int dim>
1390void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1391{
1392 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1393}
1394
1399template <int MT, typename L1, typename L2, int dim>
1400void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1401{
1402 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1403}
1404
1409template <typename L1, typename L2, typename L3, int dim>
1410void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1411 L1&& f1, L2&& f2, L3&& f3) noexcept
1412{
1413 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1414}
1415
1420template <int MT, typename L1, typename L2, typename L3, int dim>
1421void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1422 L1&& f1, L2&& f2, L3&& f3) noexcept
1423{
1424 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1425}
1426
1431template <typename T1, typename T2, typename L1, typename L2, int dim,
1432 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1433 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1434void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1435 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1436{
1437 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1438}
1439
1444template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1445 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1446 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1447void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1448 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1449{
1450 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1451}
1452
1457template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1458 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1459 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1460 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1461void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1462 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1463 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1464{
1465 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1466 box1,ncomp1,std::forward<L1>(f1),
1467 box2,ncomp2,std::forward<L2>(f2),
1468 box3,ncomp3,std::forward<L3>(f3));
1469}
1470
1475template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1476 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1477 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1478 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1479void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1480 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1481 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1482{
1483 ParallelFor<MT>(Gpu::KernelInfo{},
1484 box1,ncomp1,std::forward<L1>(f1),
1485 box2,ncomp2,std::forward<L2>(f2),
1486 box3,ncomp3,std::forward<L3>(f3));
1487}
1488
1489template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1490void For (T n, L&& f) noexcept
1491{
1492 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1493}
1494
1495template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1496void For (T n, L&& f) noexcept
1497{
1498 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1499}
1500
1501template <typename L, int dim>
1502void For (BoxND<dim> const& box, L&& f) noexcept
1503{
1504 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1505}
1506
1507template <int MT, typename L, int dim>
1508void For (BoxND<dim> const& box, L&& f) noexcept
1509{
1510 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1511}
1512
1513template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1514void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1515{
1516 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1517}
1518
1519template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1520void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1521{
1522 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1523}
1524
1525template <typename L1, typename L2, int dim>
1526void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1527{
1528 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1529}
1530
1531template <int MT, typename L1, typename L2, int dim>
1532void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1533{
1534 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1535}
1536
1537template <typename L1, typename L2, typename L3, int dim>
1538void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1539 L1&& f1, L2&& f2, L3&& f3) noexcept
1540{
1541 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1542}
1543
1544template <int MT, typename L1, typename L2, typename L3, int dim>
1545void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1546 L1&& f1, L2&& f2, L3&& f3) noexcept
1547{
1548 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1549}
1550
1551template <typename T1, typename T2, typename L1, typename L2, int dim,
1552 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1553 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1554void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1555 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1556{
1557 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1558}
1559
1560template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1561 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1562 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1563void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1564 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1565{
1566 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1567}
1568
1569template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1570 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1571 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1572 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1573void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1574 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1575 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1576{
1577 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1578 box1,ncomp1,std::forward<L1>(f1),
1579 box2,ncomp2,std::forward<L2>(f2),
1580 box3,ncomp3,std::forward<L3>(f3));
1581}
1582
1583template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1584 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1585 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1586 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1587void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1588 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1589 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1590{
1591 ParallelFor<MT>(Gpu::KernelInfo{},
1592 box1,ncomp1,std::forward<L1>(f1),
1593 box2,ncomp2,std::forward<L2>(f2),
1594 box3,ncomp3,std::forward<L3>(f3));
1595}
1596
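// Note: the HostDeviceParallelFor overloads below run the body as a GPU
// ParallelFor when Gpu::inLaunchRegion() is true, and otherwise fall back to
// a CPU loop (AMREX_PRAGMA_SIMD for 1D ranges, LoopConcurrentOnCpu for boxes).
// A minimal usage sketch, where 'p' is a hypothetical pointer to memory
// accessible from both host and device:
//
//   amrex::HostDeviceParallelFor(Gpu::KernelInfo{}, n,
//   [=] AMREX_GPU_HOST_DEVICE (int i)
//   {
//       p[i] = 0.0;
//   });
//
// The lambda must be callable on both host and device, hence
// AMREX_GPU_HOST_DEVICE rather than AMREX_GPU_DEVICE.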
1597template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1598std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1599HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1600{
1601 if (Gpu::inLaunchRegion()) {
1602 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1603 } else {
1604#ifdef AMREX_USE_SYCL
1605 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1606#else
1607        AMREX_PRAGMA_SIMD
1608        for (T i = 0; i < n; ++i) { f(i); }
1609#endif
1610 }
1611}
1612
1613template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1614std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1615HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1616{
1617 if (Gpu::inLaunchRegion()) {
1618 ParallelFor<MT>(info,n,std::forward<L>(f));
1619 } else {
1620#ifdef AMREX_USE_SYCL
1621 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1622#else
1623        AMREX_PRAGMA_SIMD
1624        for (T i = 0; i < n; ++i) { f(i); }
1625#endif
1626 }
1627}
1628
1629template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1630std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1631HostDeviceParallelFor (T n, L&& f) noexcept
1632{
1633 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1634}
1635
1636template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1637std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1638HostDeviceParallelFor (T n, L&& f) noexcept
1639{
1640 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1641}
1642
1643template <typename L, int dim>
1644std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1645HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1646{
1647 if (Gpu::inLaunchRegion()) {
1648 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1649 } else {
1650#ifdef AMREX_USE_SYCL
1651 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1652#else
1653 LoopConcurrentOnCpu(box,std::forward<L>(f));
1654#endif
1655 }
1656}
1657
1658template <int MT, typename L, int dim>
1659std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1660HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1661{
1662 if (Gpu::inLaunchRegion()) {
1663 ParallelFor<MT>(info, box,std::forward<L>(f));
1664 } else {
1665#ifdef AMREX_USE_SYCL
1666 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1667#else
1668 LoopConcurrentOnCpu(box,std::forward<L>(f));
1669#endif
1670 }
1671}
1672
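// The box+ncomp overloads below add a trailing component index, so the lambda
// takes (i,j,k,n). A sketch, with 'fab' a hypothetical Array4<Real> captured
// by value:
//
//   amrex::HostDeviceParallelFor(info, box, ncomp,
//   [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n)
//   {
//       fab(i,j,k,n) *= 2.0;
//   });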
1673template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1674std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1675HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1676{
1677 if (Gpu::inLaunchRegion()) {
1678 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1679 } else {
1680#ifdef AMREX_USE_SYCL
1681 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1682#else
1683 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1684#endif
1685 }
1686}
1687
1688template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1689std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1690HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1691{
1692 if (Gpu::inLaunchRegion()) {
1693 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1694 } else {
1695#ifdef AMREX_USE_SYCL
1696 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1697#else
1698 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1699#endif
1700 }
1701}
1702
1703template <typename L1, typename L2, int dim>
1704std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1705HostDeviceParallelFor (Gpu::KernelInfo const& info,
1706                        BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1707{
1708 if (Gpu::inLaunchRegion()) {
1709 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1710 } else {
1711#ifdef AMREX_USE_SYCL
1712 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1713#else
1714 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1715 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1716#endif
1717 }
1718}
1719
1720template <int MT, typename L1, typename L2, int dim>
1721std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1722HostDeviceParallelFor (Gpu::KernelInfo const& info,
1723                        BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1724{
1725 if (Gpu::inLaunchRegion()) {
1726 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1727 } else {
1728#ifdef AMREX_USE_SYCL
1729 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1730#else
1731 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1732 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1733#endif
1734 }
1735}
1736
1737template <int MT, typename L1, typename L2, typename L3, int dim>
1738std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1739HostDeviceParallelFor (Gpu::KernelInfo const& info,
1740                        BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1741 L1&& f1, L2&& f2, L3&& f3) noexcept
1742{
1743 if (Gpu::inLaunchRegion()) {
1744 ParallelFor<MT>(info,box1,box2,box3,
1745 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1746 } else {
1747#ifdef AMREX_USE_SYCL
1748 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1749#else
1750 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1751 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1752 LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1753#endif
1754 }
1755}
1756
1757template <typename T1, typename T2, typename L1, typename L2, int dim,
1758 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1759 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1760std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1761HostDeviceParallelFor (Gpu::KernelInfo const& info,
1762                        BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1763 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1764{
1765 if (Gpu::inLaunchRegion()) {
1766 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1767 } else {
1768#ifdef AMREX_USE_SYCL
1769 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1770#else
1771 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1772 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1773#endif
1774 }
1775}
1776
1777template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1778 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1779 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1780std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1781HostDeviceParallelFor (Gpu::KernelInfo const& info,
1782                        BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1783 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1784{
1785 if (Gpu::inLaunchRegion()) {
1786 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1787 } else {
1788#ifdef AMREX_USE_SYCL
1789 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1790#else
1791 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1792 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1793#endif
1794 }
1795}
1796
1797template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1798 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1799 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1800 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1801std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1802HostDeviceParallelFor (Gpu::KernelInfo const& info,
1803                        BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1804 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1805 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1806{
1807 if (Gpu::inLaunchRegion()) {
1808 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1809 box1,ncomp1,std::forward<L1>(f1),
1810 box2,ncomp2,std::forward<L2>(f2),
1811 box3,ncomp3,std::forward<L3>(f3));
1812 } else {
1813#ifdef AMREX_USE_SYCL
1814 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1815#else
1816 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1817 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1818 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1819#endif
1820 }
1821}
1822
1823template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1824 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1825 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1826 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1827std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1828HostDeviceParallelFor (Gpu::KernelInfo const& info,
1829                        BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1830 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1831 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1832{
1833 if (Gpu::inLaunchRegion()) {
1834 ParallelFor<MT>(info,
1835 box1,ncomp1,std::forward<L1>(f1),
1836 box2,ncomp2,std::forward<L2>(f2),
1837 box3,ncomp3,std::forward<L3>(f3));
1838 } else {
1839#ifdef AMREX_USE_SYCL
1840 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1841#else
1842 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1843 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1844 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1845#endif
1846 }
1847}
1848
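// HostDeviceFor is a thin convenience wrapper: every overload below simply
// forwards to the matching HostDeviceParallelFor, using AMREX_GPU_MAX_THREADS
// unless an explicit MT block size is given. A sketch:
//
//   amrex::HostDeviceFor(Gpu::KernelInfo{}, box,
//   [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) { /* ... */ });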
1849template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1850void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1851{
1852 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1853}
1854
1855template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1856void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1857{
1858 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
1859}
1860
1861template <typename L, int dim>
1862void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1863{
1864 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
1865}
1866
1867template <int MT, typename L, int dim>
1868void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1869{
1870 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
1871}
1872
1873template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1874void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1875{
1876 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1877}
1878
1879template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1880void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1881{
1882 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1883}
1884
1885template <typename L1, typename L2, int dim>
1886void HostDeviceFor (Gpu::KernelInfo const& info,
1887 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1888{
1889 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1890}
1891
1892template <int MT, typename L1, typename L2, int dim>
1893void HostDeviceFor (Gpu::KernelInfo const& info,
1894 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1895{
1896 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1897}
1898
1899template <typename L1, typename L2, typename L3, int dim>
1900void HostDeviceFor (Gpu::KernelInfo const& info,
1901 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1902 L1&& f1, L2&& f2, L3&& f3) noexcept
1903{
1904 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
1905 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1906}
1907
1908template <int MT, typename L1, typename L2, typename L3, int dim>
1909void HostDeviceFor (Gpu::KernelInfo const& info,
1910 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1911 L1&& f1, L2&& f2, L3&& f3) noexcept
1912{
1913 HostDeviceParallelFor<MT>(info, box1,box2,box3,
1914 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1915}
1916
1917template <typename T1, typename T2, typename L1, typename L2, int dim,
1918 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1919 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1920void HostDeviceFor (Gpu::KernelInfo const& info,
1921 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1922 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1923{
1924 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1925}
1926
1927template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1928 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1929 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1930void HostDeviceFor (Gpu::KernelInfo const& info,
1931 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1932 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1933{
1934 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1935}
1936
1937template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1938 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1939 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1940 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1941void HostDeviceFor (Gpu::KernelInfo const& info,
1942 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1943 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1944 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1945{
1946 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
1947 box1,ncomp1,std::forward<L1>(f1),
1948 box2,ncomp2,std::forward<L2>(f2),
1949 box3,ncomp3,std::forward<L3>(f3));
1950}
1951
1952template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1953 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1954 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1955 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1956void HostDeviceFor (Gpu::KernelInfo const& info,
1957 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1958 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1959 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1960{
1961 HostDeviceParallelFor<MT>(info,
1962 box1,ncomp1,std::forward<L1>(f1),
1963 box2,ncomp2,std::forward<L2>(f2),
1964 box3,ncomp3,std::forward<L3>(f3));
1965}
1966
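// The remaining overloads omit the Gpu::KernelInfo argument and forward a
// default-constructed Gpu::KernelInfo{}. The MT template parameter selects
// the launch block size, e.g. (sketch; 256 assumed valid for the target GPU,
// and 'f' a host-device-callable lambda):
//
//   amrex::HostDeviceParallelFor<256>(box, f);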
1967template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1968void HostDeviceParallelFor (T n, L&& f) noexcept
1969{
1970 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
1971}
1972
1973template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1974void HostDeviceParallelFor (T n, L&& f) noexcept
1975{
1976 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
1977}
1978
1979template <typename L, int dim>
1980void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1981{
1982 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
1983}
1984
1985template <int MT, typename L, int dim>
1986void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1987{
1988 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
1989}
1990
1991template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1992void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1993{
1994 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1995}
1996
1997template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1998void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1999{
2000 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
2001}
2002
2003template <typename L1, typename L2, int dim>
2004void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2005{
2006 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2007}
2008
2009template <int MT, typename L1, typename L2, int dim>
2010void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2011{
2012 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2013}
2014
2015template <typename L1, typename L2, typename L3, int dim>
2016void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2017 L1&& f1, L2&& f2, L3&& f3) noexcept
2018{
2019 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
2020 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2021}
2022
2023template <int MT, typename L1, typename L2, typename L3, int dim>
2024void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2025 L1&& f1, L2&& f2, L3&& f3) noexcept
2026{
2027 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
2028 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2029}
2030
2031template <typename T1, typename T2, typename L1, typename L2, int dim,
2032 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2033 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2034void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2035 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2036{
2037 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2038}
2039
2040template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
2041 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2042 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2043void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2044 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2045{
2046 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2047}
2048
2049template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2050 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2051 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2052 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2053void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2054 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2055 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2056{
2057 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
2058 box1,ncomp1,std::forward<L1>(f1),
2059 box2,ncomp2,std::forward<L2>(f2),
2060 box3,ncomp3,std::forward<L3>(f3));
2061}
2062
2063template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2064 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2065 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2066 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2067void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2068 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2069 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2070{
2071 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
2072 box1,ncomp1,std::forward<L1>(f1),
2073 box2,ncomp2,std::forward<L2>(f2),
2074 box3,ncomp3,std::forward<L3>(f3));
2075}
2076
2077}
2078
2079#endif