Block-Structured AMR Software Framework

AMReX_GpuLaunchFunctsG.H
#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
#include <AMReX_Config.H>

namespace amrex {

namespace detail {

    // call_f_scalar_handler

    template <typename F, typename N>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
        noexcept -> decltype(f(0))
    {
        f(i);
    }

    template <typename F, typename N>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
        noexcept -> decltype(f(0,Gpu::Handler{}))
    {
        f(i, handler);
    }

    // call_f_intvect_inner

    template <typename F, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        f(iv[0], 0, 0, args...);
    }

    template <typename F, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        f(iv[0], iv[1], 0, args...);
    }

    template <typename F, int dim, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
        noexcept -> decltype(f(iv, args...))
    {
        f(iv, args...);
    }

    template <typename F, int dim, std::size_t...Ns, class...Args>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
        noexcept -> decltype(f(iv[Ns]..., args...))
    {
        f(iv[Ns]..., args...);
    }

    // call_f_intvect

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect (F const& f, IntVectND<dim> iv)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    // call_f_intvect_engine

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
    }

    // call_f_intvect_handler

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
    {
        call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
    }

    // call_f_intvect_ncomp

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
        }
    }

    // call_f_intvect_ncomp_engine

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
        }
    }

    // call_f_intvect_ncomp_handler

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
        }
    }

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
    {
        for (T n = 0; n < ncomp; ++n) {
            call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
        }
    }

}
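
// ---------------------------------------------------------------------------
// Illustrative sketch (an editorial addition, not part of the original
// header): the detail::call_f_* dispatchers above select an overload from
// the lambda's signature. Assuming a 3D build, both forms below are accepted
// by the BoxND overloads of ParallelFor further down; the second form is
// forwarded through call_f_intvect_inner unchanged.
//
//     auto f_ijk = [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept {
//         // per-cell work using separate indices (1D/2D lambdas also
//         // work; the missing indices are filled with 0)
//     };
//     auto f_iv = [=] AMREX_GPU_DEVICE (amrex::IntVectND<3> iv) noexcept {
//         // equivalent per-cell work using the IntVectND form
//     };
// ---------------------------------------------------------------------------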

#ifdef AMREX_USE_SYCL

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.single_task([=] () { f(); });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
    }
}
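
// Illustrative usage sketch (an editorial addition): single_task runs the
// lambda once on the device, e.g. to update one value in device memory.
// The pointer `p` is an assumed device allocation.
//
//     amrex::Real* p = /* device memory */;
//     amrex::single_task(amrex::Gpu::gpuStream(),
//                        [=] AMREX_GPU_DEVICE () noexcept { *p = 0.0; });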

template<typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
        / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(item);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
             L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
        / sizeof(unsigned long long);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            sycl::local_accessor<unsigned long long>
                shared_data(sycl::range<1>(shared_mem_numull), h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    const auto nthreads_total = MT * std::size_t(nblocks);
    auto& q = *(stream.queue);
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(MT)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                f(item);
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template<int MT, typename T, typename L>
void launch (T const& n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
                    f(i);
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::makeExecutionConfig<MT>(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
                                                          item.get_local_range(0));
                        detail::call_f_scalar_handler(f, T(i),
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
                         i < std::size_t(n); i += stride) {
                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
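
// Illustrative sketch (an editorial addition): when the KernelInfo argument
// reports a reduction, the overload above hands the lambda a Gpu::Handler
// carrying the nd_item, the shared-memory pointer, and the active-thread
// count. A handler-aware lambda looks like this (setReduction is assumed to
// be the KernelInfo setter used by the reduction framework):
//
//     amrex::ParallelFor(amrex::Gpu::KernelInfo{}.setReduction(true), n,
//         [=] AMREX_GPU_DEVICE (int i, amrex::Gpu::Handler const& h) noexcept {
//             // cooperative work, e.g. block-level reduction via h
//         });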

template <int MT, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_handler(f, iv,
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        if (info.hasReduction()) {
            q.submit([&] (sycl::handler& h) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                            Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                         n_active_threads});
                    }
                });
            });
        } else {
            q.submit([&] (sycl::handler& h) {
                h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                 sycl::range<1>(nthreads_per_block)),
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
                    }
                });
            });
        }
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    const auto ec = Gpu::ExecutionConfig(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
                    f(T(i),rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_engine(f,iv,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
                }
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}
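
// Illustrative usage sketch (an editorial addition): filling an assumed
// device array of length n with uniform random numbers through the RNG
// overloads above. Each thread works with a persistent engine state.
//
//     amrex::Real* p = /* device memory of length n */;
//     amrex::ParallelForRNG(n,
//         [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept {
//             p[i] = amrex::Random(engine);
//         });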

template <int MT, typename L1, typename L2, int dim>
void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect(f1,iv);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect(f2,iv);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect(f3,iv);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void ParallelFor (Gpu::KernelInfo const& /*info*/,
                  BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
    auto& q = Gpu::Device::streamQueue();
    try {
        q.submit([&] (sycl::handler& h) {
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                     icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        detail::call_f_intvect_ncomp(f1,iv,ncomp1);
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        detail::call_f_intvect_ncomp(f2,iv,ncomp2);
                    }
                    if (icell < indexer3.numPts()) {
                        auto iv = indexer3.intVect(icell);
                        detail::call_f_intvect_ncomp(f3,iv,ncomp3);
                    }
                }
            });
        });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
    }
}

#else
// CUDA or HIP

template <typename L>
void single_task (gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, Gpu::Device::warp_size, 0, stream,
                        [=] AMREX_GPU_DEVICE () noexcept {f();});
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L>
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
             L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L>
void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
{
    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream,
                        [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             gpuStream_t stream, L const& f) noexcept
{
    AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
                        stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
    AMREX_GPU_ERROR_CHECK();
}

template<typename L>
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
{
    launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
}

template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
void launch (T const& n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            // This will not overflow, even though nblocks*MT might.
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) {
                f(tid+start_idx);
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}
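
// Illustrative usage sketch (an editorial addition): a raw 1D launch with an
// explicit block size of 256 threads; each work item i in [0, n) gets one
// thread, split across as many launches as makeNExecutionConfigs requires.
//
//     amrex::launch<256>(n, [=] AMREX_GPU_DEVICE (int i) noexcept {
//         // one thread per i
//     });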

template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f) noexcept
{
    if (box.isEmpty()) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    const BoxIndexerND<dim> indexer(box);
    const auto type = box.ixType();
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                f(BoxND<dim>(iv,iv,type));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& ec : nec) {
        const T start_idx = T(ec.start_idx);
        const T nleft = n - start_idx;
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            // This will not overflow, even though nblocks*MT might.
            auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            if (tid < nleft) {
                detail::call_f_scalar_handler(f, tid+start_idx,
                    Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_handler(f, iv,
                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    const BoxIndexerND<dim> indexer(box);
    const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
    for (auto const& ec : nec) {
        const auto start_idx = std::uint64_t(ec.start_idx);
        AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
            if (icell < indexer.numPts()) {
                auto iv = indexer.intVect(icell);
                detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                    Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
                                            (std::uint64_t)MT)));
            }
        });
    }
    AMREX_GPU_ERROR_CHECK();
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (T n, L const& f) noexcept
{
    if (amrex::isEmpty(n)) { return; }
    randState_t* rand_state = getRandState();
    const auto ec = Gpu::ExecutionConfig(n);
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
            f(T(i),engine);
        }
    });
    Gpu::streamSynchronize(); // To avoid multiple streams using RNG
    AMREX_GPU_ERROR_CHECK();
}

template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_engine(f, iv, engine);
        }
    });
    Gpu::streamSynchronize(); // To avoid multiple streams using RNG
    AMREX_GPU_ERROR_CHECK();
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
{
    if (amrex::isEmpty(box)) { return; }
    randState_t* rand_state = getRandState();
    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        RandomEngine engine{&(rand_state[tid])};
        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
            auto iv = indexer.intVect(icell);
            detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
        }
    });
    Gpu::streamSynchronize(); // To avoid multiple streams using RNG
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1,iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2,iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect(f1,iv);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect(f2,iv);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect(f3,iv);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& /*info*/,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const BoxIndexerND<dim> indexer3(box3);
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
    AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
             icell < ncells; icell += stride) {
            if (icell < indexer1.numPts()) {
                auto iv = indexer1.intVect(icell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (icell < indexer2.numPts()) {
                auto iv = indexer2.intVect(icell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
            if (icell < indexer3.numPts()) {
                auto iv = indexer3.intVect(icell);
                detail::call_f_intvect_ncomp(f3, iv, ncomp3);
            }
        }
    });
    AMREX_GPU_ERROR_CHECK();
}

#endif

template <typename L>
void single_task (L&& f) noexcept
{
    single_task(Gpu::gpuStream(), std::forward<L>(f));
}

template<typename T, typename L>
void launch (T const& n, L&& f) noexcept
{
    launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
}

template <typename L, int dim>
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
std::enable_if_t<MaybeDeviceRunnable<L>::value>
ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
                                       std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
                                       std::forward<L2>(f2), std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
ParallelFor (Gpu::KernelInfo const& info,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}
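
// Illustrative sketch (an editorial addition): the MT template parameter is
// the threads-per-block used for the launch; the overloads above default it
// to AMREX_GPU_MAX_THREADS. With an assumed amrex::Array4<amrex::Real> `a`:
//
//     amrex::ParallelFor<128>(amrex::Gpu::KernelInfo{}, box,
//         [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept {
//             a(i,j,k) *= 2.0;
//         });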

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    ParallelFor<MT>(info, n,std::forward<L>(f));
}

template <typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
}

template <int MT, typename L, int dim>
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(info, box,std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
}

template <typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
}

template <int MT, typename L1, typename L2, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
}

template <typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
}

template <int MT, typename L1, typename L2, typename L3, int dim>
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
}

template <typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
}

template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
}

template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                       box1,ncomp1,std::forward<L1>(f1),
                                       box2,ncomp2,std::forward<L2>(f2),
                                       box3,ncomp3,std::forward<L3>(f3));
}

template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
          typename M1=std::enable_if_t<std::is_integral<T1>::value>,
          typename M2=std::enable_if_t<std::is_integral<T2>::value>,
          typename M3=std::enable_if_t<std::is_integral<T3>::value> >
void For (Gpu::KernelInfo const& info,
          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(info,
                    box1,ncomp1,std::forward<L1>(f1),
                    box2,ncomp2,std::forward<L2>(f2),
                    box3,ncomp3,std::forward<L3>(f3));
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (T n, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
}

template <typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <int MT, typename L, int dim>
void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
}

template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
}

template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
}
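
// Illustrative usage sketch (an editorial addition): the most common entry
// points, with `a` an assumed amrex::Array4<amrex::Real> view of a fab.
//
//     amrex::ParallelFor(box, ncomp,
//         [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept {
//             a(i,j,k,n) = 0.0;
//         });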
1288
1289template <typename L1, typename L2, int dim>
1290void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1291{
1292 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1293}
1294
1295template <int MT, typename L1, typename L2, int dim>
1296void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1297{
1298 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1299}
1300
1301template <typename L1, typename L2, typename L3, int dim>
1302void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1303 L1&& f1, L2&& f2, L3&& f3) noexcept
1304{
1305 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1306}
1307
1308template <int MT, typename L1, typename L2, typename L3, int dim>
1309void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1310 L1&& f1, L2&& f2, L3&& f3) noexcept
1311{
1312 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1313}
1314
1315template <typename T1, typename T2, typename L1, typename L2, int dim,
1316 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1317 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1318void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1319 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1320{
1321 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1322}
1323
1324template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1325 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1326 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1327void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1328 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1329{
1330 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1331}
1332
1333template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1334 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1335 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1336 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1337void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1338 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1339 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1340{
1341 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1342 box1,ncomp1,std::forward<L1>(f1),
1343 box2,ncomp2,std::forward<L2>(f2),
1344 box3,ncomp3,std::forward<L3>(f3));
1345}
1346
1347template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1348 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1349 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1350 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1351void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1352 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1353 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1354{
1355 ParallelFor<MT>(Gpu::KernelInfo{},
1356 box1,ncomp1,std::forward<L1>(f1),
1357 box2,ncomp2,std::forward<L2>(f2),
1358 box3,ncomp3,std::forward<L3>(f3));
1359}
1360
1361template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1362void For (T n, L&& f) noexcept
1363{
1364 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1365}
1366
1367template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1368void For (T n, L&& f) noexcept
1369{
1370 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1371}
1372
1373template <typename L, int dim>
1374void For (BoxND<dim> const& box, L&& f) noexcept
1375{
1376 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1377}
1378
1379template <int MT, typename L, int dim>
1380void For (BoxND<dim> const& box, L&& f) noexcept
1381{
1382 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1383}
1384
1385template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1386void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1387{
1388 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1389}
1390
1391template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1392void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1393{
1394 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1395}
1396
1397template <typename L1, typename L2, int dim>
1398void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1399{
1400 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1401}
1402
1403template <int MT, typename L1, typename L2, int dim>
1404void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1405{
1406 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1407}
1408
1409template <typename L1, typename L2, typename L3, int dim>
1410void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1411 L1&& f1, L2&& f2, L3&& f3) noexcept
1412{
1413 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1414}
1415
1416template <int MT, typename L1, typename L2, typename L3, int dim>
1417void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1418 L1&& f1, L2&& f2, L3&& f3) noexcept
1419{
1420 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1421}
1422
1423template <typename T1, typename T2, typename L1, typename L2, int dim,
1424 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1425 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1426void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1427 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1428{
1429 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1430}
1431
1432template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1433 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1434 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1435void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1436 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1437{
1438 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1439}
1440
1441template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1442 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1443 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1444 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1445void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1446 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1447 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1448{
1449 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1450 box1,ncomp1,std::forward<L1>(f1),
1451 box2,ncomp2,std::forward<L2>(f2),
1452 box3,ncomp3,std::forward<L3>(f3));
1453}
1454
1455template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1456 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1457 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1458 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1459void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1460 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1461 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1462{
1463 ParallelFor<MT>(Gpu::KernelInfo{},
1464 box1,ncomp1,std::forward<L1>(f1),
1465 box2,ncomp2,std::forward<L2>(f2),
1466 box3,ncomp3,std::forward<L3>(f3));
1467}
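// Example (illustrative, not part of this header): in the fused box+ncomp
// overloads each box carries its own component count and lambda. `bx1`/`bx2`,
// `nc1`/`nc2`, and `a1`/`a2` are assumed:
//
//   amrex::For(bx1, nc1,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept { a1(i,j,k,n) = Real(0.0); },
//       bx2, nc2,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept { a2(i,j,k,n) = Real(1.0); });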
1468
1469template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1470std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1471HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1472{
1473 if (Gpu::inLaunchRegion()) {
1474 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1475 } else {
1476#ifdef AMREX_USE_SYCL
1477 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1478#else
1479 AMREX_PRAGMA_SIMD
1480 for (T i = 0; i < n; ++i) { f(i); }
1481#endif
1482 }
1483}
1484
1485template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1486std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1487HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1488{
1489 if (Gpu::inLaunchRegion()) {
1490 ParallelFor<MT>(info,n,std::forward<L>(f));
1491 } else {
1492#ifdef AMREX_USE_SYCL
1493 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1494#else
1495 AMREX_PRAGMA_SIMD
1496 for (T i = 0; i < n; ++i) { f(i); }
1497#endif
1498 }
1499}
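// Example (illustrative, not part of this header): HostDeviceParallelFor
// dispatches at run time: inside a GPU launch region it launches a device
// kernel; otherwise it falls back to a serial, SIMD-hinted CPU loop. The
// lambda must therefore be callable on both host and device. `p` is an
// assumed pointer valid on whichever side executes:
//
//   amrex::HostDeviceParallelFor(Gpu::KernelInfo{}, n,
//       [=] AMREX_GPU_HOST_DEVICE (int i) noexcept { p[i] = Real(0.0); });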
1500
1501template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1502std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1503HostDeviceParallelFor (T n, L&& f) noexcept
1504{
1505 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1506}
1507
1508template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1509std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1510HostDeviceParallelFor (T n, L&& f) noexcept
1511{
1512 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1513}
1514
1515template <typename L, int dim>
1516std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1517HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1518{
1519 if (Gpu::inLaunchRegion()) {
1520 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1521 } else {
1522#ifdef AMREX_USE_SYCL
1523 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1524#else
1525 LoopConcurrentOnCpu(box,std::forward<L>(f));
1526#endif
1527 }
1528}
1529
1530template <int MT, typename L, int dim>
1531std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1532HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1533{
1534 if (Gpu::inLaunchRegion()) {
1535 ParallelFor<MT>(info, box,std::forward<L>(f));
1536 } else {
1537#ifdef AMREX_USE_SYCL
1538 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1539#else
1540 LoopConcurrentOnCpu(box,std::forward<L>(f));
1541#endif
1542 }
1543}
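// Example (illustrative, not part of this header): the MT template parameter
// selects the device block size in place of the default AMREX_GPU_MAX_THREADS;
// it has no effect on the CPU fallback path. `bx` and `a` are assumed:
//
//   amrex::HostDeviceParallelFor<128>(Gpu::KernelInfo{}, bx,
//       [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept {
//           a(i,j,k) = Real(0.0);
//       });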
1544
1545template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1546std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1547HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1548{
1549 if (Gpu::inLaunchRegion()) {
1550 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1551 } else {
1552#ifdef AMREX_USE_SYCL
1553 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1554#else
1555 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1556#endif
1557 }
1558}
1559
1560template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1561std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1562HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1563{
1564 if (Gpu::inLaunchRegion()) {
1565 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1566 } else {
1567#ifdef AMREX_USE_SYCL
1568 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1569#else
1570 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1571#endif
1572 }
1573}
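// Note (illustrative, not part of this header): on SYCL builds the host
// fallback of these host/device variants aborts, so code that must also run
// with GPU launching disabled can dispatch explicitly instead. A sketch,
// with `bx`, `ncomp`, and `a` assumed:
//
//   if (Gpu::inLaunchRegion()) {
//       amrex::ParallelFor(bx, ncomp,
//           [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept { a(i,j,k,n) = Real(0.0); });
//   } else {
//       amrex::LoopConcurrentOnCpu(bx, ncomp,
//           [=] (int i, int j, int k, int n) noexcept { a(i,j,k,n) = Real(0.0); });
//   }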
1574
1575template <typename L1, typename L2, int dim>
1576std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1577HostDeviceParallelFor (Gpu::KernelInfo const& info,
1578 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1579{
1580 if (Gpu::inLaunchRegion()) {
1581 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1582 } else {
1583#ifdef AMREX_USE_SYCL
1584 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1585#else
1586 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1587 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1588#endif
1589 }
1590}
1591
1592template <int MT, typename L1, typename L2, int dim>
1593std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1594HostDeviceParallelFor (Gpu::KernelInfo const& info,
1595 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1596{
1597 if (Gpu::inLaunchRegion()) {
1598 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1599 } else {
1600#ifdef AMREX_USE_SYCL
1601 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1602#else
1603 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1604 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1605#endif
1606 }
1607}
1608
1609template <int MT, typename L1, typename L2, typename L3, int dim>
1610std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1611HostDeviceParallelFor (Gpu::KernelInfo const& info,
1612 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1613 L1&& f1, L2&& f2, L3&& f3) noexcept
1614{
1615 if (Gpu::inLaunchRegion()) {
1616 ParallelFor<MT>(info,box1,box2,box3,
1617 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1618 } else {
1619#ifdef AMREX_USE_SYCL
1620 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1621#else
1622 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1623 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1624 LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1625#endif
1626 }
1627}
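// Example (illustrative, not part of this header): in the device path the
// three loops share one fused launch; in the host fallback they run one
// after another, so the lambdas should not rely on interleaved execution
// across boxes. `xbx`/`ybx`/`zbx` and `fx`/`fy`/`fz` are assumed:
//
//   amrex::HostDeviceParallelFor<256>(Gpu::KernelInfo{}, xbx, ybx, zbx,
//       [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { fx(i,j,k) = Real(0.0); },
//       [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { fy(i,j,k) = Real(0.0); },
//       [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { fz(i,j,k) = Real(0.0); });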
1628
1629template <typename T1, typename T2, typename L1, typename L2, int dim,
1630 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1631 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1632std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1633HostDeviceParallelFor (Gpu::KernelInfo const& info,
1634 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1635 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1636{
1637 if (Gpu::inLaunchRegion()) {
1638 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1639 } else {
1640#ifdef AMREX_USE_SYCL
1641 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1642#else
1643 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1644 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1645#endif
1646 }
1647}
1648
1649template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1650 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1651 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1652std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1653HostDeviceParallelFor (Gpu::KernelInfo const& info,
1654 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1655 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1656{
1657 if (Gpu::inLaunchRegion()) {
1658 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1659 } else {
1660#ifdef AMREX_USE_SYCL
1661 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1662#else
1663 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1664 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1665#endif
1666 }
1667}
1668
1669template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1670 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1671 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1672 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1673std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1674HostDeviceParallelFor (Gpu::KernelInfo const& info,
1675 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1676 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1677 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1678{
1679 if (Gpu::inLaunchRegion()) {
1680 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1681 box1,ncomp1,std::forward<L1>(f1),
1682 box2,ncomp2,std::forward<L2>(f2),
1683 box3,ncomp3,std::forward<L3>(f3));
1684 } else {
1685#ifdef AMREX_USE_SYCL
1686 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1687#else
1688 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1689 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1690 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1691#endif
1692 }
1693}
1694
1695template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1696 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1697 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1698 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1699std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1700HostDeviceParallelFor (Gpu::KernelInfo const& info,
1701 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1702 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1703 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1704{
1705 if (Gpu::inLaunchRegion()) {
1706 ParallelFor<MT>(info,
1707 box1,ncomp1,std::forward<L1>(f1),
1708 box2,ncomp2,std::forward<L2>(f2),
1709 box3,ncomp3,std::forward<L3>(f3));
1710 } else {
1711#ifdef AMREX_USE_SYCL
1712 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1713#else
1714 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1715 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1716 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1717#endif
1718 }
1719}
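// Example (illustrative, not part of this header): the fused box+ncomp
// variant; in the host fallback each (box, ncomp, lambda) triple becomes its
// own LoopConcurrentOnCpu call. `bx1`..`bx3`, `nc1`..`nc3`, and `a1`..`a3`
// are assumed:
//
//   amrex::HostDeviceParallelFor(Gpu::KernelInfo{},
//       bx1, nc1, [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept { a1(i,j,k,n) = Real(0.0); },
//       bx2, nc2, [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept { a2(i,j,k,n) = Real(0.0); },
//       bx3, nc3, [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept { a3(i,j,k,n) = Real(0.0); });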
1720
1721template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1722void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1723{
1724 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1725}
1726
1727template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1728void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1729{
1730 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
1731}
1732
1733template <typename L, int dim>
1734void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1735{
1736 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
1737}
1738
1739template <int MT, typename L, int dim>
1740void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1741{
1742 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
1743}
1744
1745template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1746void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1747{
1748 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1749}
1750
1751template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1752void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1753{
1754 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1755}
1756
1757template <typename L1, typename L2, int dim>
1758void HostDeviceFor (Gpu::KernelInfo const& info,
1759 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1760{
1761 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1762}
1763
1764template <int MT, typename L1, typename L2, int dim>
1765void HostDeviceFor (Gpu::KernelInfo const& info,
1766 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1767{
1768 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1769}
1770
1771template <typename L1, typename L2, typename L3, int dim>
1772void HostDeviceFor (Gpu::KernelInfo const& info,
1773 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1774 L1&& f1, L2&& f2, L3&& f3) noexcept
1775{
1776 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
1777 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1778}
1779
1780template <int MT, typename L1, typename L2, typename L3, int dim>
1781void HostDeviceFor (Gpu::KernelInfo const& info,
1782 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1783 L1&& f1, L2&& f2, L3&& f3) noexcept
1784{
1785 HostDeviceParallelFor<MT>(info, box1,box2,box3,
1786 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1787}
1788
1789template <typename T1, typename T2, typename L1, typename L2, int dim,
1790 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1791 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1792void HostDeviceFor (Gpu::KernelInfo const& info,
1793 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1794 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1795{
1796 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1797}
1798
1799template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1800 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1801 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1802void HostDeviceFor (Gpu::KernelInfo const& info,
1803 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1804 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1805{
1806 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1807}
1808
1809template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1810 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1811 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1812 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1813void HostDeviceFor (Gpu::KernelInfo const& info,
1814 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1815 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1816 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1817{
1818 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
1819 box1,ncomp1,std::forward<L1>(f1),
1820 box2,ncomp2,std::forward<L2>(f2),
1821 box3,ncomp3,std::forward<L3>(f3));
1822}
1823
1824template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1825 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1826 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1827 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1828void HostDeviceFor (Gpu::KernelInfo const& info,
1829 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1830 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1831 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1832{
1833 HostDeviceParallelFor<MT>(info,
1834 box1,ncomp1,std::forward<L1>(f1),
1835 box2,ncomp2,std::forward<L2>(f2),
1836 box3,ncomp3,std::forward<L3>(f3));
1837}
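// Note (illustrative, not part of this header): in this GPU implementation
// every HostDeviceFor overload forwards directly to the matching
// HostDeviceParallelFor, so the two names behave identically here; the
// distinction matters in the CPU-only implementation, where For is a plain
// loop while ParallelFor typically adds a SIMD hint. For example:
//
//   amrex::HostDeviceFor(Gpu::KernelInfo{}, n,
//       [=] AMREX_GPU_HOST_DEVICE (int i) noexcept { p[i] = Real(0.0); });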
1838
1839template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1840void HostDeviceParallelFor (T n, L&& f) noexcept
1841{
1842 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
1843}
1844
1845template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral<T>::value> >
1846void HostDeviceParallelFor (T n, L&& f) noexcept
1847{
1848 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
1849}
1850
1851template <typename L, int dim>
1852void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1853{
1854 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
1855}
1856
1857template <int MT, typename L, int dim>
1858void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1859{
1860 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
1861}
1862
1863template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1864void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1865{
1866 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1867}
1868
1869template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral<T>::value> >
1870void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1871{
1872 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1873}
1874
1875template <typename L1, typename L2, int dim>
1876void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1877{
1878 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1879}
1880
1881template <int MT, typename L1, typename L2, int dim>
1882void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1883{
1884 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1885}
1886
1887template <typename L1, typename L2, typename L3, int dim>
1888void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1889 L1&& f1, L2&& f2, L3&& f3) noexcept
1890{
1891 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
1892 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1893}
1894
1895template <int MT, typename L1, typename L2, typename L3, int dim>
1896void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1897 L1&& f1, L2&& f2, L3&& f3) noexcept
1898{
1899 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
1900 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1901}
1902
1903template <typename T1, typename T2, typename L1, typename L2, int dim,
1904 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1905 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1906void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1907 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1908{
1909 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1910}
1911
1912template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1913 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1914 typename M2=std::enable_if_t<std::is_integral<T2>::value> >
1915void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1916 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1917{
1918 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1919}
1920
1921template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1922 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1923 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1924 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1925void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1926 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1927 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1928{
1929 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1930 box1,ncomp1,std::forward<L1>(f1),
1931 box2,ncomp2,std::forward<L2>(f2),
1932 box3,ncomp3,std::forward<L3>(f3));
1933}
1934
1935template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1936 typename M1=std::enable_if_t<std::is_integral<T1>::value>,
1937 typename M2=std::enable_if_t<std::is_integral<T2>::value>,
1938 typename M3=std::enable_if_t<std::is_integral<T3>::value> >
1939void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1940 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1941 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1942{
1943 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
1944 box1,ncomp1,std::forward<L1>(f1),
1945 box2,ncomp2,std::forward<L2>(f2),
1946 box3,ncomp3,std::forward<L3>(f3));
1947}
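// Example (illustrative, not part of this header): the overloads above that
// omit Gpu::KernelInfo default-construct one, so the shortest spelling is
// just the box(es) and lambda(s). `bx` and `a` are assumed:
//
//   amrex::HostDeviceParallelFor(bx,
//       [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept {
//           a(i,j,k) = Real(1.0);
//       });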
1948
1949}
1950
1951#endif