Block-Structured AMR Software Framework
 
AMReX_GpuLaunchFunctsG.H
1#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3#include <AMReX_Config.H>
4
5namespace amrex {
6
7namespace detail {
8
9 // call_f_scalar_handler
10
11 template <typename F, typename N>
13 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
14 noexcept -> decltype(f(0))
15 {
16 f(i);
17 }
18
19 template <typename F, typename N>
21 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
22 noexcept -> decltype(f(0,Gpu::Handler{}))
23 {
24 f(i, handler);
25 }
26
27 // call_f_intvect_inner
28
29 template <typename F, std::size_t...Ns, class...Args>
31 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
32 noexcept -> decltype(f(0, 0, 0, args...))
33 {
34 f(iv[0], 0, 0, args...);
35 }
36
37 template <typename F, std::size_t...Ns, class...Args>
39 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
40 noexcept -> decltype(f(0, 0, 0, args...))
41 {
42 f(iv[0], iv[1], 0, args...);
43 }
44
45 template <typename F, int dim, std::size_t...Ns, class...Args>
47 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
48 noexcept -> decltype(f(iv, args...))
49 {
50 f(iv, args...);
51 }
52
53 template <typename F, int dim, std::size_t...Ns, class...Args>
55 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
56 noexcept -> decltype(f(iv[Ns]..., args...))
57 {
58 f(iv[Ns]..., args...);
59 }
60
61 // call_f_intvect
62
63 template <typename F, int dim>
65 auto call_f_intvect (F const& f, IntVectND<dim> iv)
66 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
67 {
68 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
69 }
70
71 // call_f_intvect_engine
72
73 template <typename F, int dim>
74 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
75 auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
76 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
77 {
78 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
79 }
80
81 // call_f_intvect_handler
82
83 template <typename F, int dim>
84 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
85 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
86 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
87 {
88 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
89 }
90
91 template <typename F, int dim>
93 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
94 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
95 {
96 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
97 }
98
99 // call_f_intvect_ncomp
100
101 template <typename F, typename T, int dim>
103 auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
104 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
105 {
106 for (T n = 0; n < ncomp; ++n) {
107 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
108 }
109 }
110
111 // call_f_intvect_ncomp_engine
112
113 template <typename F, typename T, int dim>
115 auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
116 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
117 {
118 for (T n = 0; n < ncomp; ++n) {
119 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
120 }
121 }
122
123 // call_f_intvect_ncomp_handler
124
125 template <typename F, typename T, int dim>
127 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
128 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
129 {
130 for (T n = 0; n < ncomp; ++n) {
131 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
132 }
133 }
134
135 template <typename F, typename T, int dim>
137 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
138 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
139 {
140 for (T n = 0; n < ncomp; ++n) {
141 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
142 }
143 }
144
145}
146
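// Illustrative sketch (not part of the header): the detail::call_f_* helpers above
// select an overload via expression SFINAE on the trailing return type, so the same
// ParallelFor entry point accepts user lambdas of different arities. For a 1D loop
// over n elements, for example, both of the following forms are dispatched correctly
// (p and n are placeholder names):
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       p[i] *= 2.0;                          // plain element-wise form
//   });
//
//   amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i, amrex::Gpu::Handler const& h) noexcept
//   {
//       // handler-aware form: h exposes the launch context (e.g. for block-level
//       // reductions); otherwise the body is written exactly as above.
//   });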
147#ifdef AMREX_USE_SYCL
148
149template <typename L>
150void single_task (gpuStream_t stream, L const& f) noexcept
151{
152 auto& q = *(stream.queue);
153 try {
154 q.submit([&] (sycl::handler& h) {
155 h.single_task([=] () { f(); });
156 });
157 } catch (sycl::exception const& ex) {
158 amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
159 }
160}
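// Usage sketch (d_p is a placeholder device pointer, e.g. allocated from The_Arena()):
// single_task enqueues the lambda to run exactly once on the device on the given
// stream; the stream-less overload near the end of this file uses the current stream.
//
//   amrex::single_task([=] AMREX_GPU_DEVICE () noexcept
//   {
//       d_p[0] = 1.0;                       // one-off device-side initialization
//   });
//   amrex::Gpu::streamSynchronize();        // wait before reading the result on the host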
161
162template<typename L>
163void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
164 gpuStream_t stream, L const& f) noexcept
165{
166 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
167 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
168 / sizeof(unsigned long long);
169 auto& q = *(stream.queue);
170 try {
171 q.submit([&] (sycl::handler& h) {
172 sycl::local_accessor<unsigned long long>
173 shared_data(sycl::range<1>(shared_mem_numull), h);
174 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
175 sycl::range<1>(nthreads_per_block)),
176 [=] (sycl::nd_item<1> item)
177 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
178 {
179 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
180 });
181 });
182 } catch (sycl::exception const& ex) {
183 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
184 }
185}
186
187template<typename L>
188void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
189{
190 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
191 auto& q = *(stream.queue);
192 try {
193 q.submit([&] (sycl::handler& h) {
194 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
195 sycl::range<1>(nthreads_per_block)),
196 [=] (sycl::nd_item<1> item)
197 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
198 {
199 f(item);
200 });
201 });
202 } catch (sycl::exception const& ex) {
203 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
204 }
205}
206
207template <int MT, typename L>
208void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
209 L const& f) noexcept
210{
211 const auto nthreads_total = MT * std::size_t(nblocks);
212 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
213 / sizeof(unsigned long long);
214 auto& q = *(stream.queue);
215 try {
216 q.submit([&] (sycl::handler& h) {
217 sycl::local_accessor<unsigned long long>
218 shared_data(sycl::range<1>(shared_mem_numull), h);
219 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
220 sycl::range<1>(MT)),
221 [=] (sycl::nd_item<1> item)
222 [[sycl::reqd_work_group_size(MT)]]
223 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
224 {
225 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
226 });
227 });
228 } catch (sycl::exception const& ex) {
229 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
230 }
231}
232
233template <int MT, typename L>
234void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
235{
236 const auto nthreads_total = MT * std::size_t(nblocks);
237 auto& q = *(stream.queue);
238 try {
239 q.submit([&] (sycl::handler& h) {
240 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
241 sycl::range<1>(MT)),
242 [=] (sycl::nd_item<1> item)
243 [[sycl::reqd_work_group_size(MT)]]
244 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
245 {
246 f(item);
247 });
248 });
249 } catch (sycl::exception const& ex) {
250 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
251 }
252}
253
254template<int MT, typename T, typename L>
255void launch (T const& n, L const& f) noexcept
256{
257 if (amrex::isEmpty(n)) { return; }
258 const auto ec = Gpu::makeExecutionConfig<MT>(n);
259 const auto nthreads_per_block = ec.numThreads.x;
260 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
261 auto& q = Gpu::Device::streamQueue();
262 try {
263 q.submit([&] (sycl::handler& h) {
264 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
265 sycl::range<1>(nthreads_per_block)),
266 [=] (sycl::nd_item<1> item)
267 [[sycl::reqd_work_group_size(MT)]]
268 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
269 {
270 for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
271 f(i);
272 }
273 });
274 });
275 } catch (sycl::exception const& ex) {
276 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
277 }
278}
279
280template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
281void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
282{
283 if (amrex::isEmpty(n)) { return; }
284 const auto ec = Gpu::makeExecutionConfig<MT>(n);
285 const auto nthreads_per_block = ec.numThreads.x;
286 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
287 auto& q = Gpu::Device::streamQueue();
288 try {
289 if (info.hasReduction()) {
290 q.submit([&] (sycl::handler& h) {
291 sycl::local_accessor<unsigned long long>
292 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
293 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
294 sycl::range<1>(nthreads_per_block)),
295 [=] (sycl::nd_item<1> item)
296 [[sycl::reqd_work_group_size(MT)]]
297 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
298 {
299 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
300 i < std::size_t(n); i += stride) {
301 int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
302 item.get_local_range(0));
303 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
304 n_active_threads});
305 }
306 });
307 });
308 } else {
309 q.submit([&] (sycl::handler& h) {
310 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
311 sycl::range<1>(nthreads_per_block)),
312 [=] (sycl::nd_item<1> item)
313 [[sycl::reqd_work_group_size(MT)]]
314 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
315 {
316 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
317 i < std::size_t(n); i += stride) {
318 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
319 }
320 });
321 });
322 }
323 } catch (sycl::exception const& ex) {
324 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
325 }
326}
327
328template <int MT, typename L, int dim>
329void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
330{
331 if (amrex::isEmpty(box)) { return; }
332 const BoxIndexerND<dim> indexer(box);
333 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
334 const auto nthreads_per_block = ec.numThreads.x;
335 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
336 auto& q = Gpu::Device::streamQueue();
337 try {
338 if (info.hasReduction()) {
339 q.submit([&] (sycl::handler& h) {
340 sycl::local_accessor<unsigned long long>
341 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
342 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
343 sycl::range<1>(nthreads_per_block)),
344 [=] (sycl::nd_item<1> item)
345 [[sycl::reqd_work_group_size(MT)]]
346 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
347 {
348 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
349 icell < indexer.numPts(); icell += stride) {
350 auto iv = indexer.intVect(icell);
351 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
352 std::uint64_t(item.get_local_range(0)));
353 detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
354 n_active_threads});
355 }
356 });
357 });
358 } else {
359 q.submit([&] (sycl::handler& h) {
360 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
361 sycl::range<1>(nthreads_per_block)),
362 [=] (sycl::nd_item<1> item)
363 [[sycl::reqd_work_group_size(MT)]]
364 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
365 {
366 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
367 icell < indexer.numPts(); icell += stride) {
368 auto iv = indexer.intVect(icell);
369 detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
370 }
371 });
372 });
373 }
374 } catch (sycl::exception const& ex) {
375 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
376 }
377}
378
379template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
380void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
381{
382 if (amrex::isEmpty(box)) { return; }
383 const BoxIndexerND<dim> indexer(box);
384 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
385 const auto nthreads_per_block = ec.numThreads.x;
386 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
387 auto& q = Gpu::Device::streamQueue();
388 try {
389 if (info.hasReduction()) {
390 q.submit([&] (sycl::handler& h) {
391 sycl::local_accessor<unsigned long long>
392 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
393 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
394 sycl::range<1>(nthreads_per_block)),
395 [=] (sycl::nd_item<1> item)
396 [[sycl::reqd_work_group_size(MT)]]
397 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
398 {
399 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
400 icell < indexer.numPts(); icell += stride) {
401 auto iv = indexer.intVect(icell);
402 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
403 std::uint64_t(item.get_local_range(0)));
404 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
405 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
406 n_active_threads});
407 }
408 });
409 });
410 } else {
411 q.submit([&] (sycl::handler& h) {
412 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
413 sycl::range<1>(nthreads_per_block)),
414 [=] (sycl::nd_item<1> item)
415 [[sycl::reqd_work_group_size(MT)]]
416 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
417 {
418 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
419 icell < indexer.numPts(); icell += stride) {
420 auto iv = indexer.intVect(icell);
421 detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
422 }
423 });
424 });
425 }
426 } catch (sycl::exception const& ex) {
427 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
428 }
429}
430
431template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
432void ParallelForRNG (T n, L const& f) noexcept
433{
434 if (amrex::isEmpty(n)) { return; }
435 const auto ec = Gpu::ExecutionConfig(n);
436 const auto nthreads_per_block = ec.numThreads.x;
437 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
438 auto& q = Gpu::Device::streamQueue();
439 auto& engdescr = *(getRandEngineDescriptor());
440 try {
441 q.submit([&] (sycl::handler& h) {
442 auto engine_acc = engdescr.get_access(h);
443 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
444 sycl::range<1>(nthreads_per_block)),
445 [=] (sycl::nd_item<1> item)
446 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
447 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
448 {
449 auto const tid = item.get_global_id(0);
450 auto engine = engine_acc.load(tid);
451 RandomEngine rand_eng{&engine};
452 for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
453 f(T(i),rand_eng);
454 }
455 engine_acc.store(engine, tid);
456 });
457 });
458 q.wait_and_throw(); // because next launch might be on a different queue
459 } catch (sycl::exception const& ex) {
460 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
461 }
462}
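// Usage sketch: ParallelForRNG gives every thread its own persistent random engine,
// and the implementations here synchronize afterwards so that another stream cannot
// race on the engine state. d_x is a placeholder device pointer.
//
//   amrex::ParallelForRNG(n,
//   [=] AMREX_GPU_DEVICE (int i, amrex::RandomEngine const& engine) noexcept
//   {
//       d_x[i] = amrex::Random(engine);     // uniform random number in [0,1)
//   });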
463
464template <typename L, int dim>
465void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
466{
467 if (amrex::isEmpty(box)) { return; }
468 const BoxIndexerND<dim> indexer(box);
469 const auto ec = Gpu::ExecutionConfig(box.numPts());
470 const auto nthreads_per_block = ec.numThreads.x;
471 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
472 auto& q = Gpu::Device::streamQueue();
473 auto& engdescr = *(getRandEngineDescriptor());
474 try {
475 q.submit([&] (sycl::handler& h) {
476 auto engine_acc = engdescr.get_access(h);
477 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
478 sycl::range<1>(nthreads_per_block)),
479 [=] (sycl::nd_item<1> item)
480 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
481 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
482 {
483 auto const tid = item.get_global_id(0);
484 auto engine = engine_acc.load(tid);
485 RandomEngine rand_eng{&engine};
486 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
487 icell < indexer.numPts(); icell += stride) {
488 auto iv = indexer.intVect(icell);
489 detail::call_f_intvect_engine(f,iv,rand_eng);
490 }
491 engine_acc.store(engine, tid);
492 });
493 });
494 q.wait_and_throw(); // because next launch might be on a different queue
495 } catch (sycl::exception const& ex) {
496 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
497 }
498}
499
500template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
501void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
502{
503 if (amrex::isEmpty(box)) { return; }
504 const BoxIndexerND<dim> indexer(box);
505 const auto ec = Gpu::ExecutionConfig(box.numPts());
506 const auto nthreads_per_block = ec.numThreads.x;
507 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
508 auto& q = Gpu::Device::streamQueue();
509 auto& engdescr = *(getRandEngineDescriptor());
510 try {
511 q.submit([&] (sycl::handler& h) {
512 auto engine_acc = engdescr.get_access(h);
513 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
514 sycl::range<1>(nthreads_per_block)),
515 [=] (sycl::nd_item<1> item)
516 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
517 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
518 {
519 auto const tid = item.get_global_id(0);
520 auto engine = engine_acc.load(tid);
521 RandomEngine rand_eng{&engine};
522 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
523 icell < indexer.numPts(); icell += stride) {
524 auto iv = indexer.intVect(icell);
525 detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
526 }
527 engine_acc.store(engine, tid);
528 });
529 });
530 q.wait_and_throw(); // because next launch might be on a different queue
531 } catch (sycl::exception const& ex) {
532 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
533 }
534}
535
536template <int MT, typename L1, typename L2, int dim>
537void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
538{
539 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
540 const BoxIndexerND<dim> indexer1(box1);
541 const BoxIndexerND<dim> indexer2(box2);
542 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
543 const auto nthreads_per_block = ec.numThreads.x;
544 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
545 auto& q = Gpu::Device::streamQueue();
546 try {
547 q.submit([&] (sycl::handler& h) {
548 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
549 sycl::range<1>(nthreads_per_block)),
550 [=] (sycl::nd_item<1> item)
551 [[sycl::reqd_work_group_size(MT)]]
552 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
553 {
554 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
555 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
556 icell < ncells; icell += stride) {
557 if (icell < indexer1.numPts()) {
558 auto iv = indexer1.intVect(icell);
559 detail::call_f_intvect(f1,iv);
560 }
561 if (icell < indexer2.numPts()) {
562 auto iv = indexer2.intVect(icell);
563 detail::call_f_intvect(f2,iv);
564 }
565 }
566 });
567 });
568 } catch (sycl::exception const& ex) {
569 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
570 }
571}
572
573template <int MT, typename L1, typename L2, typename L3, int dim>
574void ParallelFor (Gpu::KernelInfo const& /*info*/,
575 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
576 L1&& f1, L2&& f2, L3&& f3) noexcept
577{
578 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
579 const BoxIndexerND<dim> indexer1(box1);
580 const BoxIndexerND<dim> indexer2(box2);
581 const BoxIndexerND<dim> indexer3(box3);
582 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
583 const auto nthreads_per_block = ec.numThreads.x;
584 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
585 auto& q = Gpu::Device::streamQueue();
586 try {
587 q.submit([&] (sycl::handler& h) {
588 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
589 sycl::range<1>(nthreads_per_block)),
590 [=] (sycl::nd_item<1> item)
591 [[sycl::reqd_work_group_size(MT)]]
592 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
593 {
594 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
595 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
596 icell < ncells; icell += stride) {
597 if (icell < indexer1.numPts()) {
598 auto iv = indexer1.intVect(icell);
599 detail::call_f_intvect(f1,iv);
600 }
601 if (icell < indexer2.numPts()) {
602 auto iv = indexer2.intVect(icell);
603 detail::call_f_intvect(f2,iv);
604 }
605 if (icell < indexer3.numPts()) {
606 auto iv = indexer3.intVect(icell);
607 detail::call_f_intvect(f3,iv);
608 }
609 }
610 });
611 });
612 } catch (sycl::exception const& ex) {
613 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
614 }
615}
616
617template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
618 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
619 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
620void ParallelFor (Gpu::KernelInfo const& /*info*/,
621 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
622 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
623{
624 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
625 const BoxIndexerND<dim> indexer1(box1);
626 const BoxIndexerND<dim> indexer2(box2);
627 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
628 const auto nthreads_per_block = ec.numThreads.x;
629 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
630 auto& q = Gpu::Device::streamQueue();
631 try {
632 q.submit([&] (sycl::handler& h) {
633 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
634 sycl::range<1>(nthreads_per_block)),
635 [=] (sycl::nd_item<1> item)
636 [[sycl::reqd_work_group_size(MT)]]
637 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
638 {
639 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
640 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
641 icell < ncells; icell += stride) {
642 if (icell < indexer1.numPts()) {
643 auto iv = indexer1.intVect(icell);
644 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
645 }
646 if (icell < indexer2.numPts()) {
647 auto iv = indexer2.intVect(icell);
648 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
649 }
650 }
651 });
652 });
653 } catch (sycl::exception const& ex) {
654 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
655 }
656}
657
658template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
659 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
660 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
661 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
662void ParallelFor (Gpu::KernelInfo const& /*info*/,
663 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
664 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
665 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
666{
667 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
668 const BoxIndexerND<dim> indexer1(box1);
669 const BoxIndexerND<dim> indexer2(box2);
670 const BoxIndexerND<dim> indexer3(box3);
671 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
672 const auto nthreads_per_block = ec.numThreads.x;
673 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
674 auto& q = Gpu::Device::streamQueue();
675 try {
676 q.submit([&] (sycl::handler& h) {
677 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
678 sycl::range<1>(nthreads_per_block)),
679 [=] (sycl::nd_item<1> item)
680 [[sycl::reqd_work_group_size(MT)]]
681 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
682 {
683 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
684 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
685 icell < ncells; icell += stride) {
686 if (icell < indexer1.numPts()) {
687 auto iv = indexer1.intVect(icell);
688 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
689 }
690 if (icell < indexer2.numPts()) {
691 auto iv = indexer2.intVect(icell);
692 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
693 }
694 if (icell < indexer3.numPts()) {
695 auto iv = indexer3.intVect(icell);
696 detail::call_f_intvect_ncomp(f3,iv,ncomp3);
697 }
698 }
699 });
700 });
701 } catch (sycl::exception const& ex) {
702 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
703 }
704}
705
706#else
707// CUDA or HIP
708
709template <typename L>
710void single_task (gpuStream_t stream, L const& f) noexcept
711{
712 AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream,
713 [=] AMREX_GPU_DEVICE () noexcept {f();});
714 AMREX_GPU_ERROR_CHECK();
715}
716
717template <int MT, typename L>
718void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
719 L const& f) noexcept
720{
721 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream,
722 [=] AMREX_GPU_DEVICE () noexcept { f(); });
724}
725
726template <int MT, typename L>
727void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
728{
729 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream,
730 [=] AMREX_GPU_DEVICE () noexcept { f(); });
732}
733
734template<typename L>
735void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
736 gpuStream_t stream, L const& f) noexcept
737{
738 AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,
739 stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
741}
742
743template<typename L>
744void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
745{
746 launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
747}
748
749template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
750void launch (T const& n, L const& f) noexcept
751{
752 static_assert(sizeof(T) >= 2);
753 if (amrex::isEmpty(n)) { return; }
754 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
755 for (auto const& ec : nec) {
756 const T start_idx = T(ec.start_idx);
757 const T nleft = n - start_idx;
758 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
759 [=] AMREX_GPU_DEVICE () noexcept {
760 // This will not overflow, even though nblocks*MT might.
761 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
762 if (tid < nleft) {
763 f(tid+start_idx);
764 }
765 });
766 }
768}
769
770template<int MT, int dim, typename L>
771void launch (BoxND<dim> const& box, L const& f) noexcept
772{
773 if (box.isEmpty()) { return; }
774 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
775 const BoxIndexerND<dim> indexer(box);
776 const auto type = box.ixType();
777 for (auto const& ec : nec) {
778 const auto start_idx = std::uint64_t(ec.start_idx);
779 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
780 [=] AMREX_GPU_DEVICE () noexcept {
781 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
782 if (icell < indexer.numPts()) {
783 auto iv = indexer.intVect(icell);
784 f(BoxND<dim>(iv,iv,type));
785 }
786 });
787 }
789}
790
791template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
792std::enable_if_t<MaybeDeviceRunnable<L>::value>
793ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
794{
795 static_assert(sizeof(T) >= 2);
796 if (amrex::isEmpty(n)) { return; }
797 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
798 for (auto const& ec : nec) {
799 const T start_idx = T(ec.start_idx);
800 const T nleft = n - start_idx;
801 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
802 [=] AMREX_GPU_DEVICE () noexcept {
803 // This will not overflow, even though nblocks*MT might.
804 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
805 if (tid < nleft) {
806 detail::call_f_scalar_handler(f, tid+start_idx,
807 Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
808 (std::uint64_t)MT)));
809 }
810 });
811 }
813}
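// Usage sketch: the MT template parameter is the thread-block size used for the
// launch. The AMREX_GPU_MAX_THREADS wrappers near the end of this file supply a
// default, but it can be chosen explicitly, e.g. with 128 threads per block
// (d_a and d_b are placeholder device pointers):
//
//   amrex::ParallelFor<128>(n, [=] AMREX_GPU_DEVICE (int i) noexcept
//   {
//       d_a[i] += d_b[i];
//   });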
814
815template <int MT, typename L, int dim>
816std::enable_if_t<MaybeDeviceRunnable<L>::value>
817ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
818{
819 if (amrex::isEmpty(box)) { return; }
820 const BoxIndexerND<dim> indexer(box);
821 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
822 for (auto const& ec : nec) {
823 const auto start_idx = std::uint64_t(ec.start_idx);
824 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
825 [=] AMREX_GPU_DEVICE () noexcept {
826 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
827 if (icell < indexer.numPts()) {
828 auto iv = indexer.intVect(icell);
829 detail::call_f_intvect_handler(f, iv,
830 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
831 (std::uint64_t)MT)));
832 }
833 });
834 }
836}
837
838template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
839std::enable_if_t<MaybeDeviceRunnable<L>::value>
840ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
841{
842 if (amrex::isEmpty(box)) { return; }
843 const BoxIndexerND<dim> indexer(box);
844 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
845 for (auto const& ec : nec) {
846 const auto start_idx = std::uint64_t(ec.start_idx);
847 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
848 [=] AMREX_GPU_DEVICE () noexcept {
849 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
850 if (icell < indexer.numPts()) {
851 auto iv = indexer.intVect(icell);
852 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
853 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
854 (std::uint64_t)MT)));
855 }
856 });
857 }
859}
860
861template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
862std::enable_if_t<MaybeDeviceRunnable<L>::value>
863ParallelForRNG (T n, L const& f) noexcept
864{
865 if (amrex::isEmpty(n)) { return; }
866 randState_t* rand_state = getRandState();
867 const auto ec = Gpu::ExecutionConfig(n);
868 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
869 amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
870 ec.numThreads, 0, Gpu::gpuStream(),
871 [=] AMREX_GPU_DEVICE () noexcept {
872 Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
873 RandomEngine engine{&(rand_state[tid])};
874 for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
875 f(T(i),engine);
876 }
877 });
878 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
880}
881
882template <typename L, int dim>
883std::enable_if_t<MaybeDeviceRunnable<L>::value>
884ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
885{
886 if (amrex::isEmpty(box)) { return; }
887 randState_t* rand_state = getRandState();
888 const BoxIndexerND<dim> indexer(box);
889 const auto ec = Gpu::ExecutionConfig(box.numPts());
890 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
891 amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
892 ec.numThreads, 0, Gpu::gpuStream(),
893 [=] AMREX_GPU_DEVICE () noexcept {
894 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
895 RandomEngine engine{&(rand_state[tid])};
896 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
897 auto iv = indexer.intVect(icell);
898 detail::call_f_intvect_engine(f, iv, engine);
899 }
900 });
901 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
903}
904
905template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
906std::enable_if_t<MaybeDeviceRunnable<L>::value>
907ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
908{
909 if (amrex::isEmpty(box)) { return; }
910 randState_t* rand_state = getRandState();
911 const BoxIndexerND<dim> indexer(box);
912 const auto ec = Gpu::ExecutionConfig(box.numPts());
913 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
914 amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
915 ec.numThreads, 0, Gpu::gpuStream(),
916 [=] AMREX_GPU_DEVICE () noexcept {
917 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
918 RandomEngine engine{&(rand_state[tid])};
919 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
920 auto iv = indexer.intVect(icell);
921 detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
922 }
923 });
924 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
926}
927
928template <int MT, typename L1, typename L2, int dim>
929std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
930ParallelFor (Gpu::KernelInfo const& /*info*/,
931 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
932{
933 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
934 const BoxIndexerND<dim> indexer1(box1);
935 const BoxIndexerND<dim> indexer2(box2);
936 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
937 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
938 [=] AMREX_GPU_DEVICE () noexcept {
939 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
940 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
941 icell < ncells; icell += stride) {
942 if (icell < indexer1.numPts()) {
943 auto iv = indexer1.intVect(icell);
944 detail::call_f_intvect(f1,iv);
945 }
946 if (icell < indexer2.numPts()) {
947 auto iv = indexer2.intVect(icell);
948 detail::call_f_intvect(f2,iv);
949 }
950 }
951 });
953}
954
955template <int MT, typename L1, typename L2, typename L3, int dim>
956std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
957ParallelFor (Gpu::KernelInfo const& /*info*/,
958 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
959 L1&& f1, L2&& f2, L3&& f3) noexcept
960{
961 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
962 const BoxIndexerND<dim> indexer1(box1);
963 const BoxIndexerND<dim> indexer2(box2);
964 const BoxIndexerND<dim> indexer3(box3);
965 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
966 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
967 [=] AMREX_GPU_DEVICE () noexcept {
968 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
969 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
970 icell < ncells; icell += stride) {
971 if (icell < indexer1.numPts()) {
972 auto iv = indexer1.intVect(icell);
974 }
975 if (icell < indexer2.numPts()) {
976 auto iv = indexer2.intVect(icell);
978 }
979 if (icell < indexer3.numPts()) {
980 auto iv = indexer3.intVect(icell);
982 }
983 }
984 });
986}
987
988template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
989 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
990 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
991std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
992ParallelFor (Gpu::KernelInfo const& /*info*/,
993 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
994 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
995{
996 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
997 const BoxIndexerND<dim> indexer1(box1);
998 const BoxIndexerND<dim> indexer2(box2);
999 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1000 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1001 [=] AMREX_GPU_DEVICE () noexcept {
1002 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1003 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1004 icell < ncells; icell += stride) {
1005 if (icell < indexer1.numPts()) {
1006 auto iv = indexer1.intVect(icell);
1007 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1008 }
1009 if (icell < indexer2.numPts()) {
1010 auto iv = indexer2.intVect(icell);
1011 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1012 }
1013 }
1014 });
1016}
1017
1018template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1019 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1020 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1021 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1022std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1023ParallelFor (Gpu::KernelInfo const& /*info*/,
1024 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1025 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1026 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1027{
1028 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1029 const BoxIndexerND<dim> indexer1(box1);
1030 const BoxIndexerND<dim> indexer2(box2);
1031 const BoxIndexerND<dim> indexer3(box3);
1032 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1033 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1034 [=] AMREX_GPU_DEVICE () noexcept {
1035 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1036 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1037 icell < ncells; icell += stride) {
1038 if (icell < indexer1.numPts()) {
1039 auto iv = indexer1.intVect(icell);
1040 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1041 }
1042 if (icell < indexer2.numPts()) {
1043 auto iv = indexer2.intVect(icell);
1044 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1045 }
1046 if (icell < indexer3.numPts()) {
1047 auto iv = indexer3.intVect(icell);
1048 detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1049 }
1050 }
1051 });
1053}
1054
1055#endif
1056
1057template <typename L>
1058void single_task (L&& f) noexcept
1059{
1060 single_task(Gpu::gpuStream(), std::forward<L>(f));
1061}
1062
1063template<typename T, typename L>
1064void launch (T const& n, L&& f) noexcept
1065{
1066 launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1067}
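// Usage sketch: this convenience overload launches with the default block size and
// calls f(i) once for every i in [0, n) on the current GPU stream; it does not
// synchronize, so follow it with Gpu::streamSynchronize() before host access.
//
//   amrex::Gpu::DeviceVector<amrex::Real> v(n);
//   amrex::Real* d_v = v.data();
//   amrex::launch(n, [=] AMREX_GPU_DEVICE (int i) noexcept { d_v[i] = 0.0; });
//   amrex::Gpu::streamSynchronize();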
1068
1069template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1070std::enable_if_t<MaybeDeviceRunnable<L>::value>
1071ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1072{
1073 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1074}
1075
1076template <typename L, int dim>
1077std::enable_if_t<MaybeDeviceRunnable<L>::value>
1078ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1079{
1080 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1081}
1082
1083template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1084std::enable_if_t<MaybeDeviceRunnable<L>::value>
1085ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1086{
1087 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1088}
1089
1090template <typename L1, typename L2, int dim>
1091std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1092ParallelFor (Gpu::KernelInfo const& info,
1093 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1094{
1095 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1096 std::forward<L2>(f2));
1097}
1098
1099template <typename L1, typename L2, typename L3, int dim>
1100std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1101ParallelFor (Gpu::KernelInfo const& info,
1102 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1103 L1&& f1, L2&& f2, L3&& f3) noexcept
1104{
1105 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1106 std::forward<L2>(f2), std::forward<L3>(f3));
1107}
1108
1109template <typename T1, typename T2, typename L1, typename L2, int dim,
1110 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1111 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1112std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1113ParallelFor (Gpu::KernelInfo const& info,
1114 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1115 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1116{
1117 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1118 box2, ncomp2, std::forward<L2>(f2));
1119}
1120
1121template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1122 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1123 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1124 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1125std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1126ParallelFor (Gpu::KernelInfo const& info,
1127 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1128 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1129 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1130{
1131 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1132 box2, ncomp2, std::forward<L2>(f2),
1133 box3, ncomp3, std::forward<L3>(f3));
1134}
1135
1136template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1137void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1138{
1139 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1140}
1141
1142template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1143void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1144{
1145 ParallelFor<MT>(info, n,std::forward<L>(f));
1146}
1147
1148template <typename L, int dim>
1149void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1150{
1151 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1152}
1153
1154template <int MT, typename L, int dim>
1155void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1156{
1157 ParallelFor<MT>(info, box,std::forward<L>(f));
1158}
1159
1160template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1161void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1162{
1163 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1164}
1165
1166template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1167void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1168{
1169 ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1170}
1171
1172template <typename L1, typename L2, int dim>
1173void For (Gpu::KernelInfo const& info,
1174 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1175{
1176 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1177}
1178
1179template <int MT, typename L1, typename L2, int dim>
1180void For (Gpu::KernelInfo const& info,
1181 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1182{
1183 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1184}
1185
1186template <typename L1, typename L2, typename L3, int dim>
1187void For (Gpu::KernelInfo const& info,
1188 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1189 L1&& f1, L2&& f2, L3&& f3) noexcept
1190{
1191 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1192}
1193
1194template <int MT, typename L1, typename L2, typename L3, int dim>
1195void For (Gpu::KernelInfo const& info,
1196 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1197 L1&& f1, L2&& f2, L3&& f3) noexcept
1198{
1199 ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1200}
1201
1202template <typename T1, typename T2, typename L1, typename L2, int dim,
1203 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1204 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1205void For (Gpu::KernelInfo const& info,
1206 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1207 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1208{
1209 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1210}
1211
1212template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1213 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1214 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1215void For (Gpu::KernelInfo const& info,
1216 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1217 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1218{
1219 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1220}
1221
1222template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1223 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1224 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1225 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1226void For (Gpu::KernelInfo const& info,
1227 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1228 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1229 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1230{
1231 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1232 box1,ncomp1,std::forward<L1>(f1),
1233 box2,ncomp2,std::forward<L2>(f2),
1234 box3,ncomp3,std::forward<L3>(f3));
1235}
1236
1237template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1238 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1239 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1240 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1241void For (Gpu::KernelInfo const& info,
1242 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1243 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1244 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1245{
1246 ParallelFor<MT>(info,
1247 box1,ncomp1,std::forward<L1>(f1),
1248 box2,ncomp2,std::forward<L2>(f2),
1249 box3,ncomp3,std::forward<L3>(f3));
1250}
1251
1252template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1253void ParallelFor (T n, L&& f) noexcept
1254{
1255 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1256}
1257
1258template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1259void ParallelFor (T n, L&& f) noexcept
1260{
1261 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1262}
1263
1264template <typename L, int dim>
1265void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1266{
1267 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1268}
1269
1270template <int MT, typename L, int dim>
1271void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1272{
1273 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1274}
1275
1276template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1277void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1278{
1279 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1280}
1281
1282template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1283void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1284{
1285 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1286}
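// Usage sketch (the MultiFab mf and the work done on the Array4 are placeholders):
// the Box/ncomp overloads call f(i,j,k,n) for every cell of the box and every
// component, which is the usual pattern inside an MFIter loop.
//
//   for (amrex::MFIter mfi(mf, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi)
//   {
//       const amrex::Box& bx = mfi.tilebox();
//       auto const& a = mf.array(mfi);
//       amrex::ParallelFor(bx, mf.nComp(),
//       [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
//       {
//           a(i,j,k,n) *= 2.0;
//       });
//   }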
1287
1288template <typename L1, typename L2, int dim>
1289void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1290{
1291 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1292}
1293
1294template <int MT, typename L1, typename L2, int dim>
1295void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1296{
1297 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1298}
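// Usage sketch: the multi-box overloads fuse independent cell loops into a single
// kernel launch; each lambda runs only for cells inside its own box. Here xbx, ybx
// and the Array4s u, v are placeholders (e.g. two face-centered boxes).
//
//   amrex::ParallelFor(xbx, ybx,
//   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { u(i,j,k) = 0.0; },
//   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { v(i,j,k) = 0.0; });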
1299
1300template <typename L1, typename L2, typename L3, int dim>
1301void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1302 L1&& f1, L2&& f2, L3&& f3) noexcept
1303{
1304 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1305}
1306
1307template <int MT, typename L1, typename L2, typename L3, int dim>
1308void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1309 L1&& f1, L2&& f2, L3&& f3) noexcept
1310{
1311 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1312}
1313
1314template <typename T1, typename T2, typename L1, typename L2, int dim,
1315 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1316 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1317void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1318 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1319{
1320 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1321}
1322
1323template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1324 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1325 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1326void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1327 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1328{
1329 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1330}
1331
1332template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1333 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1334 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1335 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1336void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1337 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1338 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1339{
1340 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1341 box1,ncomp1,std::forward<L1>(f1),
1342 box2,ncomp2,std::forward<L2>(f2),
1343 box3,ncomp3,std::forward<L3>(f3));
1344}
1345
1346template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1347 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1348 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1349 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1350void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1351 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1352 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1353{
1354 ParallelFor<MT>(Gpu::KernelInfo{},
1355 box1,ncomp1,std::forward<L1>(f1),
1356 box2,ncomp2,std::forward<L2>(f2),
1357 box3,ncomp3,std::forward<L3>(f3));
1358}
1359
1360template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1361void For (T n, L&& f) noexcept
1362{
1363 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1364}
1365
1366template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1367void For (T n, L&& f) noexcept
1368{
1369 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1370}
1371
1372template <typename L, int dim>
1373void For (BoxND<dim> const& box, L&& f) noexcept
1374{
1375 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1376}
1377
1378template <int MT, typename L, int dim>
1379void For (BoxND<dim> const& box, L&& f) noexcept
1380{
1381 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1382}
1383
1384template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1385void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1386{
1387 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1388}
1389
1390template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1391void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1392{
1393 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1394}
1395
1396template <typename L1, typename L2, int dim>
1397void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1398{
1399 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1400}
1401
1402template <int MT, typename L1, typename L2, int dim>
1403void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1404{
1405 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1406}
1407
1408template <typename L1, typename L2, typename L3, int dim>
1409void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1410 L1&& f1, L2&& f2, L3&& f3) noexcept
1411{
1412 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1413}
1414
1415template <int MT, typename L1, typename L2, typename L3, int dim>
1416void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1417 L1&& f1, L2&& f2, L3&& f3) noexcept
1418{
1419 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1420}
1421
1422template <typename T1, typename T2, typename L1, typename L2, int dim,
1423 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1424 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1425void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1426 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1427{
1428 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1429}
1430
1431template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1432 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1433 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1434void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1435 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1436{
1437 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1438}
1439
1440template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1441 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1442 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1443 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1444void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1445 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1446 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1447{
1448 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1449 box1,ncomp1,std::forward<L1>(f1),
1450 box2,ncomp2,std::forward<L2>(f2),
1451 box3,ncomp3,std::forward<L3>(f3));
1452}
1453
1454template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1455 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1456 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1457 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1458void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1459 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1460 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1461{
1462 ParallelFor<MT>(Gpu::KernelInfo{},
1463 box1,ncomp1,std::forward<L1>(f1),
1464 box2,ncomp2,std::forward<L2>(f2),
1465 box3,ncomp3,std::forward<L3>(f3));
1466}
1467
1468template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1469std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1470HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1471{
1472 if (Gpu::inLaunchRegion()) {
1473 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1474 } else {
1475#ifdef AMREX_USE_SYCL
1476 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1477#else
1478 AMREX_PRAGMA_SIMD
1479 for (T i = 0; i < n; ++i) { f(i); }
1480#endif
1481 }
1482}
1483
1484template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1485std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1486HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1487{
1488 if (Gpu::inLaunchRegion()) {
1489 ParallelFor<MT>(info,n,std::forward<L>(f));
1490 } else {
1491#ifdef AMREX_USE_SYCL
1492 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1493#else
1494 AMREX_PRAGMA_SIMD
1495 for (T i = 0; i < n; ++i) { f(i); }
1496#endif
1497 }
1498}
1499
1500template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1501std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1502HostDeviceParallelFor (T n, L&& f) noexcept
1503{
1504 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1505}
1506
1507template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1508std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1509HostDeviceParallelFor (T n, L&& f) noexcept
1510{
1511 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1512}
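These host/device variants dispatch at run time: inside a GPU launch region they forward to ParallelFor, otherwise they fall back to a plain CPU loop, so the functor must be callable on both host and device. A minimal sketch (p is an assumed pointer of length n):

    amrex::HostDeviceParallelFor(n,
        [=] AMREX_GPU_HOST_DEVICE (int i) noexcept { p[i] = 0.0; });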
1513
1514template <typename L, int dim>
1515std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1516HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1517{
1518 if (Gpu::inLaunchRegion()) {
1519 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1520 } else {
1521#ifdef AMREX_USE_SYCL
1522 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1523#else
1524 LoopConcurrentOnCpu(box,std::forward<L>(f));
1525#endif
1526 }
1527}
1528
1529template <int MT, typename L, int dim>
1530std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1531HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1532{
1533 if (Gpu::inLaunchRegion()) {
1534 ParallelFor<MT>(info, box,std::forward<L>(f));
1535 } else {
1536#ifdef AMREX_USE_SYCL
1537 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1538#else
1539 LoopConcurrentOnCpu(box,std::forward<L>(f));
1540#endif
1541 }
1542}
1543
1544template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1545std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1546HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1547{
1548 if (Gpu::inLaunchRegion()) {
1549 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1550 } else {
1551#ifdef AMREX_USE_SYCL
1552 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1553#else
1554 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1555#endif
1556 }
1557}
1558
1559template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1560std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1561HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1562{
1563 if (Gpu::inLaunchRegion()) {
1564 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1565 } else {
1566#ifdef AMREX_USE_SYCL
1567 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1568#else
1569 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1570#endif
1571 }
1572}
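A sketch of the box-plus-component overload above (illustrative; bx, nc and a are assumed to exist in the caller):

    amrex::HostDeviceParallelFor(amrex::Gpu::KernelInfo{}, bx, nc,
        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept { a(i,j,k,n) *= 2.0; });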
1573
1574template <typename L1, typename L2, int dim>
1575std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1576HostDeviceParallelFor (Gpu::KernelInfo const& info,
1577 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1578{
1579 if (Gpu::inLaunchRegion()) {
1580 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1581 } else {
1582#ifdef AMREX_USE_SYCL
1583 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1584#else
1585 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1586 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1587#endif
1588 }
1589}
1590
1591template <int MT, typename L1, typename L2, int dim>
1592std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1593HostDeviceParallelFor (Gpu::KernelInfo const& info,
1594 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1595{
1596 if (Gpu::inLaunchRegion()) {
1597 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1598 } else {
1599#ifdef AMREX_USE_SYCL
1600 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1601#else
1602 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1603 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1604#endif
1605 }
1606}
1607
1608template <int MT, typename L1, typename L2, typename L3, int dim>
1609std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1610HostDeviceParallelFor (Gpu::KernelInfo const& info,
1611 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1612 L1&& f1, L2&& f2, L3&& f3) noexcept
1613{
1614 if (Gpu::inLaunchRegion()) {
1615 ParallelFor<MT>(info,box1,box2,box3,
1616 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1617 } else {
1618#ifdef AMREX_USE_SYCL
1619 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1620#else
1621 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1622 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1623 LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1624#endif
1625 }
1626}
1627
1628template <typename T1, typename T2, typename L1, typename L2, int dim,
1629 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1630 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1631std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1632HostDeviceParallelFor (Gpu::KernelInfo const& info,
1633 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1634 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1635{
1636 if (Gpu::inLaunchRegion()) {
1637 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1638 } else {
1639#ifdef AMREX_USE_SYCL
1640 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1641#else
1642 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1643 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1644#endif
1645 }
1646}
1647
1648template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1649 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1650 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1651std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1652HostDeviceParallelFor (Gpu::KernelInfo const& info,
1653 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1654 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1655{
1656 if (Gpu::inLaunchRegion()) {
1657 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1658 } else {
1659#ifdef AMREX_USE_SYCL
1660 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1661#else
1662 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1663 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1664#endif
1665 }
1666}
1667
1668template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1669 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1670 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1671 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1672std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1673HostDeviceParallelFor (Gpu::KernelInfo const& info,
1674 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1675 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1676 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1677{
1678 if (Gpu::inLaunchRegion()) {
1679 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1680 box1,ncomp1,std::forward<L1>(f1),
1681 box2,ncomp2,std::forward<L2>(f2),
1682 box3,ncomp3,std::forward<L3>(f3));
1683 } else {
1684#ifdef AMREX_USE_SYCL
1685 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1686#else
1687 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1688 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1689 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1690#endif
1691 }
1692}
1693
1694template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1695 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1696 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1697 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1698std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1699HostDeviceParallelFor (Gpu::KernelInfo const& info,
1700 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1701 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1702 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1703{
1704 if (Gpu::inLaunchRegion()) {
1705 ParallelFor<MT>(info,
1706 box1,ncomp1,std::forward<L1>(f1),
1707 box2,ncomp2,std::forward<L2>(f2),
1708 box3,ncomp3,std::forward<L3>(f3));
1709 } else {
1710#ifdef AMREX_USE_SYCL
1711 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1712#else
1713 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1714 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1715 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1716#endif
1717 }
1718}
1719
1720template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1721void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1722{
1723 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1724}
1725
1726template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1727void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1728{
1729 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
1730}
1731
1732template <typename L, int dim>
1733void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1734{
1735 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
1736}
1737
1738template <int MT, typename L, int dim>
1739void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1740{
1741 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
1742}
1743
1744template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1745void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1746{
1747 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1748}
1749
1750template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1751void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1752{
1753 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1754}
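The HostDeviceFor overloads are thin forwards to the corresponding HostDeviceParallelFor; a sketch of the single-box form (bx and a are assumed caller-side names):

    amrex::HostDeviceFor(amrex::Gpu::KernelInfo{}, bx,
        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { a(i,j,k) = 1.0; });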
1755
1756template <typename L1, typename L2, int dim>
1757void HostDeviceFor (Gpu::KernelInfo const& info,
1758 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1759{
1760 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1761}
1762
1763template <int MT, typename L1, typename L2, int dim>
1764void HostDeviceFor (Gpu::KernelInfo const& info,
1765 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1766{
1767 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1768}
1769
1770template <typename L1, typename L2, typename L3, int dim>
1771void HostDeviceFor (Gpu::KernelInfo const& info,
1772 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1773 L1&& f1, L2&& f2, L3&& f3) noexcept
1774{
1775 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
1776 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1777}
1778
1779template <int MT, typename L1, typename L2, typename L3, int dim>
1780void HostDeviceFor (Gpu::KernelInfo const& info,
1781 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1782 L1&& f1, L2&& f2, L3&& f3) noexcept
1783{
1784 HostDeviceParallelFor<MT>(info, box1,box2,box3,
1785 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1786}
1787
1788template <typename T1, typename T2, typename L1, typename L2, int dim,
1789 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1790 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1791void HostDeviceFor (Gpu::KernelInfo const& info,
1792 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1793 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1794{
1795 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1796}
1797
1798template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1799 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1800 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1801void HostDeviceFor (Gpu::KernelInfo const& info,
1802 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1803 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1804{
1805 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1806}
1807
1808template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1809 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1810 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1811 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1812void HostDeviceFor (Gpu::KernelInfo const& info,
1813 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1814 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1815 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1816{
1817 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
1818 box1,ncomp1,std::forward<L1>(f1),
1819 box2,ncomp2,std::forward<L2>(f2),
1820 box3,ncomp3,std::forward<L3>(f3));
1821}
1822
1823template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1824 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1825 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1826 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1827void HostDeviceFor (Gpu::KernelInfo const& info,
1828 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1829 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1830 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1831{
1832 HostDeviceParallelFor<MT>(info,
1833 box1,ncomp1,std::forward<L1>(f1),
1834 box2,ncomp2,std::forward<L2>(f2),
1835 box3,ncomp3,std::forward<L3>(f3));
1836}
1837
1838template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1839void HostDeviceParallelFor (T n, L&& f) noexcept
1840{
1841 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
1842}
1843
1844template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1845void HostDeviceParallelFor (T n, L&& f) noexcept
1846{
1847 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
1848}
1849
1850template <typename L, int dim>
1851void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1852{
1853 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
1854}
1855
1856template <int MT, typename L, int dim>
1857void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1858{
1859 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
1860}
1861
1862template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1863void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1864{
1865 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1866}
1867
1868template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1869void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1870{
1871 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1872}
1873
1874template <typename L1, typename L2, int dim>
1875void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1876{
1877 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1878}
1879
1880template <int MT, typename L1, typename L2, int dim>
1881void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1882{
1883 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1884}
1885
1886template <typename L1, typename L2, typename L3, int dim>
1887void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1888 L1&& f1, L2&& f2, L3&& f3) noexcept
1889{
1890 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
1891 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1892}
1893
1894template <int MT, typename L1, typename L2, typename L3, int dim>
1895void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1896 L1&& f1, L2&& f2, L3&& f3) noexcept
1897{
1898 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
1899 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1900}
1901
1902template <typename T1, typename T2, typename L1, typename L2, int dim,
1903 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1904 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1905void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1906 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1907{
1908 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1909}
1910
1911template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1912 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1913 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1914void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1915 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1916{
1917 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1918}
1919
1920template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1921 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1922 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1923 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1924void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1925 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1926 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1927{
1928 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1929 box1,ncomp1,std::forward<L1>(f1),
1930 box2,ncomp2,std::forward<L2>(f2),
1931 box3,ncomp3,std::forward<L3>(f3));
1932}
1933
1934template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1935 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1936 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1937 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1938void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1939 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1940 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1941{
1942 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
1943 box1,ncomp1,std::forward<L1>(f1),
1944 box2,ncomp2,std::forward<L2>(f2),
1945 box3,ncomp3,std::forward<L3>(f3));
1946}
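Each overload above also exists in a form whose leading MT template parameter replaces the default AMREX_GPU_MAX_THREADS thread count for the device launch; a sketch with a hypothetical value of 128 (bx and a assumed):

    amrex::HostDeviceParallelFor<128>(bx,
        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept { a(i,j,k) = 0.0; });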
1947
1948}
1949
1950#endif
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:133
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:35
#define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:46
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:197
static constexpr int warp_size
Definition AMReX_GpuDevice.H:194
Definition AMReX_GpuKernelInfo.H:8
Definition AMReX_IntVect.H:55
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:125
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:260
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:92
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:241
AMREX_FORCE_INLINE auto call_f_intvect_handler(F const &f, IntVectND< dim > iv) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv))
Definition AMReX_GpuLaunchFunctsC.H:93
AMREX_FORCE_INLINE auto call_f_intvect_ncomp_handler(F const &f, IntVectND< dim > iv, T n) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, n))
Definition AMReX_GpuLaunchFunctsC.H:121
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect_ncomp(F const &f, IntVectND< dim > iv, T ncomp) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, 0))
Definition AMReX_GpuLaunchFunctsG.H:103
AMREX_FORCE_INLINE auto call_f_scalar_handler(F const &f, N i) noexcept -> decltype(f(0))
Definition AMReX_GpuLaunchFunctsC.H:31
AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto call_f_intvect(F const &f, IntVectND< dim > iv) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv))
Definition AMReX_GpuLaunchFunctsG.H:65
AMREX_FORCE_INLINE auto call_f_intvect_engine(F const &f, IntVectND< dim > iv, RandomEngine engine) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, engine))
Definition AMReX_GpuLaunchFunctsC.H:83
AMREX_FORCE_INLINE auto call_f_intvect_ncomp_engine(F const &f, IntVectND< dim > iv, T n, RandomEngine engine) noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence< dim >(), f, iv, n, engine))
Definition AMReX_GpuLaunchFunctsC.H:111
AMREX_FORCE_INLINE auto call_f_intvect_inner(std::index_sequence< Ns... >, F const &f, IntVectND< 1 > iv, Args...args) noexcept -> decltype(f(0, 0, 0, args...))
Definition AMReX_GpuLaunchFunctsC.H:49
Definition AMReX_Amr.cpp:49
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:191
__host__ __device__ constexpr GpuTupleElement< I, GpuTuple< Ts... > >::type & get(GpuTuple< Ts... > &tup) noexcept
Definition AMReX_Tuple.H:179
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
randState_t * getRandState()
Definition AMReX_RandomEngine.H:65
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:21
void launch(T const &n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:138
void HostDeviceFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:912
void HostDeviceParallelFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:799
bool isEmpty(T n) noexcept
Definition AMReX_GpuRange.H:14
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:35
void single_task(L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1350
curandState_t randState_t
Definition AMReX_RandomEngine.H:58
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:378
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:152
AMREX_ATTRIBUTE_FLATTEN_FOR void ParallelForRNG(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1264
Definition AMReX_FabArrayCommI.H:1000
Definition AMReX_Box.H:2045
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2062
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2086
Definition AMReX_GpuLaunch.H:132
Definition AMReX_GpuTypes.H:86
Definition AMReX_RandomEngine.H:72