Block-Structured AMR Software Framework
AMReX_GpuLaunchFunctsG.H
1#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_
2#define AMREX_GPU_LAUNCH_FUNCTS_G_H_
3#include <AMReX_Config.H>
4
5namespace amrex {
6
8namespace detail {
9
10 // call_f_scalar_handler
11
12 template <typename F, typename N>
14 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const&)
15 noexcept -> decltype(f(0))
16 {
17 return f(i);
18 }
19
20 template <typename F, typename N>
22 auto call_f_scalar_handler (F const& f, N i, Gpu::Handler const& handler)
23 noexcept -> decltype(f(0,Gpu::Handler{}))
24 {
25 return f(i, handler);
26 }
27
28 // call_f_intvect_inner
29
30 template <typename F, std::size_t...Ns, class...Args>
32 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<1> iv, Args...args)
33 noexcept -> decltype(f(0, 0, 0, args...))
34 {
35 return f(iv[0], 0, 0, args...);
36 }
37
38 template <typename F, std::size_t...Ns, class...Args>
40 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<2> iv, Args...args)
41 noexcept -> decltype(f(0, 0, 0, args...))
42 {
43 return f(iv[0], iv[1], 0, args...);
44 }
45
46 template <typename F, int dim, std::size_t...Ns, class...Args>
48 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
49 noexcept -> decltype(f(iv, args...))
50 {
51 return f(iv, args...);
52 }
53
54 template <typename F, int dim, std::size_t...Ns, class...Args>
56 auto call_f_intvect_inner (std::index_sequence<Ns...>, F const& f, IntVectND<dim> iv, Args...args)
57 noexcept -> decltype(f(iv[Ns]..., args...))
58 {
59 return f(iv[Ns]..., args...);
60 }
61
62 // call_f_intvect
63
64 template <typename F, int dim>
66 auto call_f_intvect (F const& f, IntVectND<dim> iv)
67 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
68 {
69 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
70 }
71
72 // call_f_intvect_engine
73
74 template <typename F, int dim>
76 auto call_f_intvect_engine (F const& f, IntVectND<dim> iv, RandomEngine engine)
77 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
78 {
79 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine);
80 }
81
82 // call_f_intvect_handler
83
84 template <typename F, int dim>
86 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const&)
87 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
88 {
89 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
90 }
91
92 template <typename F, int dim>
94 auto call_f_intvect_handler (F const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
95 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
96 {
97 return call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, handler);
98 }
99
100 // call_f_intvect_ncomp
101
102 template <typename F, typename T, int dim>
104 auto call_f_intvect_ncomp (F const& f, IntVectND<dim> iv, T ncomp)
105 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
106 {
107 for (T n = 0; n < ncomp; ++n) {
108 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
109 }
110 }
111
112 // call_f_intvect_ncomp_engine
113
114 template <typename F, typename T, int dim>
116 auto call_f_intvect_ncomp_engine (F const& f, IntVectND<dim> iv, T ncomp, RandomEngine engine)
117 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
118 {
119 for (T n = 0; n < ncomp; ++n) {
120 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, engine);
121 }
122 }
123
124 // call_f_intvect_ncomp_handler
125
126 template <typename F, typename T, int dim>
128 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const&)
129 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
130 {
131 for (T n = 0; n < ncomp; ++n) {
132 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
133 }
134 }
135
136 template <typename F, typename T, int dim>
138 auto call_f_intvect_ncomp_handler (F const& f, IntVectND<dim> iv, T ncomp, Gpu::Handler const& handler)
139 noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
140 {
141 for (T n = 0; n < ncomp; ++n) {
142 call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n, handler);
143 }
144 }
145
146}
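
The detail helpers above exist so that one ParallelFor front end can accept several lambda signatures: a plain per-cell index lambda, an IntVectND lambda, and variants that additionally take a RandomEngine or a Gpu::Handler. A minimal sketch of the most common 3D form (not part of this header; assumes the usual AMReX headers, a valid Box bx, and an Array4 obtained from an FArrayBox):

    #include <AMReX_Gpu.H>        // ParallelFor, AMREX_GPU_DEVICE
    #include <AMReX_FArrayBox.H>

    void zero_box (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
    {
        // One GPU thread (or one CPU iteration) per cell of bx.
        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = 0.0;
        });
    }
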
148
149#ifdef AMREX_USE_SYCL
150
151template <typename L>
152void single_task (gpuStream_t stream, L const& f) noexcept
153{
154 auto& q = *(stream.queue);
155 try {
156 q.submit([&] (sycl::handler& h) {
157 h.single_task(f);
158 });
159 } catch (sycl::exception const& ex) {
160 amrex::Abort(std::string("single_task: ")+ex.what()+"!!!!!");
161 }
162}
163
164template<typename L>
165void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
166 gpuStream_t stream, L const& f) noexcept
167{
168 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
169 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
170 / sizeof(unsigned long long);
171 auto& q = *(stream.queue);
172 try {
173 q.submit([&] (sycl::handler& h) {
174 sycl::local_accessor<unsigned long long>
175 shared_data(sycl::range<1>(shared_mem_numull), h);
176 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
177 sycl::range<1>(nthreads_per_block)),
178 [=] (sycl::nd_item<1> item)
179 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
180 {
181 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
182 });
183 });
184 } catch (sycl::exception const& ex) {
185 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
186 }
187}
188
189template<typename L>
190void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
191{
192 const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
193 auto& q = *(stream.queue);
194 try {
195 q.submit([&] (sycl::handler& h) {
196 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
197 sycl::range<1>(nthreads_per_block)),
198 [=] (sycl::nd_item<1> item)
199 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
200 {
201 f(item);
202 });
203 });
204 } catch (sycl::exception const& ex) {
205 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
206 }
207}
208
209template <int MT, typename L>
210void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
211 L const& f) noexcept
212{
213 const auto nthreads_total = MT * std::size_t(nblocks);
214 const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
215 / sizeof(unsigned long long);
216 auto& q = *(stream.queue);
217 try {
218 q.submit([&] (sycl::handler& h) {
219 sycl::local_accessor<unsigned long long>
220 shared_data(sycl::range<1>(shared_mem_numull), h);
221 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
222 sycl::range<1>(MT)),
223 [=] (sycl::nd_item<1> item)
224 [[sycl::reqd_work_group_size(MT)]]
225 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
226 {
227 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
228 });
229 });
230 } catch (sycl::exception const& ex) {
231 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
232 }
233}
234
235template <int MT, typename L>
236void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
237{
238 const auto nthreads_total = MT * std::size_t(nblocks);
239 auto& q = *(stream.queue);
240 try {
241 q.submit([&] (sycl::handler& h) {
242 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
243 sycl::range<1>(MT)),
244 [=] (sycl::nd_item<1> item)
245 [[sycl::reqd_work_group_size(MT)]]
246 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
247 {
248 f(item);
249 });
250 });
251 } catch (sycl::exception const& ex) {
252 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
253 }
254}
255
256template<int MT, typename T, typename L>
257void launch (T const& n, L const& f) noexcept
258{
259 if (amrex::isEmpty(n)) { return; }
260 const auto ec = Gpu::makeExecutionConfig<MT>(n);
261 const auto nthreads_per_block = ec.numThreads.x;
262 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
263 auto& q = Gpu::Device::streamQueue();
264 try {
265 q.submit([&] (sycl::handler& h) {
266 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
267 sycl::range<1>(nthreads_per_block)),
268 [=] (sycl::nd_item<1> item)
269 [[sycl::reqd_work_group_size(MT)]]
270 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
271 {
272 for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
273 f(i);
274 }
275 });
276 });
277 } catch (sycl::exception const& ex) {
278 amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
279 }
280}
281
282template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
283void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
284{
285 if (amrex::isEmpty(n)) { return; }
286 const auto ec = Gpu::makeExecutionConfig<MT>(n);
287 const auto nthreads_per_block = ec.numThreads.x;
288 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
289 auto& q = Gpu::Device::streamQueue();
290 try {
291 if (info.hasReduction()) {
292 q.submit([&] (sycl::handler& h) {
293 sycl::local_accessor<unsigned long long>
294 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
295 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
296 sycl::range<1>(nthreads_per_block)),
297 [=] (sycl::nd_item<1> item)
298 [[sycl::reqd_work_group_size(MT)]]
299 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
300 {
301 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
302 i < std::size_t(n); i += stride) {
303 int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
304 item.get_local_range(0));
305 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
306 n_active_threads});
307 }
308 });
309 });
310 } else {
311 q.submit([&] (sycl::handler& h) {
312 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
313 sycl::range<1>(nthreads_per_block)),
314 [=] (sycl::nd_item<1> item)
315 [[sycl::reqd_work_group_size(MT)]]
316 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
317 {
318 for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
319 i < std::size_t(n); i += stride) {
320 detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
321 }
322 });
323 });
324 }
325 } catch (sycl::exception const& ex) {
326 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
327 }
328}
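
For reference, this 1D integer overload is what backs the common pattern of looping over a flat device array; a minimal sketch (not part of this header), assuming p points to n device-accessible Reals:

    void scale_all (amrex::Long n, amrex::Real* p)
    {
        amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex::Long i) noexcept
        {
            p[i] *= 2.0;
        });
    }
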
329
330template <int MT, typename L, int dim>
331void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f) noexcept
332{
333 if (amrex::isEmpty(box)) { return; }
334 const BoxIndexerND<dim> indexer(box);
335 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
336 const auto nthreads_per_block = ec.numThreads.x;
337 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
338 auto& q = Gpu::Device::streamQueue();
339 try {
340 if (info.hasReduction()) {
341 q.submit([&] (sycl::handler& h) {
342 sycl::local_accessor<unsigned long long>
343 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
344 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
345 sycl::range<1>(nthreads_per_block)),
346 [=] (sycl::nd_item<1> item)
347 [[sycl::reqd_work_group_size(MT)]]
348 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
349 {
350 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
351 icell < indexer.numPts(); icell += stride) {
352 auto iv = indexer.intVect(icell);
353 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
354 std::uint64_t(item.get_local_range(0)));
355 detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
356 n_active_threads});
357 }
358 });
359 });
360 } else {
361 q.submit([&] (sycl::handler& h) {
362 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
363 sycl::range<1>(nthreads_per_block)),
364 [=] (sycl::nd_item<1> item)
365 [[sycl::reqd_work_group_size(MT)]]
366 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
367 {
368 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
369 icell < indexer.numPts(); icell += stride) {
370 auto iv = indexer.intVect(icell);
371 detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
372 }
373 });
374 });
375 }
376 } catch (sycl::exception const& ex) {
377 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
378 }
379}
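
The info.hasReduction() branch above passes the lambda a Gpu::Handler describing the active threads, which AMReX's block-level reduction utilities consume. A hedged sketch of that calling convention (Gpu::KernelInfo::setReduction and Gpu::deviceReduceSum are assumed from elsewhere in AMReX; dsum is a device pointer pre-initialized to zero):

    void sum_box (amrex::Box const& bx, amrex::Array4<amrex::Real const> const& a,
                  amrex::Real* dsum)
    {
        amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), bx,
            [=] AMREX_GPU_DEVICE (int i, int j, int k, amrex::Gpu::Handler const& h) noexcept
            {
                amrex::Gpu::deviceReduceSum(dsum, a(i,j,k), h);
            });
    }
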
380
381template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
382void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f) noexcept
383{
384 if (amrex::isEmpty(box)) { return; }
385 const BoxIndexerND<dim> indexer(box);
386 const auto ec = Gpu::makeExecutionConfig<MT>(box.numPts());
387 const auto nthreads_per_block = ec.numThreads.x;
388 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
389 auto& q = Gpu::Device::streamQueue();
390 try {
391 if (info.hasReduction()) {
392 q.submit([&] (sycl::handler& h) {
393 sycl::local_accessor<unsigned long long>
394 shared_data(sycl::range<1>(Gpu::Device::warp_size), h);
395 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
396 sycl::range<1>(nthreads_per_block)),
397 [=] (sycl::nd_item<1> item)
398 [[sycl::reqd_work_group_size(MT)]]
399 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
400 {
401 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
402 icell < indexer.numPts(); icell += stride) {
403 auto iv = indexer.intVect(icell);
404 int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
405 std::uint64_t(item.get_local_range(0)));
406 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
407 Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
408 n_active_threads});
409 }
410 });
411 });
412 } else {
413 q.submit([&] (sycl::handler& h) {
414 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
415 sycl::range<1>(nthreads_per_block)),
416 [=] (sycl::nd_item<1> item)
417 [[sycl::reqd_work_group_size(MT)]]
418 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
419 {
420 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
421 icell < indexer.numPts(); icell += stride) {
422 auto iv = indexer.intVect(icell);
423 detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
424 }
425 });
426 });
427 }
428 } catch (sycl::exception const& ex) {
429 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
430 }
431}
432
433template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
434void ParallelForRNG (T n, L const& f) noexcept
435{
436 if (amrex::isEmpty(n)) { return; }
437 const auto ec = Gpu::ExecutionConfig(n);
438 const auto nthreads_per_block = ec.numThreads.x;
439 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
440 auto& q = Gpu::Device::streamQueue();
441 auto& engdescr = *(getRandEngineDescriptor());
442 try {
443 q.submit([&] (sycl::handler& h) {
444 auto engine_acc = engdescr.get_access(h);
445 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
446 sycl::range<1>(nthreads_per_block)),
447 [=] (sycl::nd_item<1> item)
448 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
449 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
450 {
451 auto const tid = item.get_global_id(0);
452 auto engine = engine_acc.load(tid);
453 RandomEngine rand_eng{&engine};
454 for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
455 f(T(i),rand_eng);
456 }
457 engine_acc.store(engine, tid);
458 });
459 });
460 q.wait_and_throw(); // because next launch might be on a different queue
461 } catch (sycl::exception const& ex) {
462 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
463 }
464}
465
466template <typename L, int dim>
467void ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
468{
469 if (amrex::isEmpty(box)) { return; }
470 const BoxIndexerND<dim> indexer(box);
471 const auto ec = Gpu::ExecutionConfig(box.numPts());
472 const auto nthreads_per_block = ec.numThreads.x;
473 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
474 auto& q = Gpu::Device::streamQueue();
475 auto& engdescr = *(getRandEngineDescriptor());
476 try {
477 q.submit([&] (sycl::handler& h) {
478 auto engine_acc = engdescr.get_access(h);
479 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
480 sycl::range<1>(nthreads_per_block)),
481 [=] (sycl::nd_item<1> item)
482 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
483 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
484 {
485 auto const tid = item.get_global_id(0);
486 auto engine = engine_acc.load(tid);
487 RandomEngine rand_eng{&engine};
488 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
489 icell < indexer.numPts(); icell += stride) {
490 auto iv = indexer.intVect(icell);
491 detail::call_f_intvect_engine(f,iv,rand_eng);
492 }
493 engine_acc.store(engine, tid);
494 });
495 });
496 q.wait_and_throw(); // because next launch might be on a different queue
497 } catch (sycl::exception const& ex) {
498 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
499 }
500}
501
502template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
503void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
504{
505 if (amrex::isEmpty(box)) { return; }
506 const BoxIndexerND<dim> indexer(box);
507 const auto ec = Gpu::ExecutionConfig(box.numPts());
508 const auto nthreads_per_block = ec.numThreads.x;
509 const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
510 auto& q = Gpu::Device::streamQueue();
511 auto& engdescr = *(getRandEngineDescriptor());
512 try {
513 q.submit([&] (sycl::handler& h) {
514 auto engine_acc = engdescr.get_access(h);
515 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
516 sycl::range<1>(nthreads_per_block)),
517 [=] (sycl::nd_item<1> item)
518 [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
519 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
520 {
521 auto const tid = item.get_global_id(0);
522 auto engine = engine_acc.load(tid);
523 RandomEngine rand_eng{&engine};
524 for (std::uint64_t icell = tid, stride = item.get_global_range(0);
525 icell < indexer.numPts(); icell += stride) {
526 auto iv = indexer.intVect(icell);
527 detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
528 }
529 engine_acc.store(engine, tid);
530 });
531 });
532 q.wait_and_throw(); // because next launch might be on a different queue
533 } catch (sycl::exception const& ex) {
534 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
535 }
536}
537
538template <int MT, typename L1, typename L2, int dim>
539void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
540{
541 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
542 const BoxIndexerND<dim> indexer1(box1);
543 const BoxIndexerND<dim> indexer2(box2);
544 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
545 const auto nthreads_per_block = ec.numThreads.x;
546 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
547 auto& q = Gpu::Device::streamQueue();
548 try {
549 q.submit([&] (sycl::handler& h) {
550 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
551 sycl::range<1>(nthreads_per_block)),
552 [=] (sycl::nd_item<1> item)
553 [[sycl::reqd_work_group_size(MT)]]
554 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
555 {
556 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
557 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
558 icell < ncells; icell += stride) {
559 if (icell < indexer1.numPts()) {
560 auto iv = indexer1.intVect(icell);
561 detail::call_f_intvect(f1,iv);
562 }
563 if (icell < indexer2.numPts()) {
564 auto iv = indexer2.intVect(icell);
565 detail::call_f_intvect(f2,iv);
566 }
567 }
568 });
569 });
570 } catch (sycl::exception const& ex) {
571 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
572 }
573}
574
575template <int MT, typename L1, typename L2, typename L3, int dim>
576void ParallelFor (Gpu::KernelInfo const& /*info*/,
577 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
578 L1&& f1, L2&& f2, L3&& f3) noexcept
579{
580 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
581 const BoxIndexerND<dim> indexer1(box1);
582 const BoxIndexerND<dim> indexer2(box2);
583 const BoxIndexerND<dim> indexer3(box3);
584 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
585 const auto nthreads_per_block = ec.numThreads.x;
586 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
587 auto& q = Gpu::Device::streamQueue();
588 try {
589 q.submit([&] (sycl::handler& h) {
590 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
591 sycl::range<1>(nthreads_per_block)),
592 [=] (sycl::nd_item<1> item)
593 [[sycl::reqd_work_group_size(MT)]]
594 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
595 {
596 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
597 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
598 icell < ncells; icell += stride) {
599 if (icell < indexer1.numPts()) {
600 auto iv = indexer1.intVect(icell);
601 detail::call_f_intvect(f1,iv);
602 }
603 if (icell < indexer2.numPts()) {
604 auto iv = indexer2.intVect(icell);
605 detail::call_f_intvect(f2,iv);
606 }
607 if (icell < indexer3.numPts()) {
608 auto iv = indexer3.intVect(icell);
609 detail::call_f_intvect(f3,iv);
610 }
611 }
612 });
613 });
614 } catch (sycl::exception const& ex) {
615 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
616 }
617}
618
619template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
620 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
621 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
622void ParallelFor (Gpu::KernelInfo const& /*info*/,
623 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
624 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
625{
626 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
627 const BoxIndexerND<dim> indexer1(box1);
628 const BoxIndexerND<dim> indexer2(box2);
629 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
630 const auto nthreads_per_block = ec.numThreads.x;
631 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
632 auto& q = Gpu::Device::streamQueue();
633 try {
634 q.submit([&] (sycl::handler& h) {
635 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
636 sycl::range<1>(nthreads_per_block)),
637 [=] (sycl::nd_item<1> item)
638 [[sycl::reqd_work_group_size(MT)]]
639 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
640 {
641 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
642 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
643 icell < ncells; icell += stride) {
644 if (icell < indexer1.numPts()) {
645 auto iv = indexer1.intVect(icell);
646 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
647 }
648 if (icell < indexer2.numPts()) {
649 auto iv = indexer2.intVect(icell);
650 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
651 }
652 }
653 });
654 });
655 } catch (sycl::exception const& ex) {
656 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
657 }
658}
659
660template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
661 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
662 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
663 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
664void ParallelFor (Gpu::KernelInfo const& /*info*/,
665 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
666 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
667 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
668{
669 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
670 const BoxIndexerND<dim> indexer1(box1);
671 const BoxIndexerND<dim> indexer2(box2);
672 const BoxIndexerND<dim> indexer3(box3);
673 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
674 const auto nthreads_per_block = ec.numThreads.x;
675 const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;
676 auto& q = Gpu::Device::streamQueue();
677 try {
678 q.submit([&] (sycl::handler& h) {
679 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
680 sycl::range<1>(nthreads_per_block)),
681 [=] (sycl::nd_item<1> item)
682 [[sycl::reqd_work_group_size(MT)]]
683 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
684 {
685 auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());
686 for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
687 icell < ncells; icell += stride) {
688 if (icell < indexer1.numPts()) {
689 auto iv = indexer1.intVect(icell);
690 detail::call_f_intvect_ncomp(f1,iv,ncomp1);
691 }
692 if (icell < indexer2.numPts()) {
693 auto iv = indexer2.intVect(icell);
694 detail::call_f_intvect_ncomp(f2,iv,ncomp2);
695 }
696 if (icell < indexer3.numPts()) {
697 auto iv = indexer3.intVect(icell);
698 detail::call_f_intvect_ncomp(f3,iv,ncomp3);
699 }
700 }
701 });
702 });
703 } catch (sycl::exception const& ex) {
704 amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");
705 }
706}
707
708#else
709// CUDA or HIP
710
711template <typename L>
712void single_task (gpuStream_t stream, L const& f) noexcept
713{
714 AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, f);
716}
717
718template <int MT, typename L>
719void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
720 L const& f) noexcept
721{
722 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, f);
724}
725
726template <int MT, typename L>
727void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
728{
729 AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, f);
731}
732
733template<typename L>
734void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
735 gpuStream_t stream, L const& f) noexcept
736{
737 AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes, stream, f);
739}
740
741template<typename L>
742void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
743{
744 launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
745}
746
747template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
748void launch (T const& n, L const& f) noexcept
749{
750 static_assert(sizeof(T) >= 2);
751 if (amrex::isEmpty(n)) { return; }
752 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
753 for (auto const& ec : nec) {
754 const T start_idx = T(ec.start_idx);
755 const T nleft = n - start_idx;
756 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
757 [=] AMREX_GPU_DEVICE () noexcept {
758 // This will not overflow, even though nblocks*MT might.
759 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
760 if (tid < nleft) {
761 f(tid+start_idx);
762 }
763 });
764 }
766}
767
768template<int MT, int dim, typename L>
769void launch (BoxND<dim> const& box, L const& f) noexcept
770{
771 if (box.isEmpty()) { return; }
772 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
773 const BoxIndexerND<dim> indexer(box);
774 const auto type = box.ixType();
775 for (auto const& ec : nec) {
776 const auto start_idx = std::uint64_t(ec.start_idx);
777 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
778 [=] AMREX_GPU_DEVICE () noexcept {
779 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
780 if (icell < indexer.numPts()) {
781 auto iv = indexer.intVect(icell);
782 f(BoxND<dim>(iv,iv,type));
783 }
784 });
785 }
787}
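
Note that this overload hands each thread a single-cell BoxND built from its cell, so a lambda written against the usual thread-box convention still works; a sketch (not part of this header), assuming a Box bx and an Array4:

    void zero_by_thread_box (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
    {
        amrex::launch(bx, [=] AMREX_GPU_DEVICE (amrex::Box const& tbx)
        {
            const auto lo = amrex::lbound(tbx);
            const auto hi = amrex::ubound(tbx);
            for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                a(i,j,k) = 0.0;
            }}}
        });
    }
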
788
793template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
794std::enable_if_t<MaybeDeviceRunnable<L>::value>
795ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
796{
797 static_assert(sizeof(T) >= 2);
798 if (amrex::isEmpty(n)) { return; }
799 const auto& nec = Gpu::makeNExecutionConfigs<MT>(n);
800 for (auto const& ec : nec) {
801 const T start_idx = T(ec.start_idx);
802 const T nleft = n - start_idx;
803 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
804 [=] AMREX_GPU_DEVICE () noexcept {
805 // This will not overflow, even though nblocks*MT might.
806 auto tid = T(MT)*T(blockIdx.x)+T(threadIdx.x);
807 if (tid < nleft) {
808 detail::call_f_scalar_handler(f, tid+start_idx,
809 Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
810 (std::uint64_t)MT)));
811 }
812 });
813 }
815}
816
821template <int MT, typename L, int dim>
822std::enable_if_t<MaybeDeviceRunnable<L>::value>
823ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
824{
825 if (amrex::isEmpty(box)) { return; }
826 const BoxIndexerND<dim> indexer(box);
827 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
828 for (auto const& ec : nec) {
829 const auto start_idx = std::uint64_t(ec.start_idx);
830 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
831 [=] AMREX_GPU_DEVICE () noexcept {
832 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
833 if (icell < indexer.numPts()) {
834 auto iv = indexer.intVect(icell);
835 detail::call_f_intvect_handler(f, iv,
836 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
837 (std::uint64_t)MT)));
838 }
839 });
840 }
842}
843
848template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
849std::enable_if_t<MaybeDeviceRunnable<L>::value>
850ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f) noexcept
851{
852 if (amrex::isEmpty(box)) { return; }
853 const BoxIndexerND<dim> indexer(box);
854 const auto& nec = Gpu::makeNExecutionConfigs<MT>(box);
855 for (auto const& ec : nec) {
856 const auto start_idx = std::uint64_t(ec.start_idx);
857 AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
858 [=] AMREX_GPU_DEVICE () noexcept {
859 auto icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + start_idx;
860 if (icell < indexer.numPts()) {
861 auto iv = indexer.intVect(icell);
862 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
863 Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
864 (std::uint64_t)MT)));
865 }
866 });
867 }
869}
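
The ncomp overloads run the component loop inside each thread (see call_f_intvect_ncomp_handler above); a minimal 4D sketch (not part of this header), assuming a multi-component Array4:

    void zero_components (amrex::Box const& bx, int ncomp,
                          amrex::Array4<amrex::Real> const& a)
    {
        amrex::ParallelFor(bx, ncomp,
            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
            {
                a(i,j,k,n) = 0.0;
            });
    }
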
870
876template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
877std::enable_if_t<MaybeDeviceRunnable<L>::value>
878ParallelForRNG (T n, L const& f) noexcept
879{
880 if (amrex::isEmpty(n)) { return; }
881 randState_t* rand_state = getRandState();
882 const auto ec = Gpu::ExecutionConfig(n);
883 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
884 amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
885 ec.numThreads, 0, Gpu::gpuStream(),
886 [=] AMREX_GPU_DEVICE () noexcept {
887 Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
888 RandomEngine engine{&(rand_state[tid])};
889 for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
890 f(T(i),engine);
891 }
892 });
893 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
895}
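
ParallelForRNG hands each thread a persistent RandomEngine and synchronizes the stream afterwards so that no two streams touch the RNG state at once; a minimal sketch (not part of this header), assuming p is a device pointer to n Reals:

    void fill_random (amrex::Long n, amrex::Real* p)
    {
        amrex::ParallelForRNG(n,
            [=] AMREX_GPU_DEVICE (amrex::Long i, amrex::RandomEngine const& engine) noexcept
            {
                p[i] = amrex::Random(engine); // uniformly distributed Real
            });
    }
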
896
902template <typename L, int dim>
903std::enable_if_t<MaybeDeviceRunnable<L>::value>
904ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
905{
906 if (amrex::isEmpty(box)) { return; }
907 randState_t* rand_state = getRandState();
908 const BoxIndexerND<dim> indexer(box);
909 const auto ec = Gpu::ExecutionConfig(box.numPts());
910 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
911 amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
912 ec.numThreads, 0, Gpu::gpuStream(),
913 [=] AMREX_GPU_DEVICE () noexcept {
914 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
915 RandomEngine engine{&(rand_state[tid])};
916 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
917 auto iv = indexer.intVect(icell);
918 detail::call_f_intvect_engine(f, iv, engine);
919 }
920 });
921 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
923}
924
930template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
931std::enable_if_t<MaybeDeviceRunnable<L>::value>
932ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
933{
934 if (amrex::isEmpty(box)) { return; }
935 randState_t* rand_state = getRandState();
936 const BoxIndexerND<dim> indexer(box);
937 const auto ec = Gpu::ExecutionConfig(box.numPts());
938 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
939 amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
940 ec.numThreads, 0, Gpu::gpuStream(),
941 [=] AMREX_GPU_DEVICE () noexcept {
942 auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
943 RandomEngine engine{&(rand_state[tid])};
944 for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
945 auto iv = indexer.intVect(icell);
946 detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
947 }
948 });
949 Gpu::streamSynchronize(); // To avoid multiple streams using RNG
951}
952
957template <int MT, typename L1, typename L2, int dim>
958std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
959ParallelFor (Gpu::KernelInfo const& /*info*/,
960 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
961{
962 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
963 const BoxIndexerND<dim> indexer1(box1);
964 const BoxIndexerND<dim> indexer2(box2);
965 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
966 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
967 [=] AMREX_GPU_DEVICE () noexcept {
968 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
969 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
970 icell < ncells; icell += stride) {
971 if (icell < indexer1.numPts()) {
972 auto iv = indexer1.intVect(icell);
973 detail::call_f_intvect(f1, iv);
974 }
975 if (icell < indexer2.numPts()) {
976 auto iv = indexer2.intVect(icell);
977 detail::call_f_intvect(f2, iv);
978 }
979 }
980 });
982}
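
The multi-box overloads share one launch across boxes of possibly different sizes, which is how face-based kernels avoid separate launches per direction; a sketch for two face boxes (not part of this header), assuming Array4s u and v:

    void zero_faces (amrex::Box const& xbx, amrex::Box const& ybx,
                     amrex::Array4<amrex::Real> const& u,
                     amrex::Array4<amrex::Real> const& v)
    {
        amrex::ParallelFor(xbx, ybx,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { u(i,j,k) = 0.0; },
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { v(i,j,k) = 0.0; });
    }
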
983
988template <int MT, typename L1, typename L2, typename L3, int dim>
989std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
990ParallelFor (Gpu::KernelInfo const& /*info*/,
991 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
992 L1&& f1, L2&& f2, L3&& f3) noexcept
993{
994 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
995 const BoxIndexerND<dim> indexer1(box1);
996 const BoxIndexerND<dim> indexer2(box2);
997 const BoxIndexerND<dim> indexer3(box3);
998 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
999 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1000 [=] AMREX_GPU_DEVICE () noexcept {
1001 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1002 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1003 icell < ncells; icell += stride) {
1004 if (icell < indexer1.numPts()) {
1005 auto iv = indexer1.intVect(icell);
1006 detail::call_f_intvect(f1, iv);
1007 }
1008 if (icell < indexer2.numPts()) {
1009 auto iv = indexer2.intVect(icell);
1010 detail::call_f_intvect(f2, iv);
1011 }
1012 if (icell < indexer3.numPts()) {
1013 auto iv = indexer3.intVect(icell);
1014 detail::call_f_intvect(f3, iv);
1015 }
1016 }
1017 });
1019}
1020
1025template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1026 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1027 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1028std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1029ParallelFor (Gpu::KernelInfo const& /*info*/,
1030 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1031 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1032{
1033 if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
1034 const BoxIndexerND<dim> indexer1(box1);
1035 const BoxIndexerND<dim> indexer2(box2);
1036 const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));
1037 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1038 [=] AMREX_GPU_DEVICE () noexcept {
1039 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1040 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1041 icell < ncells; icell += stride) {
1042 if (icell < indexer1.numPts()) {
1043 auto iv = indexer1.intVect(icell);
1044 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1045 }
1046 if (icell < indexer2.numPts()) {
1047 auto iv = indexer2.intVect(icell);
1048 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1049 }
1050 }
1051 });
1053}
1054
1059template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1060 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1061 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1062 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1063std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1064ParallelFor (Gpu::KernelInfo const& /*info*/,
1065 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1066 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1067 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1068{
1069 if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
1070 const BoxIndexerND<dim> indexer1(box1);
1071 const BoxIndexerND<dim> indexer2(box2);
1072 const BoxIndexerND<dim> indexer3(box3);
1073 const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));
1074 AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
1075 [=] AMREX_GPU_DEVICE () noexcept {
1076 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1077 for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
1078 icell < ncells; icell += stride) {
1079 if (icell < indexer1.numPts()) {
1080 auto iv = indexer1.intVect(icell);
1081 detail::call_f_intvect_ncomp(f1, iv, ncomp1);
1082 }
1083 if (icell < indexer2.numPts()) {
1084 auto iv = indexer2.intVect(icell);
1085 detail::call_f_intvect_ncomp(f2, iv, ncomp2);
1086 }
1087 if (icell < indexer3.numPts()) {
1088 auto iv = indexer3.intVect(icell);
1089 detail::call_f_intvect_ncomp(f3, iv, ncomp3);
1090 }
1091 }
1092 });
1094}
1095
1096#endif
1097
1098template <typename L>
1099void single_task (L&& f) noexcept
1100{
1101 single_task(Gpu::gpuStream(), std::forward<L>(f));
1102}
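
single_task queues a one-thread kernel on the current stream, which is handy for touching a few device words without a host round trip; a sketch (not part of this header), assuming dp is a device pointer:

    void set_flag (amrex::Real* dp)
    {
        amrex::single_task([=] AMREX_GPU_DEVICE () noexcept { dp[0] = 1.0; });
        amrex::Gpu::streamSynchronize(); // if the host needs the value right away
    }
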
1103
1104template<typename T, typename L>
1105void launch (T const& n, L&& f) noexcept
1106{
1107 launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));
1108}
1109
1114template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1115std::enable_if_t<MaybeDeviceRunnable<L>::value>
1116ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1117{
1118 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
1119}
1120
1125template <typename L, int dim>
1126std::enable_if_t<MaybeDeviceRunnable<L>::value>
1127ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1128{
1129 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
1130}
1131
1136template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1137std::enable_if_t<MaybeDeviceRunnable<L>::value>
1138ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1139{
1140 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
1141}
1142
1147template <typename L1, typename L2, int dim>
1148std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1149ParallelFor (Gpu::KernelInfo const& info,
1150 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1151{
1152 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),
1153 std::forward<L2>(f2));
1154}
1155
1160template <typename L1, typename L2, typename L3, int dim>
1161std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1162ParallelFor (Gpu::KernelInfo const& info,
1163 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1164 L1&& f1, L2&& f2, L3&& f3) noexcept
1165{
1166 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),
1167 std::forward<L2>(f2), std::forward<L3>(f3));
1168}
1169
1174template <typename T1, typename T2, typename L1, typename L2, int dim,
1175 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1176 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1177std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value>
1178ParallelFor (Gpu::KernelInfo const& info,
1179 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1180 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1181{
1182 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1183 box2, ncomp2, std::forward<L2>(f2));
1184}
1185
1190template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1191 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1192 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1193 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1194std::enable_if_t<MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value && MaybeDeviceRunnable<L3>::value>
1195ParallelFor (Gpu::KernelInfo const& info,
1196 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1197 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1198 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1199{
1200 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),
1201 box2, ncomp2, std::forward<L2>(f2),
1202 box3, ncomp3, std::forward<L3>(f3));
1203}
1204
1205template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1206void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1207{
1208 ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));
1209}
1210
1211template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1212void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1213{
1214 ParallelFor<MT>(info, n,std::forward<L>(f));
1215}
1216
1217template <typename L, int dim>
1218void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1219{
1220 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1221}
1222
1223template <int MT, typename L, int dim>
1224void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1225{
1226 ParallelFor<MT>(info, box,std::forward<L>(f));
1227}
1228
1229template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1230void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1231{
1232 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1233}
1234
1235template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1236void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1237{
1238 ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1239}
1240
1241template <typename L1, typename L2, int dim>
1242void For (Gpu::KernelInfo const& info,
1243 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1244{
1245 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1246}
1247
1248template <int MT, typename L1, typename L2, int dim>
1249void For (Gpu::KernelInfo const& info,
1250 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1251{
1252 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1253}
1254
1255template <typename L1, typename L2, typename L3, int dim>
1256void For (Gpu::KernelInfo const& info,
1257 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1258 L1&& f1, L2&& f2, L3&& f3) noexcept
1259{
1260 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1261}
1262
1263template <int MT, typename L1, typename L2, typename L3, int dim>
1264void For (Gpu::KernelInfo const& info,
1265 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1266 L1&& f1, L2&& f2, L3&& f3) noexcept
1267{
1268 ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1269}
1270
1271template <typename T1, typename T2, typename L1, typename L2, int dim,
1272 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1273 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1274void For (Gpu::KernelInfo const& info,
1275 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1276 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1277{
1278 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1279}
1280
1281template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1282 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1283 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1284void For (Gpu::KernelInfo const& info,
1285 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1286 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1287{
1288 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1289}
1290
1291template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1292 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1293 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1294 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1295void For (Gpu::KernelInfo const& info,
1296 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1297 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1298 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1299{
1300 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1301 box1,ncomp1,std::forward<L1>(f1),
1302 box2,ncomp2,std::forward<L2>(f2),
1303 box3,ncomp3,std::forward<L3>(f3));
1304}
1305
1306template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1307 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1308 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1309 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1310void For (Gpu::KernelInfo const& info,
1311 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1312 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1313 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1314{
1315 ParallelFor<MT>(info,
1316 box1,ncomp1,std::forward<L1>(f1),
1317 box2,ncomp2,std::forward<L2>(f2),
1318 box3,ncomp3,std::forward<L3>(f3));
1319}
1320
1325template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1326void ParallelFor (T n, L&& f) noexcept
1327{
1328 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1329}
1330
1335template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1336void ParallelFor (T n, L&& f) noexcept
1337{
1338 ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1339}
1340
1345template <typename L, int dim>
1346void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1347{
1348 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1349}
1350
1355template <int MT, typename L, int dim>
1356void ParallelFor (BoxND<dim> const& box, L&& f) noexcept
1357{
1358 ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));
1359}
1360
1365template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1366void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1367{
1368 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1369}
1370
1375template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1376void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1377{
1378 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1379}
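
The <MT> overloads let callers cap the block size below AMREX_GPU_MAX_THREADS, e.g. for register-heavy kernels; a sketch with a 128-thread block (not part of this header):

    void scale_components (amrex::Box const& bx, int ncomp,
                           amrex::Array4<amrex::Real> const& a)
    {
        amrex::ParallelFor<128>(bx, ncomp,
            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
            {
                a(i,j,k,n) *= 2.0;
            });
    }
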
1380
1385template <typename L1, typename L2, int dim>
1386void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1387{
1388 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1389}
1390
1395template <int MT, typename L1, typename L2, int dim>
1396void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1397{
1398 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1399}
1400
1405template <typename L1, typename L2, typename L3, int dim>
1406void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1407 L1&& f1, L2&& f2, L3&& f3) noexcept
1408{
1409 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1410}
1411
1416template <int MT, typename L1, typename L2, typename L3, int dim>
1417void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1418 L1&& f1, L2&& f2, L3&& f3) noexcept
1419{
1420 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1421}
1422
1427template <typename T1, typename T2, typename L1, typename L2, int dim,
1428 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1429 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1430void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1431 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1432{
1433 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1434}
1435
1440template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1441 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1442 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1443void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1444 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1445{
1446 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1447}
1448
1453template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1454 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1455 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1456 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1457void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1458 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1459 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1460{
1461 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1462 box1,ncomp1,std::forward<L1>(f1),
1463 box2,ncomp2,std::forward<L2>(f2),
1464 box3,ncomp3,std::forward<L3>(f3));
1465}
1466
1471template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1472 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1473 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1474 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1475void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1476 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1477 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1478{
1479 ParallelFor<MT>(Gpu::KernelInfo{},
1480 box1,ncomp1,std::forward<L1>(f1),
1481 box2,ncomp2,std::forward<L2>(f2),
1482 box3,ncomp3,std::forward<L3>(f3));
1483}
1484
1485template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1486void For (T n, L&& f) noexcept
1487{
1488 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1489}
1490
1491template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1492void For (T n, L&& f) noexcept
1493{
1494 ParallelFor<MT>(Gpu::KernelInfo{}, n,std::forward<L>(f));
1495}
1496
1497template <typename L, int dim>
1498void For (BoxND<dim> const& box, L&& f) noexcept
1499{
1500 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1501}
1502
1503template <int MT, typename L, int dim>
1504void For (BoxND<dim> const& box, L&& f) noexcept
1505{
1506 ParallelFor<MT>(Gpu::KernelInfo{}, box,std::forward<L>(f));
1507}
1508
1509template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1510void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1511{
1512 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1513}
1514
1515template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1516void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1517{
1518 ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1519}
1520
1521template <typename L1, typename L2, int dim>
1522void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1523{
1524 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1525}
1526
1527template <int MT, typename L1, typename L2, int dim>
1528void For (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1529{
1530 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1531}
1532
1533template <typename L1, typename L2, typename L3, int dim>
1534void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1535 L1&& f1, L2&& f2, L3&& f3) noexcept
1536{
1537 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1538}
1539
1540template <int MT, typename L1, typename L2, typename L3, int dim>
1541void For (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1542 L1&& f1, L2&& f2, L3&& f3) noexcept
1543{
1544 ParallelFor<MT>(Gpu::KernelInfo{},box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1545}
1546
1547template <typename T1, typename T2, typename L1, typename L2, int dim,
1548 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1549 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1550void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1551 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1552{
1553 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1554}
1555
1556template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1557 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1558 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1559void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1560 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1561{
1562 ParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1563}
1564
1565template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1566 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1567 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1568 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1569void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1570 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1571 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1572{
1573 ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
1574 box1,ncomp1,std::forward<L1>(f1),
1575 box2,ncomp2,std::forward<L2>(f2),
1576 box3,ncomp3,std::forward<L3>(f3));
1577}
1578
1579template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1580 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1581 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1582 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1583void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1584 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1585 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1586{
1587 ParallelFor<MT>(Gpu::KernelInfo{},
1588 box1,ncomp1,std::forward<L1>(f1),
1589 box2,ncomp2,std::forward<L2>(f2),
1590 box3,ncomp3,std::forward<L3>(f3));
1591}
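// Usage sketch (illustrative): the box-plus-ncomp overloads take (box, ncomp, lambda)
// triples, and each lambda receives (i,j,k,n) for its own box. 'bxA', 'bxB', 'ncA',
// 'ncB', 'a', and 'b' are assumed names.
//
//     amrex::For(bxA, ncA,
//                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept { a(i,j,k,n) = 0.0; },
//                bxB, ncB,
//                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept { b(i,j,k,n) = 0.0; });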
1592
1593template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1594std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1595HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1596{
1597 if (Gpu::inLaunchRegion()) {
1598 ParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1599 } else {
1600#ifdef AMREX_USE_SYCL
1601 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1602#else
1603 AMREX_PRAGMA_SIMD
1604 for (T i = 0; i < n; ++i) { f(i); }
1605#endif
1606 }
1607}
1608
1609template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1610std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1611HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1612{
1613 if (Gpu::inLaunchRegion()) {
1614 ParallelFor<MT>(info,n,std::forward<L>(f));
1615 } else {
1616#ifdef AMREX_USE_SYCL
1617 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1618#else
1619 AMREX_PRAGMA_SIMD
1620 for (T i = 0; i < n; ++i) { f(i); }
1621#endif
1622 }
1623}
1624
1625template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1626std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1627HostDeviceParallelFor (T n, L&& f) noexcept
1628{
1629 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1630}
1631
1632template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1633std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1634HostDeviceParallelFor (T n, L&& f) noexcept
1635{
1636 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));
1637}
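// Usage sketch (illustrative): HostDeviceParallelFor needs a functor callable on both
// host and device. Inside a GPU launch region it becomes a device launch; otherwise it
// falls back to the serial loop above. 'n' and the pointer 'p' are assumed names.
//
//     amrex::HostDeviceParallelFor(n, [=] AMREX_GPU_HOST_DEVICE (int i) noexcept
//     {
//         p[i] = 0.0;
//     });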
1638
1639template <typename L, int dim>
1640std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1641HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1642{
1643 if (Gpu::inLaunchRegion()) {
1644 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));
1645 } else {
1646#ifdef AMREX_USE_SYCL
1647 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1648#else
1649 LoopConcurrentOnCpu(box,std::forward<L>(f));
1650#endif
1651 }
1652}
1653
1654template <int MT, typename L, int dim>
1655std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1656HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1657{
1658 if (Gpu::inLaunchRegion()) {
1659 ParallelFor<MT>(info, box,std::forward<L>(f));
1660 } else {
1661#ifdef AMREX_USE_SYCL
1662 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1663#else
1664 LoopConcurrentOnCpu(box,std::forward<L>(f));
1665#endif
1666 }
1667}
1668
1669template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1670std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1671HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1672{
1673 if (Gpu::inLaunchRegion()) {
1674 ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,ncomp,std::forward<L>(f));
1675 } else {
1676#ifdef AMREX_USE_SYCL
1677 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1678#else
1679 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1680#endif
1681 }
1682}
1683
1684template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1685std::enable_if_t<MaybeHostDeviceRunnable<L>::value>
1686HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1687{
1688 if (Gpu::inLaunchRegion()) {
1689 ParallelFor<MT>(info, box,ncomp,std::forward<L>(f));
1690 } else {
1691#ifdef AMREX_USE_SYCL
1692 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1693#else
1694 LoopConcurrentOnCpu(box,ncomp,std::forward<L>(f));
1695#endif
1696 }
1697}
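// Usage sketch (illustrative): box-plus-ncomp form of the host/device path. The lambda
// must again be AMREX_GPU_HOST_DEVICE so the CPU fallback can call it. 'bx', 'ncomp',
// and 'a' are assumed names.
//
//     amrex::HostDeviceParallelFor(amrex::Gpu::KernelInfo{}, bx, ncomp,
//         [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept
//     {
//         a(i,j,k,n) = 0.0;
//     });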
1698
1699template <typename L1, typename L2, int dim>
1700std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1701 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1702 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1703{
1704 if (Gpu::inLaunchRegion()) {
1705 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1706 } else {
1707#ifdef AMREX_USE_SYCL
1708 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1709#else
1710 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1711 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1712#endif
1713 }
1714}
1715
1716template <int MT, typename L1, typename L2, int dim>
1717std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1718 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1719 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1720{
1721 if (Gpu::inLaunchRegion()) {
1722 ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1723 } else {
1724#ifdef AMREX_USE_SYCL
1725 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1726#else
1727 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1728 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1729#endif
1730 }
1731}
1732
1733template <int MT, typename L1, typename L2, typename L3, int dim>
1734std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1735 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1736 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1737 L1&& f1, L2&& f2, L3&& f3) noexcept
1738{
1739 if (Gpu::inLaunchRegion()) {
1740 ParallelFor<MT>(info,box1,box2,box3,
1741 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1742 } else {
1743#ifdef AMREX_USE_SYCL
1744 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1745#else
1746 LoopConcurrentOnCpu(box1,std::forward<L1>(f1));
1747 LoopConcurrentOnCpu(box2,std::forward<L2>(f2));
1748 LoopConcurrentOnCpu(box3,std::forward<L3>(f3));
1749#endif
1750 }
1751}
1752
1753template <typename T1, typename T2, typename L1, typename L2, int dim,
1754 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1755 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1756std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1757 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1758 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1759 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1760{
1761 if (Gpu::inLaunchRegion()) {
1762 ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1763 } else {
1764#ifdef AMREX_USE_SYCL
1765 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1766#else
1767 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1768 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1769#endif
1770 }
1771}
1772
1773template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1774 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1775 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1776std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value>
1777 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1778 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1779 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1780{
1781 if (Gpu::inLaunchRegion()) {
1782 ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1783 } else {
1784#ifdef AMREX_USE_SYCL
1785 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1786#else
1787 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1788 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1789#endif
1790 }
1791}
1792
1793template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1794 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1795 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1796 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1797std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1798 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1799 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1800 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1801 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1802{
1803 if (Gpu::inLaunchRegion()) {
1804 ParallelFor<AMREX_GPU_MAX_THREADS>(info,
1805 box1,ncomp1,std::forward<L1>(f1),
1806 box2,ncomp2,std::forward<L2>(f2),
1807 box3,ncomp3,std::forward<L3>(f3));
1808 } else {
1809#ifdef AMREX_USE_SYCL
1810 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1811#else
1812 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1813 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1814 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1815#endif
1816 }
1817}
1818
1819template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1820 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1821 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1822 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1823std::enable_if_t<MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value && MaybeHostDeviceRunnable<L3>::value>
1824 HostDeviceParallelFor (Gpu::KernelInfo const& info,
1825 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1826 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1827 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1828{
1829 if (Gpu::inLaunchRegion()) {
1830 ParallelFor<MT>(info,
1831 box1,ncomp1,std::forward<L1>(f1),
1832 box2,ncomp2,std::forward<L2>(f2),
1833 box3,ncomp3,std::forward<L3>(f3));
1834 } else {
1835#ifdef AMREX_USE_SYCL
1836 amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile");
1837#else
1838 LoopConcurrentOnCpu(box1,ncomp1,std::forward<L1>(f1));
1839 LoopConcurrentOnCpu(box2,ncomp2,std::forward<L2>(f2));
1840 LoopConcurrentOnCpu(box3,ncomp3,std::forward<L3>(f3));
1841#endif
1842 }
1843}
1844
1845template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1846void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1847{
1848 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,n,std::forward<L>(f));
1849}
1850
1851template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1852void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
1853{
1854 HostDeviceParallelFor<MT>(info,n,std::forward<L>(f));
1855}
1856
1857template <typename L, int dim>
1858void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1859{
1860 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,std::forward<L>(f));
1861}
1862
1863template <int MT, typename L, int dim>
1864void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept
1865{
1866 HostDeviceParallelFor<MT>(info,box,std::forward<L>(f));
1867}
1868
1869template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1870void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1871{
1872 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));
1873}
1874
1875template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1876void HostDeviceFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept
1877{
1878 HostDeviceParallelFor<MT>(info,box,ncomp,std::forward<L>(f));
1879}
1880
1881template <typename L1, typename L2, int dim>
1882void HostDeviceFor (Gpu::KernelInfo const& info,
1883 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1884{
1885 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1886}
1887
1888template <int MT, typename L1, typename L2, int dim>
1889void HostDeviceFor (Gpu::KernelInfo const& info,
1890 BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
1891{
1892 HostDeviceParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
1893}
1894
1895template <typename L1, typename L2, typename L3, int dim>
1896void HostDeviceFor (Gpu::KernelInfo const& info,
1897 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1898 L1&& f1, L2&& f2, L3&& f3) noexcept
1899{
1900 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info, box1,box2,box3,
1901 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1902}
1903
1904template <int MT, typename L1, typename L2, typename L3, int dim>
1905void HostDeviceFor (Gpu::KernelInfo const& info,
1906 BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
1907 L1&& f1, L2&& f2, L3&& f3) noexcept
1908{
1909 HostDeviceParallelFor<MT>(info, box1,box2,box3,
1910 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
1911}
1912
1913template <typename T1, typename T2, typename L1, typename L2, int dim,
1914 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1915 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1916void HostDeviceFor (Gpu::KernelInfo const& info,
1917 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1918 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1919{
1920 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1921}
1922
1923template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
1924 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1925 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
1926void HostDeviceFor (Gpu::KernelInfo const& info,
1927 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1928 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
1929{
1930 HostDeviceParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
1931}
1932
1933template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1934 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1935 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1936 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1937void HostDeviceFor (Gpu::KernelInfo const& info,
1938 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1939 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1940 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1941{
1942 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
1943 box1,ncomp1,std::forward<L1>(f1),
1944 box2,ncomp2,std::forward<L2>(f2),
1945 box3,ncomp3,std::forward<L3>(f3));
1946}
1947
1948template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
1949 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
1950 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
1951 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
1952void HostDeviceFor (Gpu::KernelInfo const& info,
1953 BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
1954 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
1955 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
1956{
1957 HostDeviceParallelFor<MT>(info,
1958 box1,ncomp1,std::forward<L1>(f1),
1959 box2,ncomp2,std::forward<L2>(f2),
1960 box3,ncomp3,std::forward<L3>(f3));
1961}
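// Usage sketch (illustrative): HostDeviceFor is the For-style spelling; each overload
// simply forwards to the matching HostDeviceParallelFor. 'bx' and 'a' are assumed names.
//
//     amrex::HostDeviceFor(amrex::Gpu::KernelInfo{}, bx,
//         [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept
//     {
//         a(i,j,k) = 0.0;
//     });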
1962
1963template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1964void HostDeviceParallelFor (T n, L&& f) noexcept
1965{
1966 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},n,std::forward<L>(f));
1967}
1968
1969template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
1970void HostDeviceParallelFor (T n, L&& f) noexcept
1971{
1972 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},n,std::forward<L>(f));
1973}
1974
1975template <typename L, int dim>
1976void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1977{
1978 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,std::forward<L>(f));
1979}
1980
1981template <int MT, typename L, int dim>
1982void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
1983{
1984 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,std::forward<L>(f));
1985}
1986
1987template <typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1988void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1989{
1990 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1991}
1992
1993template <int MT, typename T, typename L, int dim, typename M=std::enable_if_t<std::is_integral_v<T>> >
1994void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
1995{
1996 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));
1997}
1998
1999template <typename L1, typename L2, int dim>
2000void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2001{
2002 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2003}
2004
2005template <int MT, typename L1, typename L2, int dim>
2006void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept
2007{
2008 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));
2009}
2010
2011template <typename L1, typename L2, typename L3, int dim>
2012void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2013 L1&& f1, L2&& f2, L3&& f3) noexcept
2014{
2015 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box1,box2,box3,
2016 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2017}
2018
2019template <int MT, typename L1, typename L2, typename L3, int dim>
2020void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
2021 L1&& f1, L2&& f2, L3&& f3) noexcept
2022{
2023 HostDeviceParallelFor<MT>(Gpu::KernelInfo{}, box1,box2,box3,
2024 std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));
2025}
2026
2027template <typename T1, typename T2, typename L1, typename L2, int dim,
2028 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2029 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2030void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2031 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2032{
2033 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2034}
2035
2036template <int MT, typename T1, typename T2, typename L1, typename L2, int dim,
2037 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2038 typename M2=std::enable_if_t<std::is_integral_v<T2>> >
2039void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2040 BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
2041{
2042 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));
2043}
2044
2045template <typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2046 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2047 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2048 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2049void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2050 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2051 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2052{
2053 HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
2054 box1,ncomp1,std::forward<L1>(f1),
2055 box2,ncomp2,std::forward<L2>(f2),
2056 box3,ncomp3,std::forward<L3>(f3));
2057}
2058
2059template <int MT, typename T1, typename T2, typename T3, typename L1, typename L2, typename L3, int dim,
2060 typename M1=std::enable_if_t<std::is_integral_v<T1>>,
2061 typename M2=std::enable_if_t<std::is_integral_v<T2>>,
2062 typename M3=std::enable_if_t<std::is_integral_v<T3>> >
2063void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
2064 BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
2065 BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
2066{
2067 HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
2068 box1,ncomp1,std::forward<L1>(f1),
2069 box2,ncomp2,std::forward<L2>(f2),
2070 box3,ncomp3,std::forward<L3>(f3));
2071}
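// Usage sketch (illustrative): the overloads without a Gpu::KernelInfo argument are
// convenience wrappers that pass a default-constructed KernelInfo. 'bx' and 'a' are
// assumed names.
//
//     amrex::HostDeviceParallelFor(bx,
//         [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) noexcept
//     {
//         a(i,j,k) = 0.0;
//     });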
2072
2073}
2074
2075#endif