Block-Structured AMR Software Framework
 
AMReX_Reduce.H
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_Arena.H>
7#include <AMReX_OpenMP.H>
8#include <AMReX_MFIter.H>
9#include <AMReX_ValLocPair.H>
10
11#include <algorithm>
12#include <functional>
13#include <limits>
14
15namespace amrex {
16
18namespace Reduce::detail {
19
20#ifdef AMREX_USE_GPU
21#ifdef AMREX_USE_SYCL
22 template <std::size_t I, typename T, typename P>
24 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
25 {
26 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
27 }
28
29 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
31 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
32 {
33 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
34 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
35 }
36#else
37 template <std::size_t I, typename T, typename P>
39 void for_each_parallel (T& d, T const& s)
40 {
41 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
42 }
43
44 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
46 void for_each_parallel (T& d, T const& s)
47 {
48 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
49 for_each_parallel<I+1,T,P1,Ps...>(d, s);
50 }
51#endif
52#endif
53
54 template <std::size_t I, typename T, typename P>
56 void for_each_local (T& d, T const& s)
57 {
58 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
59 }
60
61 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
63 void for_each_local (T& d, T const& s)
64 {
65 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
66 for_each_local<I+1,T,P1,Ps...>(d, s);
67 }
68
69 template <std::size_t I, typename T, typename P>
71 constexpr void for_each_init (T& t)
72 {
73 P().init(amrex::get<I>(t));
74 }
75
76 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
78 constexpr void for_each_init (T& t)
79 {
80 P().init(amrex::get<I>(t));
81 for_each_init<I+1,T,P1,Ps...>(t);
82 }
83}
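// Added note (not part of the original header): the for_each_* helpers above
// walk a GpuTuple and a matching parameter pack of reduce operations in lock
// step, applying the I-th operation to the I-th tuple element. For example,
//
//   for_each_local<0, GpuTuple<Real,Real>, ReduceOpSum, ReduceOpMax>(d, s);
//
// expands, conceptually, to
//
//   ReduceOpSum().local_update(amrex::get<0>(d), amrex::get<0>(s));
//   ReduceOpMax().local_update(amrex::get<1>(d), amrex::get<1>(s));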
85
87struct ReduceOpSum
88{
89
90#ifdef AMREX_USE_GPU
91#ifdef AMREX_USE_SYCL
92 template <typename T>
94 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
95 T r = Gpu::blockReduceSum(s,h);
96 if (h.threadIdx() == 0) { d += r; }
97 }
98#else
99 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
101 void parallel_update (T& d, T const& s) const noexcept {
102 T r = Gpu::blockReduceSum<MT>(s);
103 if (threadIdx.x == 0) { d += r; }
104 }
105#endif
106#endif
107
108 template <typename T>
110 void local_update (T& d, T const& s) const noexcept { d += s; }
111
112 template <typename T>
113 constexpr void init (T& t) const noexcept { t = 0; }
114};
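// Added note (not part of the original header): each ReduceOp provides init()
// to set the reduction identity, local_update() to fold one value into a local
// accumulator, and, on GPU builds, parallel_update() to merge block-wide
// partial results. A minimal serial sketch using ReduceOpSum:
//
//   double acc;
//   ReduceOpSum op;
//   op.init(acc);                 // acc = 0
//   for (double x : {1.0, 2.0, 3.0}) {
//       op.local_update(acc, x);  // acc += x
//   }
//   // acc == 6.0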
115
117struct ReduceOpMin
118{
119#ifdef AMREX_USE_GPU
120#ifdef AMREX_USE_SYCL
121 template <typename T>
123 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
124 T r = Gpu::blockReduceMin(s,h);
125 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
126 }
127#else
128 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
130 void parallel_update (T& d, T const& s) const noexcept {
131 T r = Gpu::blockReduceMin<MT>(s);
132 if (threadIdx.x == 0) { d = amrex::min(d,r); }
133 }
134#endif
135#endif
136
137 template <typename T>
139 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
140
141 template <typename T>
142 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
143 init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
144
145 template <typename T>
146 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
147 init (T& t) const noexcept { t = T::max(); }
148};
149
151struct ReduceOpMax
152{
153#ifdef AMREX_USE_GPU
154#ifdef AMREX_USE_SYCL
155 template <typename T>
157 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
158 T r = Gpu::blockReduceMax(s,h);
159 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
160 }
161#else
162 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
164 void parallel_update (T& d, T const& s) const noexcept {
165 T r = Gpu::blockReduceMax<MT>(s);
166 if (threadIdx.x == 0) { d = amrex::max(d,r); }
167 }
168#endif
169#endif
170
171 template <typename T>
173 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
174
175 template <typename T>
176 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
177 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
178
179 template <typename T>
180 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
181 init (T& t) const noexcept { t = T::lowest(); }
182};
183
185struct ReduceOpLogicalAnd
186{
187#ifdef AMREX_USE_GPU
188#ifdef AMREX_USE_SYCL
189 template <typename T>
191 std::enable_if_t<std::is_integral_v<T>>
192 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
193 T r = Gpu::blockReduceLogicalAnd(s,h);
194 if (h.threadIdx() == 0) { d = d && r; }
195 }
196#else
197 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
199 std::enable_if_t<std::is_integral_v<T>>
200 parallel_update (T& d, T s) const noexcept {
201 T r = Gpu::blockReduceLogicalAnd<MT>(s);
202 if (threadIdx.x == 0) { d = d && r; }
203 }
204#endif
205#endif
206
207 template <typename T>
209 std::enable_if_t<std::is_integral_v<T>>
210 local_update (T& d, T s) const noexcept { d = d && s; }
211
212 template <typename T>
213 constexpr std::enable_if_t<std::is_integral_v<T>>
214 init (T& t) const noexcept { t = true; }
215};
216
218struct ReduceOpLogicalOr
219{
220#ifdef AMREX_USE_GPU
221#ifdef AMREX_USE_SYCL
222 template <typename T>
224 std::enable_if_t<std::is_integral_v<T>>
225 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
226 T r = Gpu::blockReduceLogicalOr(s,h);
227 if (h.threadIdx() == 0) { d = d || r; }
228 }
229#else
230 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
232 std::enable_if_t<std::is_integral_v<T>>
233 parallel_update (T& d, T s) const noexcept {
234 T r = Gpu::blockReduceLogicalOr<MT>(s);
235 if (threadIdx.x == 0) { d = d || r; }
236 }
237#endif
238#endif
239
240 template <typename T>
242 std::enable_if_t<std::is_integral_v<T>>
243 local_update (T& d, T s) const noexcept { d = d || s; }
244
245 template <typename T>
246 constexpr std::enable_if_t<std::is_integral_v<T>>
247 init (T& t) const noexcept { t = false; }
248};
249
250template <typename... Ps> class ReduceOps;
251
252#ifdef AMREX_USE_GPU
253
255template <typename... Ts>
256class ReduceData
257{
258public:
259 using Type = GpuTuple<Ts...>;
260
261 template <typename... Ps>
262 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
263 : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
264 m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
265 m_device_tuple((Type*)(The_Arena()->alloc(AMREX_GPU_MAX_STREAMS
266 * m_max_blocks * sizeof(Type)))),
267 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
268 {
269 reduce_op.resetResultReadiness();
270 static_assert(std::is_trivially_copyable<Type>(),
271 "ReduceData::Type must be trivially copyable");
272 static_assert(std::is_trivially_destructible<Type>(),
273 "ReduceData::Type must be trivially destructible");
274
275 new (m_host_tuple) Type();
276 m_nblocks.fill(0);
277 }
278
279 ~ReduceData () {
280 The_Pinned_Arena()->free(m_host_tuple);
281 The_Arena()->free(m_device_tuple);
282 }
283
284 ReduceData (ReduceData<Ts...> const&) = delete;
285 ReduceData (ReduceData<Ts...> &&) = delete;
286 void operator= (ReduceData<Ts...> const&) = delete;
287 void operator= (ReduceData<Ts...> &&) = delete;
288
289 Type value ()
290 {
291 return m_fn_value();
292 }
293
294 template <typename... Ps>
295 Type value (ReduceOps<Ps...>& reduce_op)
296 {
297 return reduce_op.value(*this);
298 }
299
300 Type* devicePtr () { return m_device_tuple; }
301 Type* devicePtr (gpuStream_t const& s) {
302 return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
303 }
304
305 Type* hostPtr () { return m_host_tuple; }
306
307 GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
308 int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }
309
310 int maxBlocks () const { return m_max_blocks; }
311
312 int maxStreamIndex () const { return m_max_stream_index; }
313 void updateMaxStreamIndex (gpuStream_t const& s) {
314 m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
315 }
316
317private:
318 int m_max_blocks;
319 int m_max_stream_index = 0;
320 Type* m_host_tuple = nullptr;
321 Type* m_device_tuple = nullptr;
322 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
323 std::function<Type()> m_fn_value;
324};
325
327namespace Reduce::detail {
328 template <typename F>
330 auto call_f (F const& f, int i, int j, int k, IndexType)
331 noexcept -> decltype(f(0,0,0))
332 {
333 return f(i,j,k);
334 }
335
336 template <typename F>
338 auto call_f (F const& f, int i, int j, int k, IndexType t)
339 noexcept -> decltype(f(Box()))
340 {
342 return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
343 IntVect(AMREX_D_DECL(i,j,k)),
344 t));
345 }
346
348 struct iterate_box {};
349 struct iterate_box_comp {};
351
352 template <typename I, typename F, typename T, typename... Ps,
353 std::enable_if_t<std::is_same<iterate_box,I>::value,int> = 0>
355 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
356 {
357 auto const& pr = f(ibox,i,j,k);
358 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
359 }
360
361 template <typename I, typename F, typename T, typename... Ps,
362 std::enable_if_t<std::is_same<iterate_box_comp,I>::value,int> = 0>
364 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
365 {
366 for (int n = 0; n < ncomp; ++n) {
367 auto const& pr = f(ibox,i,j,k,n);
368 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
369 }
370 }
371}
373
375template <typename... Ps>
376class ReduceOps
377{
378public:
379
380 // This is public for CUDA
381 template <typename I, typename MF, typename D, typename F>
382 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
383 {
384 using ReduceTuple = typename D::Type;
385 const int nboxes = mf.local_size();
386 if (nboxes > 0) {
387 auto const& parforinfo = mf.getParForInfo(nghost);
388 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
389 AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
390 const int nblocks = nblocks_per_box * nboxes;
391 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
392
393 auto const& stream = Gpu::gpuStream();
394 auto pdst = reduce_data.devicePtr(stream);
395 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
396 AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
397 reduce_data.nBlocks(stream) = nblocks_ec;
398 reduce_data.updateMaxStreamIndex(stream);
399
400#ifdef AMREX_USE_SYCL
401 // device reduce needs local(i.e., shared) memory
402 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
403 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
404 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
405 {
406 Dim1 blockIdx {gh.blockIdx()};
407 Dim1 threadIdx{gh.threadIdx()};
408#else
409 amrex::launch_global<AMREX_GPU_MAX_THREADS>
410 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
411 ([=] AMREX_GPU_DEVICE () noexcept
412 {
413#endif
414 ReduceTuple r;
415 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
416 ReduceTuple& dst = pdst[blockIdx.x];
417 if (threadIdx.x == 0) {
418 dst = r;
419 }
420 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
421 int ibox = iblock / nblocks_per_box;
422 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
423
424 BoxIndexer const& indexer = dp_boxes[ibox];
425 if (icell < indexer.numPts()) {
426 auto [i, j, k] = indexer(icell);
427 Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
428 (f, ibox, i, j, k, ncomp, r);
429 }
430 }
431#ifdef AMREX_USE_SYCL
432 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
433#else
434 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
435#endif
436 });
437 }
438 }
439
440 template <typename MF, typename D, typename F>
441 std::enable_if_t<IsFabArray<MF>::value
442#ifndef AMREX_USE_CUDA
443 && IsCallable<F, int, int, int, int>::value
444#endif
445 >
446 eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
447 {
448 using ReduceTuple = typename D::Type;
449 const int nboxes = mf.local_size();
450 if (nboxes == 0) {
451 return;
452 } else if (!mf.isFusingCandidate()) {
453 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
454 Box const& b = amrex::grow(mfi.validbox(), nghost);
455 const int li = mfi.LocalIndex();
456 this->eval(b, reduce_data,
457 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
458 {
459 return f(li, i, j, k);
460 });
461 }
462 } else {
463 eval_mf(Reduce::detail::iterate_box{},
464 mf, nghost, 0, reduce_data, std::forward<F>(f));
465 }
466 }
467
468 template <typename MF, typename D, typename F>
469 std::enable_if_t<IsFabArray<MF>::value
470#ifndef AMREX_USE_CUDA
471 && IsCallable<F, int, int, int, int, int>::value
472#endif
473 >
474 eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
475 {
476 using ReduceTuple = typename D::Type;
477
478 const int nboxes = mf.local_size();
479
480 if (nboxes == 0) {
481 return;
482 } else if (!mf.isFusingCandidate()) {
483 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
484 Box const& b = amrex::grow(mfi.validbox(), nghost);
485 const int li = mfi.LocalIndex();
486 this->eval(b, ncomp, reduce_data,
487 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
488 {
489 return f(li, i, j, k, n);
490 });
491 }
492 } else {
493 eval_mf(Reduce::detail::iterate_box_comp{},
494 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
495 }
496 }
497
498 template <typename D, typename F>
499 void eval (Box const& box, D & reduce_data, F const& f)
500 {
501 using ReduceTuple = typename D::Type;
502 auto const& stream = Gpu::gpuStream();
503 auto dp = reduce_data.devicePtr(stream);
504 int& nblocks = reduce_data.nBlocks(stream);
505 int ncells = box.numPts();
506 const auto lo = amrex::lbound(box);
507 const auto len = amrex::length(box);
508 const auto lenxy = len.x*len.y;
509 const auto lenx = len.x;
510 IndexType ixtype = box.ixType();
511 constexpr int nitems_per_thread = 4;
512 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
513 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
514 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
515 reduce_data.updateMaxStreamIndex(stream);
516#ifdef AMREX_USE_SYCL
517 // device reduce needs local(i.e., shared) memory
518 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
519 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
520 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
521 {
522 Dim1 blockIdx {gh.blockIdx()};
523 Dim1 threadIdx{gh.threadIdx()};
524 Dim1 gridDim {gh.gridDim()};
525#else
526 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
527 [=] AMREX_GPU_DEVICE () noexcept
528 {
529#endif
530 ReduceTuple r;
531 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
532 ReduceTuple& dst = *(dp+blockIdx.x);
533 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
534 dst = r;
535 }
536 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
537 icell < ncells; icell += stride) {
538 int k = icell / lenxy;
539 int j = (icell - k*lenxy) / lenx;
540 int i = (icell - k*lenxy) - j*lenx;
541 i += lo.x;
542 j += lo.y;
543 k += lo.z;
544 auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
545 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
546 }
547#ifdef AMREX_USE_SYCL
548 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
549#else
550 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
551#endif
552 });
553 nblocks = std::max(nblocks, nblocks_ec);
554 }
555
556 template <typename N, typename D, typename F,
557 typename M=std::enable_if_t<std::is_integral_v<N>> >
558 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
559 {
560 using ReduceTuple = typename D::Type;
561 auto const& stream = Gpu::gpuStream();
562 auto dp = reduce_data.devicePtr(stream);
563 int& nblocks = reduce_data.nBlocks(stream);
564 int ncells = box.numPts();
565 const auto lo = amrex::lbound(box);
566 const auto len = amrex::length(box);
567 const auto lenxy = len.x*len.y;
568 const auto lenx = len.x;
569 constexpr int nitems_per_thread = 4;
570 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
571 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
572 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
573 reduce_data.updateMaxStreamIndex(stream);
574#ifdef AMREX_USE_SYCL
575 // device reduce needs local(i.e., shared) memory
576 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
577 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
578 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
579 {
580 Dim1 blockIdx {gh.blockIdx()};
581 Dim1 threadIdx{gh.threadIdx()};
582 Dim1 gridDim {gh.gridDim()};
583#else
584 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
585 [=] AMREX_GPU_DEVICE () noexcept
586 {
587#endif
588 ReduceTuple r;
589 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
590 ReduceTuple& dst = *(dp+blockIdx.x);
591 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
592 dst = r;
593 }
594 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
595 icell < ncells; icell += stride) {
596 int k = icell / lenxy;
597 int j = (icell - k*lenxy) / lenx;
598 int i = (icell - k*lenxy) - j*lenx;
599 i += lo.x;
600 j += lo.y;
601 k += lo.z;
602 for (N n = 0; n < ncomp; ++n) {
603 auto pr = f(i,j,k,n);
604 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
605 }
606 }
607#ifdef AMREX_USE_SYCL
608 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
609#else
610 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
611#endif
612 });
613 nblocks = std::max(nblocks, nblocks_ec);
614 }
615
616 template <typename N, typename D, typename F,
617 typename M=std::enable_if_t<std::is_integral_v<N>> >
618 void eval (N n, D & reduce_data, F const& f)
619 {
620 if (n <= 0) { return; }
621 using ReduceTuple = typename D::Type;
622 auto const& stream = Gpu::gpuStream();
623 auto dp = reduce_data.devicePtr(stream);
624 int& nblocks = reduce_data.nBlocks(stream);
625 constexpr int nitems_per_thread = 4;
626 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
627 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
628 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
629 reduce_data.updateMaxStreamIndex(stream);
630#ifdef AMREX_USE_SYCL
631 // device reduce needs local(i.e., shared) memory
632 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
633 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
634 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
635 {
636 Dim1 blockIdx {gh.blockIdx()};
637 Dim1 threadIdx{gh.threadIdx()};
638 Dim1 gridDim {gh.gridDim()};
639#else
640 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
641 [=] AMREX_GPU_DEVICE () noexcept
642 {
643#endif
644 ReduceTuple r;
645 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
646 ReduceTuple& dst = *(dp+blockIdx.x);
647 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
648 dst = r;
649 }
650 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
651 i < n; i += stride) {
652 auto pr = f(i);
653 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
654 }
655#ifdef AMREX_USE_SYCL
656 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
657#else
658 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
659#endif
660 });
661 nblocks = amrex::max(nblocks, nblocks_ec);
662 }
663
664 template <typename D>
665 typename D::Type value (D & reduce_data)
666 {
667 auto hp = reduce_data.hostPtr();
668
669 if (m_result_is_ready) {
670 return *hp;
671 }
672
673 using ReduceTuple = typename D::Type;
674 auto const& stream = Gpu::gpuStream();
675 auto dp = reduce_data.devicePtr();
676 auto const& nblocks = reduce_data.nBlocks();
677#if defined(AMREX_USE_SYCL)
678 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
679 const int N = nblocks[0];
680 if (N == 0) {
681 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
682 } else {
683 Gpu::PinnedVector<ReduceTuple> tmp(N);
684 Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
685 Gpu::streamSynchronize();
686 for (int i = 1; i < N; ++i) {
687 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
688 }
689 *hp = tmp[0];
690 }
691 } else
692#endif
693 {
694 int maxblocks = reduce_data.maxBlocks();
695#ifdef AMREX_USE_SYCL
696 // device reduce needs local(i.e., shared) memory
697 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
698#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
699 // xxxxx SYCL todo: reduce bug workaround
700 Gpu::DeviceVector<ReduceTuple> dtmp(1);
701 auto presult = dtmp.data();
702#else
703 auto presult = hp;
704#endif
705 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
706 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
707 {
708 ReduceTuple r;
709 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
710 ReduceTuple dst = r;
711 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
712 auto dp_stream = dp+istream*maxblocks;
713 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
714 i < nblocks[istream]; i += stride) {
715 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
716 }
717 }
718 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
719 if (gh.threadIdx() == 0) { *presult = dst; }
720 });
721#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
722 Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
723#endif
724#else
725 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
726 [=] AMREX_GPU_DEVICE () noexcept
727 {
728 ReduceTuple r;
729 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
730 ReduceTuple dst = r;
731 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
732 auto dp_stream = dp+istream*maxblocks;
733 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
734 i < nblocks[istream]; i += stride) {
735 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
736 }
737 }
738 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
739 if (threadIdx.x == 0) { *hp = dst; }
740 });
741#endif
742 Gpu::streamSynchronize();
743 }
744
745 m_result_is_ready = true;
746 return *hp;
747 }
748
749private:
750 bool m_result_is_ready = false;
751
752public:
753 void resetResultReadiness () { m_result_is_ready = false; }
754};
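// Added usage sketch (not part of the original header), assuming a MultiFab
// named "mf" with at least one component. It computes the sum and the maximum
// of component 0 over the valid region (no ghost cells) on this MPI rank:
//
//   ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
//   ReduceData<Real, Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   auto const& ma = mf.const_arrays();
//   reduce_op.eval(mf, IntVect(0), reduce_data,
//       [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) -> ReduceTuple
//       {
//           Real x = ma[box_no](i,j,k);
//           return {x, x};
//       });
//   ReduceTuple hv = reduce_data.value(reduce_op);
//   Real sum_val = amrex::get<0>(hv);
//   Real max_val = amrex::get<1>(hv);
//
// The result is rank-local; a ParallelDescriptor/ParallelReduce call is still
// needed to obtain a global value.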
755
756namespace Reduce {
757
759template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
760T Sum (N n, T const* v, T init_val = 0)
761{
762 ReduceOps<ReduceOpSum> reduce_op;
763 ReduceData<T> reduce_data(reduce_op);
764 using ReduceTuple = typename decltype(reduce_data)::Type;
765 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
766 ReduceTuple hv = reduce_data.value(reduce_op);
767 return amrex::get<0>(hv) + init_val;
768}
769
771template <typename T, typename N, typename F,
772 typename M=std::enable_if_t<std::is_integral_v<N>> >
773T Sum (N n, F const& f, T init_val = 0)
774{
775 ReduceOps<ReduceOpSum> reduce_op;
776 ReduceData<T> reduce_data(reduce_op);
777 using ReduceTuple = typename decltype(reduce_data)::Type;
778 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
779 ReduceTuple hv = reduce_data.value(reduce_op);
780 return amrex::get<0>(hv) + init_val;
781}
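// Added usage sketch (not part of the original header), assuming "p" points to
// device-accessible data of length n when GPU support is enabled:
//
//   Real total = Reduce::Sum(n, p);   // sum of p[0..n-1]
//
//   // With a functor, T cannot be deduced and must be given explicitly:
//   Real sum_sq = Reduce::Sum<Real>(n,
//       [=] AMREX_GPU_DEVICE (int i) -> Real { return p[i]*p[i]; });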
782
784template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
785T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
786{
787 ReduceOps<ReduceOpMin> reduce_op;
788 ReduceData<T> reduce_data(reduce_op);
789 using ReduceTuple = typename decltype(reduce_data)::Type;
790 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
791 ReduceTuple hv = reduce_data.value(reduce_op);
792 return std::min(amrex::get<0>(hv),init_val);
793}
794
796template <typename T, typename N, typename F,
797 typename M=std::enable_if_t<std::is_integral_v<N>> >
798T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
799{
800 ReduceOps<ReduceOpMin> reduce_op;
801 ReduceData<T> reduce_data(reduce_op);
802 using ReduceTuple = typename decltype(reduce_data)::Type;
803 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
804 ReduceTuple hv = reduce_data.value(reduce_op);
805 return std::min(amrex::get<0>(hv),init_val);
806}
807
809template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
810T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
811{
812 ReduceOps<ReduceOpMax> reduce_op;
813 ReduceData<T> reduce_data(reduce_op);
814 using ReduceTuple = typename decltype(reduce_data)::Type;
815 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
816 ReduceTuple hv = reduce_data.value(reduce_op);
817 return std::max(amrex::get<0>(hv),init_val);
818}
819
821template <typename T, typename N, typename F,
822 typename M=std::enable_if_t<std::is_integral_v<N>> >
823T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
824{
825 ReduceOps<ReduceOpMax> reduce_op;
826 ReduceData<T> reduce_data(reduce_op);
827 using ReduceTuple = typename decltype(reduce_data)::Type;
828 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
829 ReduceTuple hv = reduce_data.value(reduce_op);
830 return std::max(amrex::get<0>(hv),init_val);
831}
832
834template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
835std::pair<T,T> MinMax (N n, T const* v)
836{
837 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
838 ReduceData<T,T> reduce_data(reduce_op);
839 using ReduceTuple = typename decltype(reduce_data)::Type;
840 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
841 return {v[i],v[i]};
842 });
843 auto hv = reduce_data.value(reduce_op);
844 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
845}
846
848template <typename T, typename N, typename F,
849 typename M=std::enable_if_t<std::is_integral_v<N>> >
850std::pair<T,T> MinMax (N n, F const& f)
851{
852 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
853 ReduceData<T,T> reduce_data(reduce_op);
854 using ReduceTuple = typename decltype(reduce_data)::Type;
855 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
856 T tmp = f(i);
857 return {tmp,tmp};
858 });
859 auto hv = reduce_data.value(reduce_op);
860 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
861}
862
864template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
865bool AnyOf (N n, T const* v, P const& pred)
866{
867 Gpu::LaunchSafeGuard lsg(true);
868 Gpu::DeviceScalar<int> ds(false);
869 int* dp = ds.dataPtr();
870 auto ec = Gpu::ExecutionConfig(n);
871 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
872
873#ifdef AMREX_USE_SYCL
874 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
875 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
876 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
877 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
878 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
879 if (gh.threadIdx() == 0) { *has_any = *dp; }
880 gh.sharedBarrier();
881
882 if (!(*has_any))
883 {
884 int r = false;
885 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
886 i < n && !r; i += stride)
887 {
888 r = pred(v[i]) ? 1 : 0;
889 }
890
891 r = Gpu::blockReduce<Gpu::Device::warp_size>
892 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
893 if (gh.threadIdx() == 0 && r) { *dp = 1; }
894 }
895 });
896#else
897 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
898 [=] AMREX_GPU_DEVICE () noexcept {
899 __shared__ int has_any;
900 if (threadIdx.x == 0) { has_any = *dp; }
901 __syncthreads();
902
903 if (!has_any)
904 {
905 int r = false;
906 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
907 i < n && !r; i += stride)
908 {
909 r = pred(v[i]) ? 1 : 0;
910 }
911 r = Gpu::blockReduce<Gpu::Device::warp_size>
912 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
913 if (threadIdx.x == 0 && r) *dp = 1;
914 }
915 });
916#endif
917 return ds.dataValue();
918}
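// Added usage sketch (not part of the original header), assuming "p" points to
// device-accessible Real data of length n:
//
//   bool has_negative = Reduce::AnyOf(n, p,
//       [=] AMREX_GPU_DEVICE (Real x) { return x < Real(0.0); });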
919
921template <typename P>
922bool AnyOf (Box const& box, P const& pred)
923{
924 Gpu::LaunchSafeGuard lsg(true);
925 Gpu::DeviceScalar<int> ds(false);
926 int* dp = ds.dataPtr();
927 int ncells = box.numPts();
928 const auto lo = amrex::lbound(box);
929 const auto len = amrex::length(box);
930 const auto lenxy = len.x*len.y;
931 const auto lenx = len.x;
932 auto ec = Gpu::ExecutionConfig(ncells);
933 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
934
935#ifdef AMREX_USE_SYCL
936 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
937 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
938 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
939 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
940 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
941 if (gh.threadIdx() == 0) { *has_any = *dp; }
942 gh.sharedBarrier();
943
944 if (!(*has_any))
945 {
946 int r = false;
947 for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
948 icell < ncells && !r; icell += stride) {
949 int k = icell / lenxy;
950 int j = (icell - k*lenxy) / lenx;
951 int i = (icell - k*lenxy) - j*lenx;
952 i += lo.x;
953 j += lo.y;
954 k += lo.z;
955 r = pred(i,j,k) ? 1 : 0;
956 }
957 r = Gpu::blockReduce<Gpu::Device::warp_size>
958 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
959 if (gh.threadIdx() == 0 && r) { *dp = 1; }
960 }
961 });
962#else
963 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
964 Gpu::gpuStream(),
965 [=] AMREX_GPU_DEVICE () noexcept {
966 __shared__ int has_any;
967 if (threadIdx.x == 0) { has_any = *dp; }
968 __syncthreads();
969
970 if (!has_any)
971 {
972 int r = false;
973 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
974 icell < ncells && !r; icell += stride) {
975 int k = icell / lenxy;
976 int j = (icell - k*lenxy) / lenx;
977 int i = (icell - k*lenxy) - j*lenx;
978 i += lo.x;
979 j += lo.y;
980 k += lo.z;
981 r = pred(i,j,k) ? 1 : 0;
982 }
983 r = Gpu::blockReduce<Gpu::Device::warp_size>
984 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
985 if (threadIdx.x == 0 && r) *dp = 1;
986 }
987 });
988#endif
989 return ds.dataValue();
990}
991
992}
993
994#else
995
996template <typename... Ts>
997class ReduceData
998{
999public:
1000 using Type = GpuTuple<Ts...>;
1001
1002 template <typename... Ps>
1003 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
1004 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1005 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1006 {
1007 reduce_op.resetResultReadiness();
1008 for (auto& t : m_tuple) {
1009 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1010 }
1011 }
1012
1013 ~ReduceData () = default;
1014 ReduceData (ReduceData<Ts...> const&) = delete;
1015 ReduceData (ReduceData<Ts...> &&) = delete;
1016 void operator= (ReduceData<Ts...> const&) = delete;
1017 void operator= (ReduceData<Ts...> &&) = delete;
1018
1019 Type value () { return m_fn_value(); }
1020
1021 template <typename... Ps>
1022 Type value (ReduceOps<Ps...>& reduce_op)
1023 {
1024 return reduce_op.value(*this);
1025 }
1026
1027 Vector<Type>& reference () { return m_tuple; }
1028
1029 Type& reference (int tid)
1030 {
1031 if (m_tuple.size() == 1) {
1032 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1033 return m_tuple[0];
1034 } else {
1035 return m_tuple[tid];
1036 }
1037 }
1038
1039private:
1040 Vector<Type> m_tuple;
1041 std::function<Type()> m_fn_value;
1042};
1043
1044template <typename... Ps>
1045class ReduceOps
1046{
1047private:
1048
1049 template <typename D, typename F>
1051 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1052 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
1053 typename D::Type>>
1054 {
1055 using ReduceTuple = typename D::Type;
1056 const auto lo = amrex::lbound(box);
1057 const auto hi = amrex::ubound(box);
1058 for (int k = lo.z; k <= hi.z; ++k) {
1059 for (int j = lo.y; j <= hi.y; ++j) {
1060 for (int i = lo.x; i <= hi.x; ++i) {
1061 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
1062 }}}
1063 }
1064
1065 template <typename D, typename F>
1067 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1068 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
1069 typename D::Type>>
1070 {
1071 using ReduceTuple = typename D::Type;
1072 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1073 }
1074
1075public:
1076
1077 template <typename MF, typename D, typename F>
1078 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1079 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1080 {
1081 using ReduceTuple = typename D::Type;
1082#ifdef AMREX_USE_OMP
1083#pragma omp parallel
1084#endif
1085 {
1086 ReduceTuple rr;
1087 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1088 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1089 Box const& b = mfi.growntilebox(nghost);
1090 const int li = mfi.LocalIndex();
1091 const auto lo = amrex::lbound(b);
1092 const auto hi = amrex::ubound(b);
1093 for (int k = lo.z; k <= hi.z; ++k) {
1094 for (int j = lo.y; j <= hi.y; ++j) {
1095 for (int i = lo.x; i <= hi.x; ++i) {
1096 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1097 }}}
1098 }
1099 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1100 reduce_data.reference(OpenMP::get_thread_num()), rr);
1101 }
1102 }
1103
1104 template <typename MF, typename D, typename F>
1105 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1106 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1107 {
1108 using ReduceTuple = typename D::Type;
1109#ifdef AMREX_USE_OMP
1110#pragma omp parallel
1111#endif
1112 {
1113 ReduceTuple rr;
1114 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1115 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1116 Box const& b = mfi.growntilebox(nghost);
1117 const int li = mfi.LocalIndex();
1118 const auto lo = amrex::lbound(b);
1119 const auto hi = amrex::ubound(b);
1120 for (int n = 0; n < ncomp; ++n) {
1121 for (int k = lo.z; k <= hi.z; ++k) {
1122 for (int j = lo.y; j <= hi.y; ++j) {
1123 for (int i = lo.x; i <= hi.x; ++i) {
1124 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1125 }}}}
1126 }
1127 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1128 reduce_data.reference(OpenMP::get_thread_num()), rr);
1129 }
1130 }
1131
1132 template <typename D, typename F>
1133 void eval (Box const& box, D & reduce_data, F&& f)
1134 {
1135 using ReduceTuple = typename D::Type;
1136 ReduceTuple rr;
1137 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1138 call_f<D>(box, rr, std::forward<F>(f));
1139 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1140 reduce_data.reference(OpenMP::get_thread_num()), rr);
1141 }
1142
1143 template <typename N, typename D, typename F,
1144 typename M=std::enable_if_t<std::is_integral_v<N>> >
1145 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
1146 {
1147 using ReduceTuple = typename D::Type;
1148 ReduceTuple rr;
1149 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1150 const auto lo = amrex::lbound(box);
1151 const auto hi = amrex::ubound(box);
1152 for (N n = 0; n < ncomp; ++n) {
1153 for (int k = lo.z; k <= hi.z; ++k) {
1154 for (int j = lo.y; j <= hi.y; ++j) {
1155 for (int i = lo.x; i <= hi.x; ++i) {
1156 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
1157 }}}}
1158 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1159 reduce_data.reference(OpenMP::get_thread_num()), rr);
1160 }
1161
1162 template <typename N, typename D, typename F,
1163 typename M=std::enable_if_t<std::is_integral_v<N>> >
1164 void eval (N n, D & reduce_data, F const& f)
1165 {
1166 using ReduceTuple = typename D::Type;
1167 ReduceTuple rr;
1168 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1169 for (N i = 0; i < n; ++i) {
1170 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1171 }
1172 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1173 reduce_data.reference(OpenMP::get_thread_num()), rr);
1174 }
1175
1176 template <typename D>
1177 typename D::Type value (D & reduce_data)
1178 {
1179 auto& rrv = reduce_data.reference();
1180 if (! m_result_is_ready) {
1181 using ReduceTuple = typename D::Type;
1182 if (rrv.size() > 1) {
1183 for (int i = 1, N = rrv.size(); i < N; ++i) {
1184 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1185 }
1186 }
1187 m_result_is_ready = true;
1188 }
1189 return rrv[0];
1190 }
1191
1192 bool m_result_is_ready = false;
1193
1194 void resetResultReadiness () { m_result_is_ready = false; }
1195};
1196
1197namespace Reduce {
1198
1199template <typename T, typename N, typename F,
1200 typename M=std::enable_if_t<std::is_integral_v<N>> >
1201T Sum (N n, F const& f, T init_val = 0)
1202{
1203 T r = init_val;
1204#ifdef AMREX_USE_OMP
1205#pragma omp parallel for reduction(+:r)
1206#endif
1207 for (N i = 0; i < n; ++i) {
1208 r += f(i);
1209 }
1210 return r;
1211}
1212
1213template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1214T Sum (N n, T const* v, T init_val = 0)
1215{
1216 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1217}
1218
1219template <typename T, typename N, typename F,
1220 typename M=std::enable_if_t<std::is_integral_v<N>> >
1221T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
1222{
1223 T r = init_val;
1224#ifdef AMREX_USE_OMP
1225#pragma omp parallel for reduction(min:r)
1226#endif
1227 for (N i = 0; i < n; ++i) {
1228 r = std::min(r,f(i));
1229 }
1230 return r;
1231}
1232
1233template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1234T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
1235{
1236 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1237}
1238
1239template <typename T, typename N, typename F,
1240 typename M=std::enable_if_t<std::is_integral_v<N>> >
1241T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
1242{
1243 T r = init_val;
1244#ifdef AMREX_USE_OMP
1245#pragma omp parallel for reduction(max:r)
1246#endif
1247 for (N i = 0; i < n; ++i) {
1248 r = std::max(r,f(i));
1249 }
1250 return r;
1251}
1252
1253template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1254T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
1255{
1256 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1257}
1258
1259template <typename T, typename N, typename F,
1260 typename M=std::enable_if_t<std::is_integral_v<N>> >
1261std::pair<T,T> MinMax (N n, F const& f)
1262{
1263 T r_min = std::numeric_limits<T>::max();
1264 T r_max = std::numeric_limits<T>::lowest();
1265#ifdef AMREX_USE_OMP
1266#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1267#endif
1268 for (N i = 0; i < n; ++i) {
1269 T tmp = f(i);
1270 r_min = std::min(r_min,tmp);
1271 r_max = std::max(r_max,tmp);
1272 }
1273 return std::make_pair(r_min,r_max);
1274}
1275
1276template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1277std::pair<T,T> MinMax (N n, T const* v)
1278{
1279 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1280}
1281
1282template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1283bool AnyOf (N n, T const* v, P&& pred)
1284{
1285 return std::any_of(v, v+n, std::forward<P>(pred));
1286}
1287
1288template <typename P>
1289bool AnyOf (Box const& box, P const& pred)
1290{
1291 const auto lo = amrex::lbound(box);
1292 const auto hi = amrex::ubound(box);
1293 for (int k = lo.z; k <= hi.z; ++k) {
1294 for (int j = lo.y; j <= hi.y; ++j) {
1295 for (int i = lo.x; i <= hi.x; ++i) {
1296 if (pred(i,j,k)) { return true; }
1297 }}}
1298 return false;
1299}
1300
1301}
1302
1303#endif
1304
1309template <typename... Ts, typename... Ps>
1311constexpr GpuTuple<Ts...>
1312IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
1313{
1314 GpuTuple<Ts...> r{};
1315 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1316 return r;
1317}
1318
1323template <typename... Ts, typename... Ps>
1325constexpr GpuTuple<Ts...>
1326IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
1327{
1328 GpuTuple<Ts...> r{};
1329 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1330 return r;
1331}
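// Added usage sketch (not part of the original header): IdentityTuple returns
// the identity element of each reduce operation, which is convenient when a
// tuple accumulator is initialized by hand:
//
//   auto t = IdentityTuple(GpuTuple<Real,Real>{},
//                          ReduceOps<ReduceOpSum,ReduceOpMin>{});
//   // amrex::get<0>(t) == 0 and
//   // amrex::get<1>(t) == std::numeric_limits<Real>::max()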
1332
1333}
1334
1335#endif