Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_Reduce.H
Go to the documentation of this file.
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_Arena.H>
7#include <AMReX_OpenMP.H>
8#include <AMReX_MFIter.H>
9#include <AMReX_TypeList.H>
10#include <AMReX_ValLocPair.H>
11
12#include <algorithm>
13#include <functional>
14#include <limits>
15
namespace amrex {

namespace Reduce {

// The declaration of these functions are here to work around doxygen issues.
// Definitions appear later in this file (GPU and CPU variants).

//! Sum v[0]..v[n-1]; init_val (default 0) is added to the reduced result.
template <typename T, typename N,
          std::enable_if_t<std::is_integral_v<N>,int> = 0>
T Sum (N n, T const* v, T init_val = 0);

//! Sum f(0)..f(n-1); this overload is disabled when F decays to T* so it
//! does not shadow the pointer overload above.
template <typename T, typename N, typename F,
          std::enable_if_t<std::is_integral_v<N> &&
                           !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
T Sum (N n, F const& f, T init_val = 0);

//! Minimum over v[0..n-1], combined with init_val via std::min.
template <typename T, typename N,
          std::enable_if_t<std::is_integral_v<N>,int> = 0>
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max());

//! Minimum over f(0)..f(n-1), combined with init_val via std::min.
template <typename T, typename N, typename F,
          std::enable_if_t<std::is_integral_v<N> &&
                           !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max());

//! Maximum over v[0..n-1], combined with init_val via std::max.
template <typename T, typename N,
          std::enable_if_t<std::is_integral_v<N>,int> = 0>
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest());

//! Maximum over f(0)..f(n-1), combined with init_val via std::max.
template <typename T, typename N, typename F,
          std::enable_if_t<std::is_integral_v<N> &&
                           !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest());

//! {min,max} of v[0..n-1] computed in a single pass.
template <typename T, typename N,
          std::enable_if_t<std::is_integral_v<N>,int> = 0>
std::pair<T,T> MinMax (N n, T const* v);

//! {min,max} of f(0)..f(n-1) computed in a single pass.
template <typename T, typename N, typename F,
          std::enable_if_t<std::is_integral_v<N> &&
                           !std::is_same_v<T*,std::decay_t<F>>,int> = 0>
std::pair<T,T> MinMax (N n, F const& f);

//! True if pred(v[i]) holds for any i in [0,n).
template <typename T, typename N, typename P,
          std::enable_if_t<std::is_integral_v<N>,int> = 0>
bool AnyOf (N n, T const* v, P const& pred);

//! True if pred holds for any cell of box.
template <typename P, int dim>
bool AnyOf (BoxND<dim> const& box, P const& pred);

}
212
namespace Reduce::detail {

// Compile-time recursion over the elements of a reduce tuple: apply reduce
// op P to tuple element I, then recurse with ops P1,Ps... on element I+1.
// NOTE(review): a qualifier line (device/inline attributes) appears to have
// been dropped before each function during extraction.

#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-level combine of element I (SYCL variant needs the handler).
    // Base case: a single remaining op.
    template <std::size_t I, typename T, typename P>
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
    }

    // Recursive case: apply P to element I, then the rest to I+1,...
    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
        for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
    }
#else
    // Block-level combine of element I (CUDA/HIP variant). Base case.
    template <std::size_t I, typename T, typename P>
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    // Recursive case.
    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_parallel<I+1,T,P1,Ps...>(d, s);
    }
#endif
#endif

    // Thread-local combine of element I. Base case.
    template <std::size_t I, typename T, typename P>
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    // Recursive case.
    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_local<I+1,T,P1,Ps...>(d, s);
    }

    // Initialize element I with op P's identity value. Base case.
    template <std::size_t I, typename T, typename P>
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
    }

    // Recursive case.
    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
        for_each_init<I+1,T,P1,Ps...>(t);
    }
}
281
// NOTE(review): the struct-head line was lost in extraction; this body is
// the sum reduce op (referenced below as ReduceOps<ReduceOpSum> in
// Reduce::Sum).  Device-qualifier lines before the member functions also
// appear to have been dropped.
{

#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-level: reduce s across the block; thread 0 accumulates into d.
    template <typename T>
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceSum(s,h);
        if (h.threadIdx() == 0) { d += r; }
    }
#else
    // CUDA/HIP variant; MT is the block size used by blockReduceSum.
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceSum<MT>(s);
        if (threadIdx.x == 0) { d += r; }
    }
#endif
#endif

    // Per-thread combine.
    template <typename T>
    void local_update (T& d, T const& s) const noexcept { d += s; }

    // Identity element for summation.
    template <typename T>
    constexpr void init (T& t) const noexcept { t = 0; }
};
311
314{
315#ifdef AMREX_USE_GPU
316#ifdef AMREX_USE_SYCL
317 template <typename T>
319 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
320 T r = Gpu::blockReduceMin(s,h);
321 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
322 }
323#else
324 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
326 void parallel_update (T& d, T const& s) const noexcept {
327 T r = Gpu::blockReduceMin<MT>(s);
328 if (threadIdx.x == 0) { d = amrex::min(d,r); }
329 }
330#endif
331#endif
332
333 template <typename T>
335 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
336
337 template <typename T>
338 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
339 init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
340
341 template <typename T>
342 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
343 init (T& t) const noexcept { t = T::max(); }
344};
345
348{
349#ifdef AMREX_USE_GPU
350#ifdef AMREX_USE_SYCL
351 template <typename T>
353 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
354 T r = Gpu::blockReduceMax(s,h);
355 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
356 }
357#else
358 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
360 void parallel_update (T& d, T const& s) const noexcept {
361 T r = Gpu::blockReduceMax<MT>(s);
362 if (threadIdx.x == 0) { d = amrex::max(d,r); }
363 }
364#endif
365#endif
366
367 template <typename T>
369 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
370
371 template <typename T>
372 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
373 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
374
375 template <typename T>
376 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
377 init (T& t) const noexcept { t = T::lowest(); }
378};
379
382{
383#ifdef AMREX_USE_GPU
384#ifdef AMREX_USE_SYCL
385 template <typename T>
387 std::enable_if_t<std::is_integral_v<T>>
388 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
390 if (h.threadIdx() == 0) { d = d && r; }
391 }
392#else
393 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
395 std::enable_if_t<std::is_integral_v<T>>
396 parallel_update (T& d, T s) const noexcept {
397 T r = Gpu::blockReduceLogicalAnd<MT>(s);
398 if (threadIdx.x == 0) { d = d && r; }
399 }
400#endif
401#endif
402
403 template <typename T>
405 std::enable_if_t<std::is_integral_v<T>>
406 local_update (T& d, T s) const noexcept { d = d && s; }
407
408 template <typename T>
409 constexpr std::enable_if_t<std::is_integral_v<T>>
410 init (T& t) const noexcept { t = true; }
411};
412
415{
416#ifdef AMREX_USE_GPU
417#ifdef AMREX_USE_SYCL
418 template <typename T>
420 std::enable_if_t<std::is_integral_v<T>>
421 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
422 T r = Gpu::blockReduceLogicalOr(s,h);
423 if (h.threadIdx() == 0) { d = d || r; }
424 }
425#else
426 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
428 std::enable_if_t<std::is_integral_v<T>>
429 parallel_update (T& d, T s) const noexcept {
430 T r = Gpu::blockReduceLogicalOr<MT>(s);
431 if (threadIdx.x == 0) { d = d || r; }
432 }
433#endif
434#endif
435
436 template <typename T>
438 std::enable_if_t<std::is_integral_v<T>>
439 local_update (T& d, T s) const noexcept { d = d || s; }
440
441 template <typename T>
442 constexpr std::enable_if_t<std::is_integral_v<T>>
443 init (T& t) const noexcept { t = false; }
444};
445
template <typename... Ps> class ReduceOps;

#ifdef AMREX_USE_GPU

template <typename... Ts>
// NOTE(review): the class-head line ("class ReduceData") was lost in
// extraction; this is the GPU ReduceData<Ts...>, holding one pinned host
// tuple for the final answer and a device buffer of per-stream, per-block
// partial tuples.
{
public:
    using Type = GpuTuple<Ts...>;

    // Allocates host/device storage and defers the final combine to
    // reduce_op via m_fn_value.
    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
          m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
          m_device_tuple((Type*)(The_Arena()->alloc((AMREX_GPU_MAX_STREAMS)
                                                    * m_max_blocks * sizeof(Type)))),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        static_assert(std::is_trivially_copyable<Type>(),
                      "ReduceData::Type must be trivially copyable");
        static_assert(std::is_trivially_destructible<Type>(),
                      "ReduceData::Type must be trivially destructible");

        new (m_host_tuple) Type();
        m_nblocks.fill(0);
    }

    // NOTE(review): the destructor signature line ("~ReduceData () {") was
    // lost in extraction; this body frees both arena allocations.
        The_Pinned_Arena()->free(m_host_tuple);
        The_Arena()->free(m_device_tuple);
    }

    // Non-copyable and non-movable: owns raw arena allocations.
    ReduceData (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    // NOTE(review): the "Type value ()" signature line was lost here.
    {
        return m_fn_value();
    }

    template <typename... Ps>
    // NOTE(review): the "Type value (ReduceOps<Ps...>& reduce_op)" signature
    // line was lost here.
    {
        return reduce_op.value(*this);
    }

    Type* devicePtr () { return m_device_tuple; }
    // NOTE(review): the per-stream devicePtr(gpuStream_t) signature line was
    // lost here; it offsets into the buffer by stream index.
        return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
    }

    Type* hostPtr () { return m_host_tuple; }

    // NOTE(review): an accessor line (presumably nBlocks() returning the
    // whole per-stream array) appears lost here.
    int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }

    int maxBlocks () const { return m_max_blocks; }

    int maxStreamIndex () const { return m_max_stream_index; }
    // NOTE(review): the updateMaxStreamIndex(gpuStream_t) signature line was
    // lost here.
        m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
    }

private:
    int m_max_blocks;
    int m_max_stream_index = 0;
    Type* m_host_tuple = nullptr;
    Type* m_device_tuple = nullptr;
    // NOTE(review): the m_nblocks member declaration (a per-stream int
    // array, filled via m_nblocks.fill(0) above) was lost in extraction.
    std::function<Type()> m_fn_value;
};
521
namespace Reduce::detail {

    // call_f_intvect_box: invoke the user functor on a single cell.  The
    // trailing-return-type SFINAE picks the overload: the first forwards the
    // unpacked components of iv; the second wraps the cell as a single-cell
    // BoxND for functors that take a box.
    // NOTE(review): qualifier lines before each function appear to have been
    // dropped during extraction.

    template <typename F, int dim>
    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->
        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    template <typename F, int dim>
    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
        decltype(f(BoxND<dim>(iv, iv, t)))
    {
        return f(BoxND<dim>(iv, iv, t));
    }

    // call_f_intvect_n: invoke the functor with the cell plus component n.
    template <typename F, typename T, int dim>
    auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
    {
        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }

    // mf_call_f: tag-dispatched cell visitors used by the fused multi-box
    // reduction (eval_mf).

    struct iterate_box {};       // call f(ibox,i,j,k) once per cell
    struct iterate_box_comp {};  // call f(ibox,i,j,k,n) per cell and component

    // Single-component form: fold f(ibox,i,j,k) into the partial tuple r.
    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same_v<iterate_box,I>,int> = 0>
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
    {
        auto const& pr = f(ibox,i,j,k);
        Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
    }

    // Multi-component form: fold f(ibox,i,j,k,n) for n in [0,ncomp).
    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same_v<iterate_box_comp,I>,int> = 0>
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
    {
        for (int n = 0; n < ncomp; ++n) {
            auto const& pr = f(ibox,i,j,k,n);
            Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
        }
    }
}
578
template <typename... Ps>
// NOTE(review): the class-head line ("class ReduceOps") was lost in
// extraction; this is the GPU ReduceOps<Ps...> driver that launches the
// reduction kernels (see the forward declaration earlier in this file).
{
public:

    // This is public for CUDA
    // Fused reduction over all local boxes of a multifab: a single launch
    // covers nblocks_per_box*nboxes logical blocks, with at most maxBlocks
    // physical blocks striding over them.  Partial tuples land in
    // reduce_data's per-stream device buffer, one per physical block.
    template <typename I, typename MF, typename D, typename F>
    void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes > 0) {
            auto const& parforinfo = mf.getParForInfo(nghost);
            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
            AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
            const int nblocks = nblocks_per_box * nboxes;
            const BoxIndexer* dp_boxes = parforinfo.getBoxes();

            auto const& stream = Gpu::gpuStream();
            auto pdst = reduce_data.devicePtr(stream);
            // Cap the physical launch at the device limit; each physical
            // block then loops over multiple logical blocks below.
            int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
            AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
            reduce_data.nBlocks(stream) = nblocks_ec;
            reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
            // device reduce needs local(i.e., shared) memory
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                Dim1 blockIdx {gh.blockIdx()};
                Dim1 threadIdx{gh.threadIdx()};
#else
            amrex::launch_global<AMREX_GPU_MAX_THREADS>
                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
            ([=] AMREX_GPU_DEVICE () noexcept
            {
#endif
                // Thread-local partial, seeded with the ops' identities.
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                // One destination tuple per physical block.
                ReduceTuple& dst = pdst[blockIdx.x];
                if (threadIdx.x == 0) {
                    dst = r;
                }
                // Stride over the logical blocks assigned to this block.
                for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                    int ibox = iblock / nblocks_per_box;
                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;

                    BoxIndexer const& indexer = dp_boxes[ibox];
                    if (icell < indexer.numPts()) {
                        auto [i, j, k] = indexer(icell);
                        Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
                            (f, ibox, i, j, k, ncomp, r);
                    }
                }
                // Block-level combine of per-thread partials into dst.
#ifdef AMREX_USE_SYCL
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
            });
        }
    }
646
    // This is public for CUDA
    // Reduction over a single box: grid-stride kernel where each thread is
    // sized to handle ~nitems_per_thread cells; one partial tuple is written
    // per block and nBlocks(stream) is grown accordingly.
    template <typename I, int dim, typename D, typename F>
    void eval_box (I, BoxND<dim> const& box, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        const BoxIndexerND<dim> indexer(box);
        IndexTypeND<dim> ixtype = box.ixType();
        constexpr int nitems_per_thread = 4;
        Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local(i.e., shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            // Only slots beyond those used by earlier eval calls on this
            // stream need (re)initializing to the identity.
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            // Grid-stride loop over the flattened cells of the box.
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 icell < indexer.numPts();
                 icell += stride)
            {
                auto iv = indexer.intVect(icell);
                amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture
                if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
                    auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                } else {
                    for (int n = 0; n < ncomp; ++n) {
                        auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                    }
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
    }
707
709
    // Reduce f(li,i,j,k) over every local box of mf grown by nghost.
    // Non-fusing candidates iterate box-by-box through eval(box,...); fusing
    // candidates use a single fused launch via eval_mf.
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     // NOTE(review): an additional constraint line inside
                     // this #ifndef was lost in extraction.
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
                {
                    return f(li, i, j, k);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box{},
                    mf, nghost, 0, reduce_data, std::forward<F>(f));
        }
    }
737
    // Multi-component variant: reduce f(li,i,j,k,n) over every local box of
    // mf grown by nghost and components n in [0,ncomp).
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     // NOTE(review): an additional constraint line inside
                     // this #ifndef was lost in extraction.
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;

        const int nboxes = mf.local_size();

        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, ncomp, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
                {
                    return f(li, i, j, k, n);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box_comp{},
                    mf, nghost, ncomp, reduce_data, std::forward<F>(f));
        }
    }
767
    //! Reduce over all cells of box (single component).
    template <typename D, typename F, int dim>
    void eval (BoxND<dim> const& box, D & reduce_data, F const& f)
    {
        eval_box(Reduce::detail::iterate_box{}, box, 0, reduce_data, f);
    }
773
    //! Reduce over all cells of box and components [0,ncomp).
    template <typename N, typename D, typename F, int dim,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
    {
        eval_box(Reduce::detail::iterate_box_comp{}, box, ncomp, reduce_data, f);
    }
780
    // Reduce f(i) over i in [0,n).  No-op for n <= 0.  Partial tuples go to
    // reduce_data's per-stream device buffer, one per block.
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D & reduce_data, F const& f)
    {
        if (n <= 0) { return; }
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local(i.e., shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            // Initialize only destination slots untouched by earlier evals.
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            // Grid-stride loop over [0,n).
            for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 i < n;
                 i += stride)
            {
                auto pr = f(i);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = amrex::max(nblocks, nblocks_ec);
    }
831
    // Combine all per-stream, per-block partial tuples into the final
    // result on the host.  The result is cached: once computed, subsequent
    // calls return *hp without relaunching.
    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto hp = reduce_data.hostPtr();

        if (m_result_is_ready) {
            return *hp;
        }

        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr();
        auto const& nblocks = reduce_data.nBlocks();
#if defined(AMREX_USE_SYCL)
        // Small single-stream case: copy partials to the host and fold them
        // serially instead of launching a combine kernel.
        if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
            const int N = nblocks[0];
            if (N == 0) {
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
            } else {
                // NOTE(review): the declaration of the pinned host buffer
                // `tmp` (sized N) was lost in extraction here.
                Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
                Gpu::streamSynchronize();
                for (int i = 1; i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
                }
                *hp = tmp[0];
            }
        } else
#endif
        {
            int maxblocks = reduce_data.maxBlocks();
#ifdef AMREX_USE_SYCL
            // device reduce needs local(i.e., shared) memory
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            // xxxxx SYCL todo: reduce bug workaround
            // NOTE(review): the declaration of the device buffer `dtmp` was
            // lost in extraction here.
            auto presult = dtmp.data();
#else
            auto presult = hp;
#endif
            // Single-launch final combine across all streams' partials.
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
                if (gh.threadIdx() == 0) { *presult = dst; }
            });
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
#endif
#else
            // Single-block final combine across all streams' partials.
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
            [=] AMREX_GPU_DEVICE () noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
                if (threadIdx.x == 0) { *hp = dst; }
            });
#endif
            // Host reads *hp next; wait for the combine (and any copies).
            Gpu::streamSynchronize();
        }

        m_result_is_ready = true;
        return *hp;
    }
916
private:
    template <typename... T> friend class ReduceData;
    // True once value() has computed and cached the result; reset by
    // ReduceData's constructor via resetResultReadiness().
    bool m_result_is_ready = false;
    void resetResultReadiness () { m_result_is_ready = false; }
};
922
923namespace Reduce {
924
925template <typename T, typename N,
926 std::enable_if_t<std::is_integral_v<N>,int> FOO>
927T Sum (N n, T const* v, T init_val)
928{
929 ReduceOps<ReduceOpSum> reduce_op;
930 ReduceData<T> reduce_data(reduce_op);
931 using ReduceTuple = typename decltype(reduce_data)::Type;
932 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
933 ReduceTuple hv = reduce_data.value(reduce_op);
934 return amrex::get<0>(hv) + init_val;
935}
936
937template <typename T, typename N, typename F,
938 std::enable_if_t<std::is_integral_v<N> &&
939 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
940T Sum (N n, F const& f, T init_val)
941{
942 ReduceOps<ReduceOpSum> reduce_op;
943 ReduceData<T> reduce_data(reduce_op);
944 using ReduceTuple = typename decltype(reduce_data)::Type;
945 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
946 ReduceTuple hv = reduce_data.value(reduce_op);
947 return amrex::get<0>(hv) + init_val;
948}
949
950template <typename T, typename N,
951 std::enable_if_t<std::is_integral_v<N>,int> FOO>
952T Min (N n, T const* v, T init_val)
953{
954 ReduceOps<ReduceOpMin> reduce_op;
955 ReduceData<T> reduce_data(reduce_op);
956 using ReduceTuple = typename decltype(reduce_data)::Type;
957 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
958 ReduceTuple hv = reduce_data.value(reduce_op);
959 return std::min(amrex::get<0>(hv),init_val);
960}
961
962template <typename T, typename N, typename F,
963 std::enable_if_t<std::is_integral_v<N> &&
964 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
965T Min (N n, F const& f, T init_val)
966{
967 ReduceOps<ReduceOpMin> reduce_op;
968 ReduceData<T> reduce_data(reduce_op);
969 using ReduceTuple = typename decltype(reduce_data)::Type;
970 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
971 ReduceTuple hv = reduce_data.value(reduce_op);
972 return std::min(amrex::get<0>(hv),init_val);
973}
974
975template <typename T, typename N,
976 std::enable_if_t<std::is_integral_v<N>,int> FOO>
977T Max (N n, T const* v, T init_val)
978{
979 ReduceOps<ReduceOpMax> reduce_op;
980 ReduceData<T> reduce_data(reduce_op);
981 using ReduceTuple = typename decltype(reduce_data)::Type;
982 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
983 ReduceTuple hv = reduce_data.value(reduce_op);
984 return std::max(amrex::get<0>(hv),init_val);
985}
986
987template <typename T, typename N, typename F,
988 std::enable_if_t<std::is_integral_v<N> &&
989 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
990T Max (N n, F const& f, T init_val)
991{
992 ReduceOps<ReduceOpMax> reduce_op;
993 ReduceData<T> reduce_data(reduce_op);
994 using ReduceTuple = typename decltype(reduce_data)::Type;
995 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
996 ReduceTuple hv = reduce_data.value(reduce_op);
997 return std::max(amrex::get<0>(hv),init_val);
998}
999
1000template <typename T, typename N,
1001 std::enable_if_t<std::is_integral_v<N>,int> FOO>
1002std::pair<T,T> MinMax (N n, T const* v)
1003{
1005 ReduceData<T,T> reduce_data(reduce_op);
1006 using ReduceTuple = typename decltype(reduce_data)::Type;
1007 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
1008 return {v[i],v[i]};
1009 });
1010 auto hv = reduce_data.value(reduce_op);
1011 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1012}
1013
1014template <typename T, typename N, typename F,
1015 std::enable_if_t<std::is_integral_v<N> &&
1016 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1017std::pair<T,T> MinMax (N n, F const& f)
1018{
1020 ReduceData<T,T> reduce_data(reduce_op);
1021 using ReduceTuple = typename decltype(reduce_data)::Type;
1022 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
1023 T tmp = f(i);
1024 return {tmp,tmp};
1025 });
1026 auto hv = reduce_data.value(reduce_op);
1027 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1028}
1029
// True if pred(v[i]) holds for any i in [0,n).  A device flag starts at the
// value held by `ds`; each block early-outs if the flag is already set, and
// any block that finds a match sets it to 1.
template <typename T, typename N, typename P,
          std::enable_if_t<std::is_integral_v<N>,int> FOO>
bool AnyOf (N n, T const* v, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    // NOTE(review): the declaration of the device scalar `ds` (whose
    // dataPtr/dataValue are used below) was lost in extraction here.
    int* dp = ds.dataPtr();
    auto ec = Gpu::ExecutionConfig(n);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    // Shared scratch: per-warp ints for blockReduce plus one flag slot.
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        gh.sharedBarrier();

        if (!(*has_any))
        {
            int r = false;
            // Grid-stride scan; each thread stops at its first match.
            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }

            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            // Grid-stride scan; each thread stops at its first match.
            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) *dp = 1;
        }
    });
#endif
    return ds.dataValue();
}
1086
// True if pred holds for any cell of box, using the same device-flag,
// early-out scheme as the pointer overload above.
template <typename P, int dim>
bool AnyOf (BoxND<dim> const& box, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    // NOTE(review): the declaration of the device scalar `ds` (whose
    // dataPtr/dataValue are used below) was lost in extraction here.
    int* dp = ds.dataPtr();
    const BoxIndexerND<dim> indexer(box);
    auto ec = Gpu::ExecutionConfig(box.numPts());
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    // Shared scratch: per-warp ints for blockReduce plus one flag slot.
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        gh.sharedBarrier();

        if (!(*has_any))
        {
            int r = false;
            // Grid-stride scan over flattened cells; stop at first match.
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
                 icell < indexer.numPts() && !r;
                 icell += stride)
            {
                auto iv = indexer.intVect(icell);
                r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
                        Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            // Grid-stride scan over flattened cells; stop at first match.
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 icell < indexer.numPts() && !r;
                 icell += stride)
            {
                auto iv = indexer.intVect(icell);
                r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) *dp = 1;
        }
    });
#endif
    return ds.dataValue();
}
1149
1150}
1151
1152#else
1153
1154template <typename... Ts>
1155class ReduceData
1156{
1157public:
1158 using Type = GpuTuple<Ts...>;
1159
1160 template <typename... Ps>
1161 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
1162 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1163 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1164 {
1165 reduce_op.resetResultReadiness();
1166 for (auto& t : m_tuple) {
1167 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1168 }
1169 }
1170
1171 ~ReduceData () = default;
1172 ReduceData (ReduceData<Ts...> const&) = delete;
1173 ReduceData (ReduceData<Ts...> &&) = delete;
1174 void operator= (ReduceData<Ts...> const&) = delete;
1175 void operator= (ReduceData<Ts...> &&) = delete;
1176
1177 Type value () { return m_fn_value(); }
1178
1179 template <typename... Ps>
1180 Type value (ReduceOps<Ps...>& reduce_op)
1181 {
1182 return reduce_op.value(*this);
1183 }
1184
1185 Vector<Type>& reference () { return m_tuple; }
1186
1187 Type& reference (int tid)
1188 {
1189 if (m_tuple.size() == 1) {
1190 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1191 return m_tuple[0];
1192 } else {
1193 return m_tuple[tid];
1194 }
1195 }
1196
1197private:
1198 Vector<Type> m_tuple;
1199 std::function<Type()> m_fn_value;
1200};
1201
1202namespace Reduce::detail {
1203
1204 // call_f_intvect
1205
1206 template <typename F, int dim>
1208 auto call_f_intvect (F const& f, IntVectND<dim> iv) noexcept ->
1209 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
1210 {
1211 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
1212 }
1213
1214 // call_f_intvect_n
1215
1216 template <typename F, typename T, int dim>
1218 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
1219 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
1220 {
1221 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
1222 }
1223}
1224
1225template <typename... Ps>
1226class ReduceOps
1227{
1228private:
1229
1230 // call_f_box
1231
1232 template <typename D, typename F, int dim>
1234 static auto call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f)
1235 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(
1236 Reduce::detail::call_f_intvect(f, IntVectND<dim>(0))
1237 )>, typename D::Type>>
1238 {
1239 using ReduceTuple = typename D::Type;
1240 For(box,
1241 [&] (IntVectND<dim> iv) {
1242 auto pr = Reduce::detail::call_f_intvect(f, iv);
1243 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
1244 });
1245 }
1246
1247 template <typename D, typename F, int dim>
1249 static auto call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f)
1250 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(box))>,
1251 typename D::Type>>
1252 {
1253 using ReduceTuple = typename D::Type;
1254 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1255 }
1256
1257public:
1258
1259 template <typename MF, typename D, typename F>
1260 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1261 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1262 {
1263 using ReduceTuple = typename D::Type;
1264#ifdef AMREX_USE_OMP
1265#pragma omp parallel
1266#endif
1267 {
1268 ReduceTuple rr;
1269 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1270 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1271 Box const& b = mfi.growntilebox(nghost);
1272 const int li = mfi.LocalIndex();
1273 const auto lo = amrex::lbound(b);
1274 const auto hi = amrex::ubound(b);
1275 for (int k = lo.z; k <= hi.z; ++k) {
1276 for (int j = lo.y; j <= hi.y; ++j) {
1277 for (int i = lo.x; i <= hi.x; ++i) {
1278 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1279 }}}
1280 }
1281 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1282 reduce_data.reference(OpenMP::get_thread_num()), rr);
1283 }
1284 }
1285
1286 template <typename MF, typename D, typename F>
1287 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1288 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1289 {
1290 using ReduceTuple = typename D::Type;
1291#ifdef AMREX_USE_OMP
1292#pragma omp parallel
1293#endif
1294 {
1295 ReduceTuple rr;
1296 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1297 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1298 Box const& b = mfi.growntilebox(nghost);
1299 const int li = mfi.LocalIndex();
1300 const auto lo = amrex::lbound(b);
1301 const auto hi = amrex::ubound(b);
1302 for (int n = 0; n < ncomp; ++n) {
1303 for (int k = lo.z; k <= hi.z; ++k) {
1304 for (int j = lo.y; j <= hi.y; ++j) {
1305 for (int i = lo.x; i <= hi.x; ++i) {
1306 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1307 }}}}
1308 }
1309 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1310 reduce_data.reference(OpenMP::get_thread_num()), rr);
1311 }
1312 }
1313
1314 template <typename D, typename F, int dim>
1315 void eval (BoxND<dim> const& box, D & reduce_data, F&& f)
1316 {
1317 using ReduceTuple = typename D::Type;
1318 ReduceTuple rr;
1319 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1320 call_f_box<D>(box, rr, std::forward<F>(f));
1321 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1322 reduce_data.reference(OpenMP::get_thread_num()), rr);
1323 }
1324
1325 template <typename N, typename D, typename F, int dim,
1326 typename M=std::enable_if_t<std::is_integral_v<N>> >
1327 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
1328 {
1329 using ReduceTuple = typename D::Type;
1330 ReduceTuple rr;
1331 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1332 For(box, ncomp,
1333 [&] (IntVectND<dim> iv, int n) {
1334 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
1335 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
1336 });
1337 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1338 reduce_data.reference(OpenMP::get_thread_num()), rr);
1339 }
1340
1341 template <typename N, typename D, typename F,
1342 typename M=std::enable_if_t<std::is_integral_v<N>> >
1343 void eval (N n, D & reduce_data, F const& f)
1344 {
1345 using ReduceTuple = typename D::Type;
1346 ReduceTuple rr;
1347 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1348 for (N i = 0; i < n; ++i) {
1349 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1350 }
1351 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1352 reduce_data.reference(OpenMP::get_thread_num()), rr);
1353 }
1354
1355 template <typename D>
1356 typename D::Type value (D & reduce_data)
1357 {
1358 auto& rrv = reduce_data.reference();
1359 if (! m_result_is_ready) {
1360 using ReduceTuple = typename D::Type;
1361 if (rrv.size() > 1) {
1362 for (int i = 1, N = rrv.size(); i < N; ++i) {
1363 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1364 }
1365 }
1366 m_result_is_ready = true;
1367 }
1368 return rrv[0];
1369 }
1370
1371private:
1372 template <typename... T> friend class ReduceData;
1373 bool m_result_is_ready = false;
1374 void resetResultReadiness () { m_result_is_ready = false; }
1375};
1376
1377namespace Reduce {
1378
1379template <typename T, typename N, typename F,
1380 std::enable_if_t<std::is_integral_v<N> &&
1381 !std::is_same_v<T*,std::decay_t<F>>,int> >
1382T Sum (N n, F const& f, T init_val)
1383{
1384 T r = init_val;
1385#ifdef AMREX_USE_OMP
1386#pragma omp parallel for reduction(+:r)
1387#endif
1388 for (N i = 0; i < n; ++i) {
1389 r += f(i);
1390 }
1391 return r;
1392}
1393
1394template <typename T, typename N,
1395 std::enable_if_t<std::is_integral_v<N>,int> >
1396T Sum (N n, T const* v, T init_val)
1397{
1398 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1399}
1400
1401template <typename T, typename N, typename F,
1402 std::enable_if_t<std::is_integral_v<N> &&
1403 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1404T Min (N n, F const& f, T init_val)
1405{
1406 T r = init_val;
1407#ifdef AMREX_USE_OMP
1408#pragma omp parallel for reduction(min:r)
1409#endif
1410 for (N i = 0; i < n; ++i) {
1411 r = std::min(r,f(i));
1412 }
1413 return r;
1414}
1415
1416template <typename T, typename N,
1417 std::enable_if_t<std::is_integral_v<N>,int> >
1418T Min (N n, T const* v, T init_val)
1419{
1420 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1421}
1422
1423template <typename T, typename N, typename F,
1424 std::enable_if_t<std::is_integral_v<N> &&
1425 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1426T Max (N n, F const& f, T init_val)
1427{
1428 T r = init_val;
1429#ifdef AMREX_USE_OMP
1430#pragma omp parallel for reduction(max:r)
1431#endif
1432 for (N i = 0; i < n; ++i) {
1433 r = std::max(r,f(i));
1434 }
1435 return r;
1436}
1437
1438template <typename T, typename N,
1439 std::enable_if_t<std::is_integral_v<N>,int> >
1440T Max (N n, T const* v, T init_val)
1441{
1442 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1443}
1444
1445template <typename T, typename N, typename F,
1446 std::enable_if_t<std::is_integral_v<N> &&
1447 !std::is_same_v<T*,std::decay_t<F>>,int> FOO>
1448std::pair<T,T> MinMax (N n, F const& f)
1449{
1450 T r_min = std::numeric_limits<T>::max();
1451 T r_max = std::numeric_limits<T>::lowest();
1452#ifdef AMREX_USE_OMP
1453#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1454#endif
1455 for (N i = 0; i < n; ++i) {
1456 T tmp = f(i);
1457 r_min = std::min(r_min,tmp);
1458 r_max = std::max(r_max,tmp);
1459 }
1460 return std::make_pair(r_min,r_max);
1461}
1462
1463template <typename T, typename N, typename M>
1464std::pair<T,T> MinMax (N n, T const* v)
1465{
1466 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1467}
1468
1469template <typename T, typename N, typename P,
1470 std::enable_if_t<std::is_integral_v<N>,int> >
1471bool AnyOf (N n, T const* v, P&& pred)
1472{
1473 return std::any_of(v, v+n, std::forward<P>(pred));
1474}
1475
1476template <typename P, int dim>
1477bool AnyOf (BoxND<dim> const& box, P const& pred)
1478{
1479 for (auto iv : box.iterator()) { // NOLINT(readability-use-anyofallof)
1480 if (Reduce::detail::call_f_intvect(pred, iv)) { return true; }
1481 }
1482 return false;
1483}
1484
1485}
1486
1487#endif
1488
1493template <typename... Ts, typename... Ps>
1495constexpr GpuTuple<Ts...>
1497{
1498 GpuTuple<Ts...> r{};
1499 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1500 return r;
1501}
1502
// NOTE(review): this is a second IdentityTuple overload whose qualifier and
// declarator lines (original lines 1508 and 1510) were lost during text
// extraction; its parameter list cannot be recovered from this view --
// restore it from the upstream AMReX_Reduce.H before compiling.
// Body: value-initializes a GpuTuple and sets each element to the identity
// of the corresponding reduction operation in Ps... .
1507template <typename... Ts, typename... Ps>
1509constexpr GpuTuple<Ts...>
1511{
1512 GpuTuple<Ts...> r{};
1513 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1514 return r;
1515}
1516
1518template <typename Ops, typename Ts>
1519class ReducerImpl;
1520
1521template <typename... Ops, typename... Ts>
1522class ReducerImpl<TypeList<Ops...>, TypeList<Ts...>>
1523{
1524public:
1525 static_assert(sizeof...(Ops) > 0);
1526 static_assert(sizeof...(Ts) > 0);
1527 static_assert(sizeof...(Ops) == sizeof...(Ts));
1528
1529 ReducerImpl ()
1530 : m_reduce_data(m_reduce_op)
1531 {}
1532
1533protected:
1534 using Result_t = GpuTuple<Ts...>;
1535 ReduceOps<Ops...> m_reduce_op;
1536 ReduceData<Ts...> m_reduce_data;
1537};
1539
1604template <typename Ops, typename Ts>
1606 : public ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>
1607{
1608 using Base = ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>;
1609public:
1610
1611 using Result_t = typename Base::Result_t;
1612 static constexpr int size = GpuTupleSize<Result_t>::value;
1613
1614 Reducer () = default;
1615 ~Reducer () = default;
1616
1618 Reducer (Reducer const&) = delete;
1619 Reducer (Reducer &&) = delete;
1620 void operator= (Reducer const&) = delete;
1621 void operator= (Reducer &&) = delete;
1623
1637 template <typename F, int dim>
1638 std::enable_if_t<IsCallable<F, int, int, int>::value ||
1640 eval (BoxND<dim> const& box, F&& f)
1641 {
1642 this->m_reduce_op.eval(box, this->m_reduce_data, std::forward<F>(f));
1643 }
1644
1660 template <typename F, int dim>
1661 std::enable_if_t<IsCallable<F, int, int, int, int>::value ||
1662 IsCallable<F, IntVectND<dim>, int>::value>
1663 eval (BoxND<dim> const& box, int ncomp, F&& f)
1664 {
1665 this->m_reduce_op.eval(box, ncomp, this->m_reduce_data, std::forward<F>(f));
1666 }
1667
1687 template <typename MF, typename F>
1688 std::enable_if_t<IsFabArray<MF>::value &&
1690 eval (MF const& mf, IntVect const& nghost, F && f)
1691 {
1692 this->m_reduce_op.eval(mf, nghost, this->m_reduce_data, std::forward<F>(f));
1693 }
1694
1717 template <typename MF, typename F>
1718 std::enable_if_t<IsFabArray<MF>::value &&
1720 eval (MF const& mf, IntVect const& nghost, int ncomp, F && f)
1721 {
1722 this->m_reduce_op.eval(mf, nghost, ncomp, this->m_reduce_data, std::forward<F>(f));
1723 }
1724
1737 template <typename N, typename F>
1738 std::enable_if_t<IsCallable<F, N>::value>
1739 eval (N n, F && f)
1740 {
1741 this->m_reduce_op.eval(n, this->m_reduce_data, std::forward<F>(f));
1742 }
1743
1754 [[nodiscard]] Result_t getResult ()
1755 {
1756 return this->m_reduce_data.value(this->m_reduce_op);
1757 }
1758};
1759
1760}
1761
1762#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_MAX_STREAMS
Definition AMReX_GpuDevice.H:21
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:36
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1090
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:356
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:135
GPU-compatible tuple.
Definition AMReX_Tuple.H:98
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:701
Cell-Based or Node-Based Indices.
Definition AMReX_IndexType.H:36
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:85
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:169
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
T * data() noexcept
Definition AMReX_PODVector.H:666
Definition AMReX_Reduce.H:453
~ReduceData()
Definition AMReX_Reduce.H:475
int maxStreamIndex() const
Definition AMReX_Reduce.H:508
Type value()
Definition AMReX_Reduce.H:485
void updateMaxStreamIndex(gpuStream_t const &s)
Definition AMReX_Reduce.H:509
int & nBlocks(gpuStream_t const &s)
Definition AMReX_Reduce.H:504
ReduceData(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:458
Type * devicePtr(gpuStream_t const &s)
Definition AMReX_Reduce.H:497
Type value(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:491
Type * devicePtr()
Definition AMReX_Reduce.H:496
GpuArray< int, 8 > & nBlocks()
Definition AMReX_Reduce.H:503
ReduceData(ReduceData< Ts... > const &)=delete
Type * hostPtr()
Definition AMReX_Reduce.H:501
int maxBlocks() const
Definition AMReX_Reduce.H:506
ReduceData(ReduceData< Ts... > &&)=delete
Definition AMReX_Reduce.H:582
D::Type value(D &reduce_data)
Definition AMReX_Reduce.H:833
void eval(BoxND< dim > const &box, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:769
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:716
void eval(BoxND< dim > const &box, N ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:776
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, int ncomp, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:744
void eval(N n, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:783
Class for local reductions (e.g., sum, min and max).
Definition AMReX_Reduce.H:1607
std::enable_if_t< IsCallable< F, int, int, int >::value||IsCallable< F, IntVectND< dim > >::value > eval(BoxND< dim > const &box, F &&f)
Reduction over a Box.
Definition AMReX_Reduce.H:1640
std::enable_if_t< IsFabArray< MF >::value &&IsCallable< F, int, int, int, int, int >::value > eval(MF const &mf, IntVect const &nghost, int ncomp, F &&f)
Reduction over a MultiFab-like object.
Definition AMReX_Reduce.H:1720
Reducer()=default
std::enable_if_t< IsCallable< F, N >::value > eval(N n, F &&f)
Reduction over a 1D index range.
Definition AMReX_Reduce.H:1739
Result_t getResult()
Get the final reduction result.
Definition AMReX_Reduce.H:1754
typename Base::Result_t Result_t
Reduction result type, GpuTuple<U...>, where U... are the types in Ts.
Definition AMReX_Reduce.H:1611
~Reducer()=default
std::enable_if_t< IsFabArray< MF >::value &&IsCallable< F, int, int, int, int >::value > eval(MF const &mf, IntVect const &nghost, F &&f)
Reduction over a MultiFab-like object.
Definition AMReX_Reduce.H:1690
std::enable_if_t< IsCallable< F, int, int, int, int >::value||IsCallable< F, IntVectND< dim >, int >::value > eval(BoxND< dim > const &box, int ncomp, F &&f)
Reduction over a Box plus component index.
Definition AMReX_Reduce.H:1663
amrex_long Long
Definition AMReX_INT.H:30
T Min(N n, T const *v, T init_val=std::numeric_limits< T >::max())
Compute the minimum of an array of values.
Definition AMReX_Reduce.H:952
T Max(N n, T const *v, T init_val=std::numeric_limits< T >::lowest())
Compute the maximum of an array of values.
Definition AMReX_Reduce.H:977
std::pair< T, T > MinMax(N n, T const *v)
Compute the minimum and maximum of an array of values.
Definition AMReX_Reduce.H:1002
bool AnyOf(N n, T const *v, P const &pred)
Test whether any element in an array satisfies a unary predicate.
Definition AMReX_Reduce.H:1032
T Sum(N n, T const *v, T init_val=0)
Compute the sum of an array of values.
Definition AMReX_Reduce.H:927
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1331
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1280
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:823
Arena * The_Arena()
Definition AMReX_Arena.cpp:783
void Sum(T &v, MPI_Comm comm)
Definition AMReX_ParallelReduce.H:221
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:420
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:357
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:556
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:455
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:400
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:508
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:350
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
__host__ __device__ constexpr GpuTuple< Ts... > IdentityTuple(GpuTuple< Ts... >, ReduceOps< Ps... >) noexcept
Return a GpuTuple containing the identity element for each operation in ReduceOps....
Definition AMReX_Reduce.H:1496
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
typename ToTypeList< T >::type ToTypeList_t
Definition AMReX_TypeList.H:233
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:44
const int[]
Definition AMReX_BLProfiler.cpp:1664
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
Definition AMReX_Box.H:2152
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2169
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2193
Fixed-size array that can be used on GPU.
Definition AMReX_Array.H:43
Definition AMReX_Tuple.H:125
Definition AMReX_GpuMemory.H:56
T dataValue() const
Definition AMReX_GpuMemory.H:92
T * dataPtr()
Definition AMReX_GpuMemory.H:90
Definition AMReX_GpuLaunch.H:119
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuControl.H:131
Definition AMReX_GpuReduce.H:287
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:213
Definition AMReX_Functional.H:14
Definition AMReX_Reduce.H:382
constexpr std::enable_if_t< std::is_integral_v< T > > init(T &t) const noexcept
Definition AMReX_Reduce.H:410
__device__ std::enable_if_t< std::is_integral_v< T > > parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:396
__host__ __device__ std::enable_if_t< std::is_integral_v< T > > local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:406
Definition AMReX_Reduce.H:415
constexpr std::enable_if_t< std::is_integral_v< T > > init(T &t) const noexcept
Definition AMReX_Reduce.H:443
__host__ __device__ std::enable_if_t< std::is_integral_v< T > > local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:439
__device__ std::enable_if_t< std::is_integral_v< T > > parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:429
Definition AMReX_Reduce.H:348
constexpr std::enable_if_t< std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:373
constexpr std::enable_if_t<!std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:377
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:369
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:360
Definition AMReX_Reduce.H:314
constexpr std::enable_if_t<!std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:343
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:326
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:335
constexpr std::enable_if_t< std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:339
Definition AMReX_Reduce.H:284
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:297
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:306
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:309
Struct for holding types.
Definition AMReX_TypeList.H:13