Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_Reduce.H
Go to the documentation of this file.
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Concepts.H>
6#include <AMReX_Gpu.H>
7#include <AMReX_Arena.H>
8#include <AMReX_OpenMP.H>
9#include <AMReX_MFIter.H>
10#include <AMReX_TypeList.H>
11#include <AMReX_ValLocPair.H>
12
13#include <algorithm>
14#include <concepts>
15#include <functional>
16#include <limits>
17
18namespace amrex {
19
20namespace Reduce {
21
22// The declaration of these functions are here to work around doxygen issues.
23
38template <typename T, std::integral N>
39T Sum (N n, T const* v, T init_val = 0);
40
57template <typename T, std::integral N, typename F>
58requires (!std::same_as<T*,std::decay_t<F>>)
59T Sum (N n, F const& f, T init_val = 0);
60
75template <typename T, std::integral N>
76T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max());
77
94template <typename T, std::integral N, typename F>
95requires (!std::same_as<T*,std::decay_t<F>>)
96T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max());
97
112template <typename T, std::integral N>
113T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest());
114
131template <typename T, std::integral N, typename F>
132requires (!std::same_as<T*,std::decay_t<F>>)
133T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest());
134
148template <typename T, std::integral N>
149std::pair<T,T> MinMax (N n, T const* v);
150
166template <typename T, std::integral N, typename F>
167requires (!std::same_as<T*,std::decay_t<F>>)
168std::pair<T,T> MinMax (N n, F const& f);
169
185template <typename T, std::integral N, typename P>
186bool AnyOf (N n, T const* v, P const& pred);
187
201template <typename P, int dim>
202bool AnyOf (BoxND<dim> const& box, P const& pred);
203
204}
205
207namespace Reduce::detail {
208
209#ifdef AMREX_USE_GPU
210#ifdef AMREX_USE_SYCL
211 template <std::size_t I, typename T, typename P>
213 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
214 {
215 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
216 }
217
218 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
220 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
221 {
222 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
223 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
224 }
225#else
226 template <std::size_t I, typename T, typename P>
228 void for_each_parallel (T& d, T const& s)
229 {
230 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
231 }
232
233 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
235 void for_each_parallel (T& d, T const& s)
236 {
237 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
238 for_each_parallel<I+1,T,P1,Ps...>(d, s);
239 }
240#endif
241#endif
242
243 template <std::size_t I, typename T, typename P>
245 void for_each_local (T& d, T const& s)
246 {
247 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
248 }
249
250 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
252 void for_each_local (T& d, T const& s)
253 {
254 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
255 for_each_local<I+1,T,P1,Ps...>(d, s);
256 }
257
258 template <std::size_t I, typename T, typename P>
260 constexpr void for_each_init (T& t)
261 {
262 P().init(amrex::get<I>(t));
263 }
264
265 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
267 constexpr void for_each_init (T& t)
268 {
269 P().init(amrex::get<I>(t));
270 for_each_init<I+1,T,P1,Ps...>(t);
271 }
272}
274
277{
278
279#ifdef AMREX_USE_GPU
280#ifdef AMREX_USE_SYCL
281 template <typename T>
283 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
284 T r = Gpu::blockReduceSum(s,h);
285 if (h.threadIdx() == 0) { d += r; }
286 }
287#else
288 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
290 void parallel_update (T& d, T const& s) const noexcept {
291 T r = Gpu::blockReduceSum<MT>(s);
292 if (threadIdx.x == 0) { d += r; }
293 }
294#endif
295#endif
296
297 template <typename T>
299 void local_update (T& d, T const& s) const noexcept { d += s; }
300
301 template <typename T>
302 constexpr void init (T& t) const noexcept { t = 0; }
303};
304
307{
308#ifdef AMREX_USE_GPU
309#ifdef AMREX_USE_SYCL
310 template <typename T>
312 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
313 T r = Gpu::blockReduceMin(s,h);
314 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
315 }
316#else
317 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
319 void parallel_update (T& d, T const& s) const noexcept {
320 T r = Gpu::blockReduceMin<MT>(s);
321 if (threadIdx.x == 0) { d = amrex::min(d,r); }
322 }
323#endif
324#endif
325
326 template <typename T>
328 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
329
330 template <typename T>
331 requires (std::numeric_limits<T>::is_specialized)
332 constexpr void init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
333
334 template <typename T>
335 requires (!std::numeric_limits<T>::is_specialized)
336 constexpr void init (T& t) const noexcept { t = T::max(); }
337};
338
341{
342#ifdef AMREX_USE_GPU
343#ifdef AMREX_USE_SYCL
344 template <typename T>
346 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
347 T r = Gpu::blockReduceMax(s,h);
348 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
349 }
350#else
351 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
353 void parallel_update (T& d, T const& s) const noexcept {
354 T r = Gpu::blockReduceMax<MT>(s);
355 if (threadIdx.x == 0) { d = amrex::max(d,r); }
356 }
357#endif
358#endif
359
360 template <typename T>
362 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
363
364 template <typename T>
365 requires (std::numeric_limits<T>::is_specialized)
366 constexpr void init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
367
368 template <typename T>
369 requires (!std::numeric_limits<T>::is_specialized)
370 constexpr void init (T& t) const noexcept { t = T::lowest(); }
371};
372
375{
376#ifdef AMREX_USE_GPU
377#ifdef AMREX_USE_SYCL
378 template <std::integral T>
380 void parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
382 if (h.threadIdx() == 0) { d = d && r; }
383 }
384#else
385 template <std::integral T, int MT=AMREX_GPU_MAX_THREADS>
387 void parallel_update (T& d, T s) const noexcept {
388 T r = Gpu::blockReduceLogicalAnd<MT>(s);
389 if (threadIdx.x == 0) { d = d && r; }
390 }
391#endif
392#endif
393
394 template <std::integral T>
396 void local_update (T& d, T s) const noexcept { d = d && s; }
397
398 template <std::integral T>
399 constexpr void init (T& t) const noexcept { t = true; }
400};
401
404{
405#ifdef AMREX_USE_GPU
406#ifdef AMREX_USE_SYCL
407 template <std::integral T>
409 void parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
410 T r = Gpu::blockReduceLogicalOr(s,h);
411 if (h.threadIdx() == 0) { d = d || r; }
412 }
413#else
414 template <std::integral T, int MT=AMREX_GPU_MAX_THREADS>
416 void parallel_update (T& d, T s) const noexcept {
417 T r = Gpu::blockReduceLogicalOr<MT>(s);
418 if (threadIdx.x == 0) { d = d || r; }
419 }
420#endif
421#endif
422
423 template <std::integral T>
425 void local_update (T& d, T s) const noexcept { d = d || s; }
426
427 template <std::integral T>
428 constexpr void init (T& t) const noexcept { t = false; }
429};
430
431template <typename... Ps> class ReduceOps;
432
433#ifdef AMREX_USE_GPU
434
436template <typename... Ts>
438{
439public:
440 using Type = GpuTuple<Ts...>;
441
442 template <typename... Ps>
443 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
444 : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
445 m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
446 m_device_tuple((Type*)(The_Arena()->alloc((AMREX_GPU_MAX_STREAMS)
447 * m_max_blocks * sizeof(Type)))),
448 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
449 {
450 reduce_op.resetResultReadiness();
451 static_assert(std::is_trivially_copyable<Type>(),
452 "ReduceData::Type must be trivially copyable");
453 static_assert(std::is_trivially_destructible<Type>(),
454 "ReduceData::Type must be trivially destructible");
455
456 new (m_host_tuple) Type();
457 m_nblocks.fill(0);
458 }
459
462 !m_used_external_stream || m_value_called,
463 "ReduceData used on an external GPU stream must call value() before destruction.");
464 The_Pinned_Arena()->free(m_host_tuple);
465 The_Arena()->free(m_device_tuple);
466 }
467
468 ReduceData (ReduceData<Ts...> const&) = delete;
470 void operator= (ReduceData<Ts...> const&) = delete;
471 void operator= (ReduceData<Ts...> &&) = delete;
472
474 {
475 Type r = m_fn_value();
476 m_value_called = true;
477 return r;
478 }
479
480 template <typename... Ps>
482 {
483 Type r = reduce_op.value(*this);
484 m_value_called = true;
485 return r;
486 }
487
488 Type* devicePtr () { return m_device_tuple; }
490 return m_device_tuple+streamIndexChecked(s)*m_max_blocks;
491 }
492
493 Type* hostPtr () { return m_host_tuple; }
494
496 int& nBlocks (gpuStream_t const& s) { return m_nblocks[streamIndexChecked(s)]; }
497
498 int maxBlocks () const { return m_max_blocks; }
499
500 int maxStreamIndex () const { return m_max_stream_index; }
502 m_max_stream_index = std::max(m_max_stream_index,streamIndexChecked(s));
503 }
504
505 void markValueCalled () noexcept { m_value_called = true; }
506
507private:
508 int streamIndexChecked (gpuStream_t const& s)
509 {
510 int const idx = Gpu::Device::streamIndex(s);
511 m_used_external_stream = m_used_external_stream || Gpu::Device::usingExternalStream();
512 if (idx == 0) {
513 if (m_stream_index_zero_set) {
514 AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stream_index_zero == s,
515 "ReduceData cannot be reused across different external GPU streams "
516 "or between an external GPU stream and AMReX stream 0.");
517 } else {
518 m_stream_index_zero = s;
519 m_stream_index_zero_set = true;
520 }
521 }
522 return idx;
523 }
524
525 int m_max_blocks;
526 int m_max_stream_index = 0;
527 Type* m_host_tuple = nullptr;
528 Type* m_device_tuple = nullptr;
529 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
530 gpuStream_t m_stream_index_zero{};
531 bool m_stream_index_zero_set = false;
532 bool m_used_external_stream = false;
533 bool m_value_called = false;
534 std::function<Type()> m_fn_value;
535};
536
538namespace Reduce::detail {
539
540 // call_f_intvect_box
541
542 template <typename F, int dim>
544 auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->
545 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
546 {
547 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
548 }
549
550 template <typename F, int dim>
552 auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
553 decltype(f(BoxND<dim>(iv, iv, t)))
554 {
555 return f(BoxND<dim>(iv, iv, t));
556 }
557
558 // call_f_intvect_n
559 template <typename F, typename T, int dim>
561 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
562 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
563 {
564 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
565 }
566
567 // mf_call_f
568
569 struct iterate_box {};
570 struct iterate_box_comp {};
571
572 template <typename I, typename F, typename T, typename... Ps>
573 requires (std::same_as<iterate_box,I>)
575 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
576 {
577 auto const& pr = f(ibox,i,j,k);
578 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
579 }
580
581 template <typename I, typename F, typename T, typename... Ps>
582 requires (std::same_as<iterate_box_comp,I>)
584 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
585 {
586 for (int n = 0; n < ncomp; ++n) {
587 auto const& pr = f(ibox,i,j,k,n);
588 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
589 }
590 }
591}
593
595template <typename... Ps>
597{
598public:
599
601
602 // This is public for CUDA
603 template <typename I, typename MF, typename D, typename F>
604 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
605 {
606 using ReduceTuple = typename D::Type;
607 const int nboxes = mf.local_size();
608 if (nboxes > 0) {
609 auto const& parforinfo = mf.getParForInfo(nghost);
610 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
611 AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
612 const int nblocks = nblocks_per_box * nboxes;
613 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
614
615 auto const& stream = Gpu::gpuStream();
616 auto pdst = reduce_data.devicePtr(stream);
617 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
618 AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
619 int& nblocks_ref = reduce_data.nBlocks(stream);
620 auto old_nblocks = static_cast<unsigned int>(nblocks_ref);
621 nblocks_ref = amrex::max(nblocks_ref, nblocks_ec);
622 reduce_data.updateMaxStreamIndex(stream);
623
624#ifdef AMREX_USE_SYCL
625 // device reduce needs local(i.e., shared) memory
626 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
627 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
628 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
629 {
630 Dim1 blockIdx {gh.blockIdx()};
631 Dim1 threadIdx{gh.threadIdx()};
632#else
633 amrex::launch_global<AMREX_GPU_MAX_THREADS>
634 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
635 ([=] AMREX_GPU_DEVICE () noexcept
636 {
637#endif
638 ReduceTuple r;
639 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
640 ReduceTuple& dst = pdst[blockIdx.x];
641 if (threadIdx.x == 0 && blockIdx.x >= old_nblocks) {
642 dst = r;
643 }
644 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
645 int ibox = iblock / nblocks_per_box;
646 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
647
648 BoxIndexer const& indexer = dp_boxes[ibox];
649 if (icell < indexer.numPts()) {
650 auto [i, j, k] = indexer(icell);
651 Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
652 (f, ibox, i, j, k, ncomp, r);
653 }
654 }
655#ifdef AMREX_USE_SYCL
656 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
657#else
658 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
659#endif
660 });
661 }
662 }
663
664 // This is public for CUDA
665 template <typename I, int dim, typename D, typename F>
666 void eval_box (I, BoxND<dim> const& box, int ncomp, D& reduce_data, F const& f)
667 {
668 using ReduceTuple = typename D::Type;
669 auto const& stream = Gpu::gpuStream();
670 auto dp = reduce_data.devicePtr(stream);
671 int& nblocks = reduce_data.nBlocks(stream);
672 const BoxIndexerND<dim> indexer(box);
673 IndexTypeND<dim> ixtype = box.ixType();
674 constexpr int nitems_per_thread = 4;
675 Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
676 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
677 nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
678 reduce_data.updateMaxStreamIndex(stream);
679#ifdef AMREX_USE_SYCL
680 // device reduce needs local(i.e., shared) memory
681 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
682 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
683 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
684 {
685 Dim1 blockIdx {gh.blockIdx()};
686 Dim1 threadIdx{gh.threadIdx()};
687 Dim1 gridDim {gh.gridDim()};
688#else
689 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
690 [=] AMREX_GPU_DEVICE () noexcept
691 {
692#endif
693 ReduceTuple r;
694 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
695 ReduceTuple& dst = *(dp+blockIdx.x);
696 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
697 dst = r;
698 }
699 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
700 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
701 icell < indexer.numPts();
702 icell += stride)
703 {
704 auto iv = indexer.intVect(icell);
705 amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture
706 if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
707 auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
708 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
709 } else {
710 for (int n = 0; n < ncomp; ++n) {
711 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
712 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
713 }
714 }
715 }
716#ifdef AMREX_USE_SYCL
717 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
718#else
719 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
720#endif
721 });
722 nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
723 }
724
726
727 template <FabArrayType MF, typename D, typename F>
728#ifndef AMREX_USE_CUDA
730#endif
731 void eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
732 {
733 using ReduceTuple = typename D::Type;
734 const int nboxes = mf.local_size();
735 if (nboxes == 0) {
736 return;
737 } else if (!mf.isFusingCandidate()) {
738 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
739 Box const& b = amrex::grow(mfi.validbox(), nghost);
740 const int li = mfi.LocalIndex();
741 this->eval(b, reduce_data,
742 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
743 {
744 return f(li, i, j, k);
745 });
746 }
747 } else {
748 eval_mf(Reduce::detail::iterate_box{},
749 mf, nghost, 0, reduce_data, std::forward<F>(f));
750 }
751 }
752
753 template <FabArrayType MF, typename D, typename F>
754#ifndef AMREX_USE_CUDA
756#endif
757 void eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
758 {
759 using ReduceTuple = typename D::Type;
760
761 const int nboxes = mf.local_size();
762
763 if (nboxes == 0) {
764 return;
765 } else if (!mf.isFusingCandidate()) {
766 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
767 Box const& b = amrex::grow(mfi.validbox(), nghost);
768 const int li = mfi.LocalIndex();
769 this->eval(b, ncomp, reduce_data,
770 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
771 {
772 return f(li, i, j, k, n);
773 });
774 }
775 } else {
776 eval_mf(Reduce::detail::iterate_box_comp{},
777 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
778 }
779 }
780
781 template <typename D, typename F, int dim>
782 void eval (BoxND<dim> const& box, D & reduce_data, F const& f)
783 {
784 eval_box(Reduce::detail::iterate_box{}, box, 0, reduce_data, f);
785 }
786
787 template <std::integral N, typename D, typename F, int dim>
788 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
789 {
790 eval_box(Reduce::detail::iterate_box_comp{}, box, ncomp, reduce_data, f);
791 }
792
793 template <std::integral N, typename D, typename F>
794 void eval (N n, D & reduce_data, F const& f)
795 {
796 if (n <= 0) { return; }
797 using ReduceTuple = typename D::Type;
798 auto const& stream = Gpu::gpuStream();
799 auto dp = reduce_data.devicePtr(stream);
800 int& nblocks = reduce_data.nBlocks(stream);
801 constexpr int nitems_per_thread = 4;
802 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
803 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
804 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
805 reduce_data.updateMaxStreamIndex(stream);
806#ifdef AMREX_USE_SYCL
807 // device reduce needs local(i.e., shared) memory
808 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
809 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
810 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
811 {
812 Dim1 blockIdx {gh.blockIdx()};
813 Dim1 threadIdx{gh.threadIdx()};
814 Dim1 gridDim {gh.gridDim()};
815#else
816 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
817 [=] AMREX_GPU_DEVICE () noexcept
818 {
819#endif
820 ReduceTuple r;
821 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
822 ReduceTuple& dst = *(dp+blockIdx.x);
823 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
824 dst = r;
825 }
826 for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
827 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
828 i < n;
829 i += stride)
830 {
831 auto pr = f(i);
832 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
833 }
834#ifdef AMREX_USE_SYCL
835 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
836#else
837 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
838#endif
839 });
840 nblocks = amrex::max(nblocks, nblocks_ec);
841 }
842
843 template <typename D>
844 typename D::Type value (D & reduce_data)
845 {
846 auto hp = reduce_data.hostPtr();
847
848 if (m_result_is_ready) {
849 reduce_data.markValueCalled();
850 return *hp;
851 }
852
853 using ReduceTuple = typename D::Type;
854 auto const& stream = Gpu::gpuStream();
855 auto dp = reduce_data.devicePtr();
856 auto const& nblocks = reduce_data.nBlocks();
857#if defined(AMREX_USE_SYCL)
858 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
859 const int N = nblocks[0];
860 if (N == 0) {
861 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
862 } else {
864 Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
865 Gpu::streamSynchronize();
866 for (int i = 1; i < N; ++i) {
867 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
868 }
869 *hp = tmp[0];
870 }
871 } else
872#endif
873 {
874 int maxblocks = reduce_data.maxBlocks();
875#ifdef AMREX_USE_SYCL
876 // device reduce needs local(i.e., shared) memory
877 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
878#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
879 // xxxxx SYCL todo: reduce bug workaround
881 auto presult = dtmp.data();
882#else
883 auto presult = hp;
884#endif
885 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
886 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
887 {
888 ReduceTuple r;
889 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
890 ReduceTuple dst = r;
891 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
892 auto dp_stream = dp+istream*maxblocks;
893 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
894 i < nblocks[istream]; i += stride) {
895 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
896 }
897 }
898 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
899 if (gh.threadIdx() == 0) { *presult = dst; }
900 });
901#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
902 Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
903#endif
904#else
905 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
906 [=] AMREX_GPU_DEVICE () noexcept
907 {
908 ReduceTuple r;
909 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
910 ReduceTuple dst = r;
911 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
912 auto dp_stream = dp+istream*maxblocks;
913 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
914 i < nblocks[istream]; i += stride) {
915 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
916 }
917 }
918 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
919 if (threadIdx.x == 0) { *hp = dst; }
920 });
921#endif
922 Gpu::streamSynchronize();
923 }
924
925 m_result_is_ready = true;
926 reduce_data.markValueCalled();
927 return *hp;
928 }
929
930private:
931 template <typename... T> friend class ReduceData;
932 bool m_result_is_ready = false;
933 void resetResultReadiness () { m_result_is_ready = false; }
934};
935
936namespace Reduce {
937
938template <typename T, std::integral N>
939T Sum (N n, T const* v, T init_val)
940{
941 ReduceOps<ReduceOpSum> reduce_op;
942 ReduceData<T> reduce_data(reduce_op);
943 using ReduceTuple = typename decltype(reduce_data)::Type;
944 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
945 ReduceTuple hv = reduce_data.value(reduce_op);
946 return amrex::get<0>(hv) + init_val;
947}
948
949template <typename T, std::integral N, typename F>
950requires (!std::same_as<T*,std::decay_t<F>>)
951T Sum (N n, F const& f, T init_val)
952{
953 ReduceOps<ReduceOpSum> reduce_op;
954 ReduceData<T> reduce_data(reduce_op);
955 using ReduceTuple = typename decltype(reduce_data)::Type;
956 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
957 ReduceTuple hv = reduce_data.value(reduce_op);
958 return amrex::get<0>(hv) + init_val;
959}
960
961template <typename T, std::integral N>
962T Min (N n, T const* v, T init_val)
963{
964 ReduceOps<ReduceOpMin> reduce_op;
965 ReduceData<T> reduce_data(reduce_op);
966 using ReduceTuple = typename decltype(reduce_data)::Type;
967 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
968 ReduceTuple hv = reduce_data.value(reduce_op);
969 return std::min(amrex::get<0>(hv),init_val);
970}
971
972template <typename T, std::integral N, typename F>
973requires (!std::same_as<T*,std::decay_t<F>>)
974T Min (N n, F const& f, T init_val)
975{
976 ReduceOps<ReduceOpMin> reduce_op;
977 ReduceData<T> reduce_data(reduce_op);
978 using ReduceTuple = typename decltype(reduce_data)::Type;
979 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
980 ReduceTuple hv = reduce_data.value(reduce_op);
981 return std::min(amrex::get<0>(hv),init_val);
982}
983
984template <typename T, std::integral N>
985T Max (N n, T const* v, T init_val)
986{
987 ReduceOps<ReduceOpMax> reduce_op;
988 ReduceData<T> reduce_data(reduce_op);
989 using ReduceTuple = typename decltype(reduce_data)::Type;
990 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
991 ReduceTuple hv = reduce_data.value(reduce_op);
992 return std::max(amrex::get<0>(hv),init_val);
993}
994
995template <typename T, std::integral N, typename F>
996requires (!std::same_as<T*,std::decay_t<F>>)
997T Max (N n, F const& f, T init_val)
998{
999 ReduceOps<ReduceOpMax> reduce_op;
1000 ReduceData<T> reduce_data(reduce_op);
1001 using ReduceTuple = typename decltype(reduce_data)::Type;
1002 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
1003 ReduceTuple hv = reduce_data.value(reduce_op);
1004 return std::max(amrex::get<0>(hv),init_val);
1005}
1006
1007template <typename T, std::integral N>
1008std::pair<T,T> MinMax (N n, T const* v)
1009{
1011 ReduceData<T,T> reduce_data(reduce_op);
1012 using ReduceTuple = typename decltype(reduce_data)::Type;
1013 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
1014 return {v[i],v[i]};
1015 });
1016 auto hv = reduce_data.value(reduce_op);
1017 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1018}
1019
1020template <typename T, std::integral N, typename F>
1021requires (!std::same_as<T*,std::decay_t<F>>)
1022std::pair<T,T> MinMax (N n, F const& f)
1023{
1025 ReduceData<T,T> reduce_data(reduce_op);
1026 using ReduceTuple = typename decltype(reduce_data)::Type;
1027 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
1028 T tmp = f(i);
1029 return {tmp,tmp};
1030 });
1031 auto hv = reduce_data.value(reduce_op);
1032 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1033}
1034
1035template <typename T, std::integral N, typename P>
1036bool AnyOf (N n, T const* v, P const& pred)
1037{
1038 Gpu::LaunchSafeGuard lsg(true);
1040 int* dp = ds.dataPtr();
1041 auto ec = Gpu::ExecutionConfig(n);
1042 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1043
1044#ifdef AMREX_USE_SYCL
1045 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1046 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
1047 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1048 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
1049 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
1050 if (gh.threadIdx() == 0) { *has_any = *dp; }
1051 gh.sharedBarrier();
1052
1053 if (!(*has_any))
1054 {
1055 int r = false;
1056 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
1057 i < n && !r; i += stride)
1058 {
1059 r = pred(v[i]) ? 1 : 0;
1060 }
1061
1062 r = Gpu::blockReduce<Gpu::Device::warp_size>
1063 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
1064 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1065 }
1066 });
1067#else
1068 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
1069 [=] AMREX_GPU_DEVICE () noexcept {
1070 __shared__ int has_any;
1071 if (threadIdx.x == 0) { has_any = *dp; }
1072 __syncthreads();
1073
1074 if (!has_any)
1075 {
1076 int r = false;
1077 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
1078 i < n && !r; i += stride)
1079 {
1080 r = pred(v[i]) ? 1 : 0;
1081 }
1082 r = Gpu::blockReduce<Gpu::Device::warp_size>
1083 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
1084 if (threadIdx.x == 0 && r) *dp = 1;
1085 }
1086 });
1087#endif
1088 return ds.dataValue();
1089}
1090
1091template <typename P, int dim>
1092bool AnyOf (BoxND<dim> const& box, P const& pred)
1093{
1094 Gpu::LaunchSafeGuard lsg(true);
1096 int* dp = ds.dataPtr();
1097 const BoxIndexerND<dim> indexer(box);
1098 auto ec = Gpu::ExecutionConfig(box.numPts());
1099 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1100
1101#ifdef AMREX_USE_SYCL
1102 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1103 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
1104 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1105 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
1106 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
1107 if (gh.threadIdx() == 0) { *has_any = *dp; }
1108 gh.sharedBarrier();
1109
1110 if (!(*has_any))
1111 {
1112 int r = false;
1113 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
1114 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
1115 icell < indexer.numPts() && !r;
1116 icell += stride)
1117 {
1118 auto iv = indexer.intVect(icell);
1119 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1120 }
1121 r = Gpu::blockReduce<Gpu::Device::warp_size>
1122 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
1123 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1124 }
1125 });
1126#else
1127 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
1128 Gpu::gpuStream(),
1129 [=] AMREX_GPU_DEVICE () noexcept {
1130 __shared__ int has_any;
1131 if (threadIdx.x == 0) { has_any = *dp; }
1132 __syncthreads();
1133
1134 if (!has_any)
1135 {
1136 int r = false;
1137 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
1138 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
1139 icell < indexer.numPts() && !r;
1140 icell += stride)
1141 {
1142 auto iv = indexer.intVect(icell);
1143 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1144 }
1145 r = Gpu::blockReduce<Gpu::Device::warp_size>
1146 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
1147 if (threadIdx.x == 0 && r) *dp = 1;
1148 }
1149 });
1150#endif
1151 return ds.dataValue();
1152}
1153
1154}
1155
1156#else
1157
1158template <typename... Ts>
1159class ReduceData
1160{
1161public:
1162 using Type = GpuTuple<Ts...>;
1163
1164 template <typename... Ps>
1165 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
1166 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1167 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1168 {
1169 reduce_op.resetResultReadiness();
1170 for (auto& t : m_tuple) {
1171 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1172 }
1173 }
1174
1175 ~ReduceData () = default;
1176 ReduceData (ReduceData<Ts...> const&) = delete;
1177 ReduceData (ReduceData<Ts...> &&) = delete;
1178 void operator= (ReduceData<Ts...> const&) = delete;
1179 void operator= (ReduceData<Ts...> &&) = delete;
1180
1181 Type value () { return m_fn_value(); }
1182
1183 template <typename... Ps>
1184 Type value (ReduceOps<Ps...>& reduce_op)
1185 {
1186 return reduce_op.value(*this);
1187 }
1188
1189 Vector<Type>& reference () { return m_tuple; }
1190
1191 Type& reference (int tid)
1192 {
1193 if (m_tuple.size() == 1) {
1194 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1195 return m_tuple[0];
1196 } else {
1197 return m_tuple[tid];
1198 }
1199 }
1200
1201private:
1202 Vector<Type> m_tuple;
1203 std::function<Type()> m_fn_value;
1204};
1205
1206namespace Reduce::detail {
1207
1208 // call_f_intvect
1209
1210 template <typename F, int dim>
1212 auto call_f_intvect (F const& f, IntVectND<dim> iv) noexcept ->
1213 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
1214 {
1215 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
1216 }
1217
1218 // call_f_intvect_n
1219
1220 template <typename F, typename T, int dim>
1222 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
1223 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
1224 {
1225 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
1226 }
1227}
1228
1229template <typename... Ps>
1230class ReduceOps
1231{
1232private:
1233
1234 // call_f_box
1235
1236 template <typename D, typename F, int dim>
1237 requires (std::same_as<std::decay_t<decltype(
1238 Reduce::detail::call_f_intvect(std::declval<F const&>(), IntVectND<dim>()))>,
1239 typename D::Type>)
1241 static void call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f) noexcept
1242 {
1243 using ReduceTuple = typename D::Type;
1244 For(box,
1245 [&] (IntVectND<dim> iv) {
1246 auto pr = Reduce::detail::call_f_intvect(f, iv);
1247 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
1248 });
1249 }
1250
1251 template <typename D, typename F, int dim>
1252 requires (std::same_as<std::decay_t<decltype(
1253 std::declval<F const&>()(std::declval<BoxND<dim> const&>()))>,
1254 typename D::Type>)
1256 static void call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f) noexcept
1257 {
1258 using ReduceTuple = typename D::Type;
1259 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1260 }
1261
1262public:
1263
1264 template <FabArrayType MF, typename D, typename F>
1265 requires (IsCallable<F, int, int, int, int>::value)
1266 void eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1267 {
1268 using ReduceTuple = typename D::Type;
1269#ifdef AMREX_USE_OMP
1270#pragma omp parallel
1271#endif
1272 {
1273 ReduceTuple rr;
1274 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1275 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1276 Box const& b = mfi.growntilebox(nghost);
1277 const int li = mfi.LocalIndex();
1278 const auto lo = amrex::lbound(b);
1279 const auto hi = amrex::ubound(b);
1280 for (int k = lo.z; k <= hi.z; ++k) {
1281 for (int j = lo.y; j <= hi.y; ++j) {
1282 for (int i = lo.x; i <= hi.x; ++i) {
1283 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1284 }}}
1285 }
1286 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1287 reduce_data.reference(OpenMP::get_thread_num()), rr);
1288 }
1289 }
1290
1291 template <FabArrayType MF, typename D, typename F>
1292 requires (IsCallable<F, int, int, int, int, int>::value)
1293 void eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1294 {
1295 using ReduceTuple = typename D::Type;
1296#ifdef AMREX_USE_OMP
1297#pragma omp parallel
1298#endif
1299 {
1300 ReduceTuple rr;
1301 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1302 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1303 Box const& b = mfi.growntilebox(nghost);
1304 const int li = mfi.LocalIndex();
1305 const auto lo = amrex::lbound(b);
1306 const auto hi = amrex::ubound(b);
1307 for (int n = 0; n < ncomp; ++n) {
1308 for (int k = lo.z; k <= hi.z; ++k) {
1309 for (int j = lo.y; j <= hi.y; ++j) {
1310 for (int i = lo.x; i <= hi.x; ++i) {
1311 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1312 }}}}
1313 }
1314 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1315 reduce_data.reference(OpenMP::get_thread_num()), rr);
1316 }
1317 }
1318
1319 template <typename D, typename F, int dim>
1320 void eval (BoxND<dim> const& box, D & reduce_data, F&& f)
1321 {
1322 using ReduceTuple = typename D::Type;
1323 ReduceTuple rr;
1324 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1325 call_f_box<D>(box, rr, std::forward<F>(f));
1326 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1327 reduce_data.reference(OpenMP::get_thread_num()), rr);
1328 }
1329
1330 template <std::integral N, typename D, typename F, int dim>
1331 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
1332 {
1333 using ReduceTuple = typename D::Type;
1334 ReduceTuple rr;
1335 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1336 For(box, ncomp,
1337 [&] (IntVectND<dim> iv, int n) {
1338 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
1339 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
1340 });
1341 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1342 reduce_data.reference(OpenMP::get_thread_num()), rr);
1343 }
1344
1345 template <std::integral N, typename D, typename F>
1346 void eval (N n, D & reduce_data, F const& f)
1347 {
1348 using ReduceTuple = typename D::Type;
1349 ReduceTuple rr;
1350 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1351 for (N i = 0; i < n; ++i) {
1352 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1353 }
1354 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1355 reduce_data.reference(OpenMP::get_thread_num()), rr);
1356 }
1357
1358 template <typename D>
1359 typename D::Type value (D & reduce_data)
1360 {
1361 auto& rrv = reduce_data.reference();
1362 if (! m_result_is_ready) {
1363 using ReduceTuple = typename D::Type;
1364 if (rrv.size() > 1) {
1365 for (int i = 1, N = rrv.size(); i < N; ++i) {
1366 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1367 }
1368 }
1369 m_result_is_ready = true;
1370 }
1371 return rrv[0];
1372 }
1373
1374private:
1375 template <typename... T> friend class ReduceData;
1376 bool m_result_is_ready = false;
1377 void resetResultReadiness () { m_result_is_ready = false; }
1378};
1379
1380namespace Reduce {
1381
1382template <typename T, std::integral N, typename F>
1383requires (!std::same_as<T*,std::decay_t<F>>)
1384T Sum (N n, F const& f, T init_val)
1385{
1386 T r = init_val;
1387#ifdef AMREX_USE_OMP
1388#pragma omp parallel for reduction(+:r)
1389#endif
1390 for (N i = 0; i < n; ++i) {
1391 r += f(i);
1392 }
1393 return r;
1394}
1395
1396template <typename T, std::integral N>
1397T Sum (N n, T const* v, T init_val)
1398{
1399 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1400}
1401
1402template <typename T, std::integral N, typename F>
1403requires (!std::same_as<T*,std::decay_t<F>>)
1404T Min (N n, F const& f, T init_val)
1405{
1406 T r = init_val;
1407#ifdef AMREX_USE_OMP
1408#pragma omp parallel for reduction(min:r)
1409#endif
1410 for (N i = 0; i < n; ++i) {
1411 r = std::min(r,f(i));
1412 }
1413 return r;
1414}
1415
1416template <typename T, std::integral N>
1417T Min (N n, T const* v, T init_val)
1418{
1419 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1420}
1421
1422template <typename T, std::integral N, typename F>
1423requires (!std::same_as<T*,std::decay_t<F>>)
1424T Max (N n, F const& f, T init_val)
1425{
1426 T r = init_val;
1427#ifdef AMREX_USE_OMP
1428#pragma omp parallel for reduction(max:r)
1429#endif
1430 for (N i = 0; i < n; ++i) {
1431 r = std::max(r,f(i));
1432 }
1433 return r;
1434}
1435
1436template <typename T, std::integral N>
1437T Max (N n, T const* v, T init_val)
1438{
1439 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1440}
1441
1442template <typename T, std::integral N, typename F>
1443requires (!std::same_as<T*,std::decay_t<F>>)
1444std::pair<T,T> MinMax (N n, F const& f)
1445{
1446 T r_min = std::numeric_limits<T>::max();
1447 T r_max = std::numeric_limits<T>::lowest();
1448#ifdef AMREX_USE_OMP
1449#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1450#endif
1451 for (N i = 0; i < n; ++i) {
1452 T tmp = f(i);
1453 r_min = std::min(r_min,tmp);
1454 r_max = std::max(r_max,tmp);
1455 }
1456 return std::make_pair(r_min,r_max);
1457}
1458
1459template <typename T, std::integral N>
1460std::pair<T,T> MinMax (N n, T const* v)
1461{
1462 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1463}
1464
1465template <typename T, std::integral N, typename P>
1466bool AnyOf (N n, T const* v, P&& pred)
1467{
1468 return std::any_of(v, v+n, std::forward<P>(pred));
1469}
1470
1471template <typename P, int dim>
1472bool AnyOf (BoxND<dim> const& box, P const& pred)
1473{
1474 for (auto iv : box.iterator()) { // NOLINT(readability-use-anyofallof)
1475 if (Reduce::detail::call_f_intvect(pred, iv)) { return true; }
1476 }
1477 return false;
1478}
1479
1480}
1481
1482#endif
1483
1488template <typename... Ts, typename... Ps>
1490constexpr GpuTuple<Ts...>
1492{
1493 GpuTuple<Ts...> r{};
1494 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1495 return r;
1496}
1497
1502template <typename... Ts, typename... Ps>
1504constexpr GpuTuple<Ts...>
1506{
1507 GpuTuple<Ts...> r{};
1508 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1509 return r;
1510}
1511
1513template <typename Ops, typename Ts>
1514class ReducerImpl;
1515
1516template <typename... Ops, typename... Ts>
1517class ReducerImpl<TypeList<Ops...>, TypeList<Ts...>>
1518{
1519public:
1520 static_assert(sizeof...(Ops) > 0);
1521 static_assert(sizeof...(Ts) > 0);
1522 static_assert(sizeof...(Ops) == sizeof...(Ts));
1523
1524 ReducerImpl ()
1525 : m_reduce_data(m_reduce_op)
1526 {}
1527
1528protected:
1529 using Result_t = GpuTuple<Ts...>;
1530 ReduceOps<Ops...> m_reduce_op;
1531 ReduceData<Ts...> m_reduce_data;
1532};
1534
1599template <typename Ops, typename Ts>
1601 : public ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>
1602{
1603 using Base = ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>;
1604public:
1605
1606 using Result_t = typename Base::Result_t;
1607 static constexpr int size = GpuTupleSize<Result_t>::value;
1608
1609 Reducer () = default;
1610 ~Reducer () = default;
1611
1613 Reducer (Reducer const&) = delete;
1614 Reducer (Reducer &&) = delete;
1615 void operator= (Reducer const&) = delete;
1616 void operator= (Reducer &&) = delete;
1618
1632 template <typename F, int dim>
1635 void eval (BoxND<dim> const& box, F&& f)
1636 {
1637 this->m_reduce_op.eval(box, this->m_reduce_data, std::forward<F>(f));
1638 }
1639
1655 template <typename F, int dim>
1657 IsCallable<F, IntVectND<dim>, int>::value)
1658 void eval (BoxND<dim> const& box, int ncomp, F&& f)
1659 {
1660 this->m_reduce_op.eval(box, ncomp, this->m_reduce_data, std::forward<F>(f));
1661 }
1662
1682 template <FabArrayType MF, typename F>
1684 void eval (MF const& mf, IntVect const& nghost, F && f)
1685 {
1686 this->m_reduce_op.eval(mf, nghost, this->m_reduce_data, std::forward<F>(f));
1687 }
1688
1711 template <FabArrayType MF, typename F>
1713 void eval (MF const& mf, IntVect const& nghost, int ncomp, F && f)
1714 {
1715 this->m_reduce_op.eval(mf, nghost, ncomp, this->m_reduce_data, std::forward<F>(f));
1716 }
1717
1730 template <typename N, typename F>
1731 requires (IsCallable<F, N>::value)
1732 void eval (N n, F && f)
1733 {
1734 this->m_reduce_op.eval(n, this->m_reduce_data, std::forward<F>(f));
1735 }
1736
1747 [[nodiscard]] Result_t getResult ()
1748 {
1749 return this->m_reduce_data.value(this->m_reduce_op);
1750 }
1751};
1752
1753}
1754
1755#endif
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_MAX_STREAMS
Definition AMReX_GpuDevice.H:21
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:136
GPU-compatible tuple.
Definition AMReX_Tuple.H:98
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:710
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:837
Cell-Based or Node-Based Indices.
Definition AMReX_IndexType.H:36
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:88
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:172
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
T * data() noexcept
Definition AMReX_PODVector.H:666
Definition AMReX_Reduce.H:438
~ReduceData()
Definition AMReX_Reduce.H:460
int maxStreamIndex() const
Definition AMReX_Reduce.H:500
Type value()
Definition AMReX_Reduce.H:473
void updateMaxStreamIndex(gpuStream_t const &s)
Definition AMReX_Reduce.H:501
int & nBlocks(gpuStream_t const &s)
Definition AMReX_Reduce.H:496
ReduceData(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:443
void markValueCalled() noexcept
Definition AMReX_Reduce.H:505
Type * devicePtr(gpuStream_t const &s)
Definition AMReX_Reduce.H:489
Type value(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:481
Type * devicePtr()
Definition AMReX_Reduce.H:488
GpuArray< int, 8 > & nBlocks()
Definition AMReX_Reduce.H:495
ReduceData(ReduceData< Ts... > const &)=delete
Type * hostPtr()
Definition AMReX_Reduce.H:493
int maxBlocks() const
Definition AMReX_Reduce.H:498
ReduceData(ReduceData< Ts... > &&)=delete
Definition AMReX_Reduce.H:597
void eval(BoxND< dim > const &box, N ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:788
D::Type value(D &reduce_data)
Definition AMReX_Reduce.H:844
void eval(BoxND< dim > const &box, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:782
void eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:731
void eval(N n, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:794
Class for local reductions (e.g., sum, min and max).
Definition AMReX_Reduce.H:1602
Reducer()=default
Result_t getResult()
Get the final reduction result.
Definition AMReX_Reduce.H:1747
typename Base::Result_t Result_t
Reduction result type, GpuTuple<U...>, where U... are the types in Ts.
Definition AMReX_Reduce.H:1606
~Reducer()=default
void eval(BoxND< dim > const &box, F &&f)
Reduction over a Box.
Definition AMReX_Reduce.H:1635
amrex_long Long
Definition AMReX_INT.H:30
T Min(N n, T const *v, T init_val=std::numeric_limits< T >::max())
Compute the minimum of an array of values.
Definition AMReX_Reduce.H:962
bool AnyOf(N n, T const *v, P const &pred)
Test whether any element in an array satisfies a unary predicate.
Definition AMReX_Reduce.H:1036
std::pair< T, T > MinMax(N n, T const *v)
Compute the minimum and maximum of an array of values.
Definition AMReX_Reduce.H:1008
T Max(N n, T const *v, T init_val=std::numeric_limits< T >::lowest())
Compute the maximum of an array of values.
Definition AMReX_Reduce.H:985
T Sum(N n, T const *v, T init_val=0)
Compute the sum of an array of values.
Definition AMReX_Reduce.H:939
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1359
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1289
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
void Sum(T &v, MPI_Comm comm)
Definition AMReX_ParallelReduce.H:221
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:419
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:356
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:553
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:452
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:397
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:505
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:347
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79
__host__ __device__ constexpr GpuTuple< Ts... > IdentityTuple(GpuTuple< Ts... >, ReduceOps< Ps... >) noexcept
Return a GpuTuple containing the identity element for each operation in ReduceOps....
Definition AMReX_Reduce.H:1491
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
typename ToTypeList< T >::type ToTypeList_t
Definition AMReX_TypeList.H:233
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:25
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:45
const int[]
Definition AMReX_BLProfiler.cpp:1664
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
Definition AMReX_Box.H:2170
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2187
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2211
Fixed-size array that can be used on GPU.
Definition AMReX_Array.H:43
Definition AMReX_Tuple.H:127
Definition AMReX_GpuMemory.H:57
T * dataPtr()
Definition AMReX_GpuMemory.H:91
T dataValue() const
Definition AMReX_GpuMemory.H:93
Definition AMReX_GpuLaunch.H:120
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuControl.H:127
Definition AMReX_GpuReduce.H:284
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:208
Definition AMReX_Functional.H:14
Definition AMReX_Reduce.H:375
__host__ __device__ void local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:396
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:399
__device__ void parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:387
Definition AMReX_Reduce.H:404
__device__ void parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:416
__host__ __device__ void local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:425
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:428
Definition AMReX_Reduce.H:341
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:366
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:362
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:353
Definition AMReX_Reduce.H:307
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:332
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:319
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:328
Definition AMReX_Reduce.H:277
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:290
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:299
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:302
Struct for holding types.
Definition AMReX_TypeList.H:13