3#include <AMReX_Config.H>
38template <
typename T, std::
integral N>
39T
Sum (N n, T
const* v, T init_val = 0);
57template <
typename T, std::
integral N,
typename F>
58requires (!std::same_as<T*,std::decay_t<F>>)
59T
Sum (N n,
F const& f, T init_val = 0);
75template <
typename T, std::
integral N>
76T
Min (N n, T
const* v, T init_val = std::numeric_limits<T>::max());
94template <
typename T, std::
integral N,
typename F>
95requires (!std::same_as<T*,std::decay_t<F>>)
96T
Min (N n,
F const& f, T init_val = std::numeric_limits<T>::max());
112template <
typename T, std::
integral N>
113T
Max (N n, T
const* v, T init_val = std::numeric_limits<T>::lowest());
131template <
typename T, std::
integral N,
typename F>
132requires (!std::same_as<T*,std::decay_t<F>>)
133T
Max (N n,
F const& f, T init_val = std::numeric_limits<T>::lowest());
148template <
typename T, std::
integral N>
149std::pair<T,T>
MinMax (N n, T
const* v);
166template <
typename T, std::
integral N,
typename F>
167requires (!std::same_as<T*,std::decay_t<F>>)
168std::pair<T,T>
MinMax (N n,
F const& f);
185template <
typename T, std::
integral N,
typename P>
186bool AnyOf (N n, T
const* v, P
const& pred);
201template <
typename P,
int dim>
207namespace Reduce::detail {
211 template <std::
size_t I,
typename T,
typename P>
213 void for_each_parallel (T& d, T
const& s,
Gpu::Handler const& h)
215 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
218 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
220 void for_each_parallel (T& d, T
const& s, Gpu::Handler
const& h)
222 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
223 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
226 template <std::
size_t I,
typename T,
typename P>
228 void for_each_parallel (T& d, T
const& s)
230 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
233 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
235 void for_each_parallel (T& d, T
const& s)
237 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
238 for_each_parallel<I+1,T,P1,Ps...>(d, s);
243 template <std::
size_t I,
typename T,
typename P>
245 void for_each_local (T& d, T
const& s)
247 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
250 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
252 void for_each_local (T& d, T
const& s)
254 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
255 for_each_local<I+1,T,P1,Ps...>(d, s);
258 template <std::
size_t I,
typename T,
typename P>
260 constexpr void for_each_init (T& t)
262 P().init(amrex::get<I>(t));
265 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
267 constexpr void for_each_init (T& t)
269 P().init(amrex::get<I>(t));
270 for_each_init<I+1,T,P1,Ps...>(t);
281 template <
typename T>
285 if (h.threadIdx() == 0) { d += r; }
288 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
291 T r = Gpu::blockReduceSum<MT>(s);
292 if (threadIdx.x == 0) { d += r; }
297 template <
typename T>
301 template <
typename T>
302 constexpr void init (T& t)
const noexcept { t = 0; }
310 template <
typename T>
314 if (h.threadIdx() == 0) { d =
amrex::min(d,r); }
317 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
320 T r = Gpu::blockReduceMin<MT>(s);
321 if (threadIdx.x == 0) { d =
amrex::min(d,r); }
326 template <
typename T>
330 template <
typename T>
331 requires (std::numeric_limits<T>::is_specialized)
332 constexpr void init (T& t)
const noexcept { t = std::numeric_limits<T>::max(); }
334 template <
typename T>
335 requires (!std::numeric_limits<T>::is_specialized)
336 constexpr void init (T& t)
const noexcept { t = T::max(); }
344 template <
typename T>
348 if (h.threadIdx() == 0) { d =
amrex::max(d,r); }
351 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
354 T r = Gpu::blockReduceMax<MT>(s);
355 if (threadIdx.x == 0) { d =
amrex::max(d,r); }
360 template <
typename T>
364 template <
typename T>
365 requires (std::numeric_limits<T>::is_specialized)
366 constexpr void init (T& t)
const noexcept { t = std::numeric_limits<T>::lowest(); }
368 template <
typename T>
369 requires (!std::numeric_limits<T>::is_specialized)
370 constexpr void init (T& t)
const noexcept { t = T::lowest(); }
378 template <std::
integral T>
382 if (h.threadIdx() == 0) { d = d && r; }
385 template <std::
integral T,
int MT=AMREX_GPU_MAX_THREADS>
388 T r = Gpu::blockReduceLogicalAnd<MT>(s);
389 if (threadIdx.x == 0) { d = d && r; }
394 template <std::
integral T>
398 template <std::
integral T>
399 constexpr void init (T& t)
const noexcept { t =
true; }
407 template <std::
integral T>
411 if (h.threadIdx() == 0) { d = d || r; }
414 template <std::
integral T,
int MT=AMREX_GPU_MAX_THREADS>
417 T r = Gpu::blockReduceLogicalOr<MT>(s);
418 if (threadIdx.x == 0) { d = d || r; }
423 template <std::
integral T>
427 template <std::
integral T>
428 constexpr void init (T& t)
const noexcept { t =
false; }
431template <
typename... Ps>
class ReduceOps;
436template <
typename... Ts>
442 template <
typename... Ps>
444 : m_max_blocks(
Gpu::
Device::maxBlocksPerLaunch()),
447 * m_max_blocks * sizeof(
Type)))),
448 m_fn_value([&reduce_op,this] () ->
Type { return this->
value(reduce_op); })
450 reduce_op.resetResultReadiness();
451 static_assert(std::is_trivially_copyable<Type>(),
452 "ReduceData::Type must be trivially copyable");
453 static_assert(std::is_trivially_destructible<Type>(),
454 "ReduceData::Type must be trivially destructible");
456 new (m_host_tuple) Type();
462 !m_used_external_stream || m_value_called,
463 "ReduceData used on an external GPU stream must call value() before destruction.");
475 Type r = m_fn_value();
476 m_value_called =
true;
480 template <
typename... Ps>
484 m_value_called =
true;
490 return m_device_tuple+streamIndexChecked(s)*m_max_blocks;
502 m_max_stream_index = std::max(m_max_stream_index,streamIndexChecked(s));
513 if (m_stream_index_zero_set) {
515 "ReduceData cannot be reused across different external GPU streams "
516 "or between an external GPU stream and AMReX stream 0.");
518 m_stream_index_zero = s;
519 m_stream_index_zero_set =
true;
526 int m_max_stream_index = 0;
527 Type* m_host_tuple =
nullptr;
528 Type* m_device_tuple =
nullptr;
529 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
531 bool m_stream_index_zero_set =
false;
532 bool m_used_external_stream =
false;
533 bool m_value_called =
false;
534 std::function<Type()> m_fn_value;
538namespace Reduce::detail {
542 template <
typename F,
int dim>
544 auto call_f_intvect_box (F
const& f, IntVectND<dim> iv, IndexTypeND<dim>)
noexcept ->
545 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
547 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
550 template <
typename F,
int dim>
552 auto call_f_intvect_box (F
const& f, IntVectND<dim> iv, IndexTypeND<dim> t)
noexcept ->
553 decltype(f(BoxND<dim>(iv, iv, t)))
555 return f(BoxND<dim>(iv, iv, t));
559 template <
typename F,
typename T,
int dim>
561 auto call_f_intvect_n (F
const& f, IntVectND<dim> iv, T n)
noexcept ->
562 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
564 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
569 struct iterate_box {};
570 struct iterate_box_comp {};
572 template <
typename I,
typename F,
typename T,
typename... Ps>
573 requires (std::same_as<iterate_box,I>)
575 void mf_call_f (F
const& f,
int ibox,
int i,
int j,
int k,
int, T& r)
noexcept
577 auto const& pr = f(ibox,i,j,k);
578 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
581 template <
typename I,
typename F,
typename T,
typename... Ps>
582 requires (std::same_as<iterate_box_comp,I>)
584 void mf_call_f (F
const& f,
int ibox,
int i,
int j,
int k,
int ncomp, T& r)
noexcept
586 for (
int n = 0; n < ncomp; ++n) {
587 auto const& pr = f(ibox,i,j,k,n);
588 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
595template <
typename... Ps>
603 template <
typename I,
typename MF,
typename D,
typename F>
604 void eval_mf (I, MF
const& mf,
IntVect const& nghost,
int ncomp, D& reduce_data,
F const& f)
606 using ReduceTuple =
typename D::Type;
607 const int nboxes = mf.local_size();
609 auto const& parforinfo = mf.getParForInfo(nghost);
610 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
612 const int nblocks = nblocks_per_box * nboxes;
613 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
615 auto const& stream = Gpu::gpuStream();
616 auto pdst = reduce_data.devicePtr(stream);
617 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
619 int& nblocks_ref = reduce_data.nBlocks(stream);
620 auto old_nblocks =
static_cast<unsigned int>(nblocks_ref);
621 nblocks_ref =
amrex::max(nblocks_ref, nblocks_ec);
622 reduce_data.updateMaxStreamIndex(stream);
626 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
627 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
630 Dim1 blockIdx {gh.blockIdx()};
631 Dim1 threadIdx{gh.threadIdx()};
633 amrex::launch_global<AMREX_GPU_MAX_THREADS>
634 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
639 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
640 ReduceTuple& dst =
pdst[blockIdx.x];
641 if (threadIdx.x == 0 && blockIdx.x >= old_nblocks) {
644 for (
int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
645 int ibox = iblock / nblocks_per_box;
646 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
649 if (icell < indexer.
numPts()) {
650 auto [i, j, k] = indexer(icell);
651 Reduce::detail::mf_call_f<I,
F, ReduceTuple, Ps...>
652 (f, ibox, i, j, k, ncomp, r);
656 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
658 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
665 template <
typename I,
int dim,
typename D,
typename F>
666 void eval_box (I,
BoxND<dim> const& box,
int ncomp, D& reduce_data,
F const& f)
668 using ReduceTuple =
typename D::Type;
669 auto const& stream = Gpu::gpuStream();
670 auto dp = reduce_data.devicePtr(stream);
671 int& nblocks = reduce_data.nBlocks(stream);
674 constexpr int nitems_per_thread = 4;
675 Long nblocks_ec = (box.
numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
676 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
677 nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
678 reduce_data.updateMaxStreamIndex(stream);
681 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
682 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
685 Dim1 blockIdx {gh.blockIdx()};
686 Dim1 threadIdx{gh.threadIdx()};
687 Dim1 gridDim {gh.gridDim()};
689 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
694 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
695 ReduceTuple& dst = *(dp+blockIdx.x);
696 if (threadIdx.x == 0 &&
static_cast<int>(blockIdx.x) >= nblocks) {
699 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
700 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
704 auto iv = indexer.
intVect(icell);
706 if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
707 auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
708 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
710 for (
int n = 0; n < ncomp; ++n) {
711 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
712 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
717 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
719 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
722 nblocks = std::max(nblocks,
static_cast<int>(nblocks_ec));
727 template <FabArrayType MF,
typename D,
typename F>
728#ifndef AMREX_USE_CUDA
731 void eval (MF
const& mf,
IntVect const& nghost, D& reduce_data, F&& f)
733 using ReduceTuple =
typename D::Type;
734 const int nboxes = mf.local_size();
737 }
else if (!mf.isFusingCandidate()) {
740 const int li = mfi.LocalIndex();
741 this->eval(b, reduce_data,
744 return f(li, i, j, k);
748 eval_mf(Reduce::detail::iterate_box{},
749 mf, nghost, 0, reduce_data, std::forward<F>(f));
753 template <FabArrayType MF,
typename D,
typename F>
754#ifndef AMREX_USE_CUDA
757 void eval (MF
const& mf,
IntVect const& nghost,
int ncomp, D& reduce_data, F&& f)
759 using ReduceTuple =
typename D::Type;
761 const int nboxes = mf.local_size();
765 }
else if (!mf.isFusingCandidate()) {
768 const int li = mfi.LocalIndex();
769 this->eval(b, ncomp, reduce_data,
772 return f(li, i, j, k, n);
776 eval_mf(Reduce::detail::iterate_box_comp{},
777 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
781 template <
typename D,
typename F,
int dim>
784 eval_box(Reduce::detail::iterate_box{}, box, 0, reduce_data, f);
787 template <std::
integral N,
typename D,
typename F,
int dim>
790 eval_box(Reduce::detail::iterate_box_comp{}, box, ncomp, reduce_data, f);
793 template <std::
integral N,
typename D,
typename F>
794 void eval (N n, D & reduce_data, F
const& f)
796 if (n <= 0) {
return; }
797 using ReduceTuple =
typename D::Type;
798 auto const& stream = Gpu::gpuStream();
799 auto dp = reduce_data.devicePtr(stream);
800 int& nblocks = reduce_data.nBlocks(stream);
801 constexpr int nitems_per_thread = 4;
802 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
803 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
804 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
805 reduce_data.updateMaxStreamIndex(stream);
808 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
809 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
812 Dim1 blockIdx {gh.blockIdx()};
813 Dim1 threadIdx{gh.threadIdx()};
814 Dim1 gridDim {gh.gridDim()};
816 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
821 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
822 ReduceTuple& dst = *(dp+blockIdx.x);
823 if (threadIdx.x == 0 &&
static_cast<int>(blockIdx.x) >= nblocks) {
826 for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
827 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
832 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
835 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
837 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
843 template <
typename D>
844 typename D::Type
value (D & reduce_data)
846 auto hp = reduce_data.hostPtr();
848 if (m_result_is_ready) {
849 reduce_data.markValueCalled();
853 using ReduceTuple =
typename D::Type;
854 auto const& stream = Gpu::gpuStream();
855 auto dp = reduce_data.devicePtr();
856 auto const& nblocks = reduce_data.nBlocks();
857#if defined(AMREX_USE_SYCL)
858 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
859 const int N = nblocks[0];
861 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
864 Gpu::dtoh_memcpy_async(tmp.
data(), dp,
sizeof(ReduceTuple)*N);
865 Gpu::streamSynchronize();
866 for (
int i = 1; i < N; ++i) {
867 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
874 int maxblocks = reduce_data.maxBlocks();
877 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
878#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
881 auto presult = dtmp.
data();
885 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
889 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
891 for (
int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
892 auto dp_stream = dp+istream*maxblocks;
893 for (
int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
894 i < nblocks[istream]; i += stride) {
895 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
898 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
899 if (gh.threadIdx() == 0) { *presult = dst; }
901#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
902 Gpu::dtoh_memcpy_async(hp, dtmp.
data(),
sizeof(ReduceTuple));
905 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
909 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
911 for (
int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
912 auto dp_stream = dp+istream*maxblocks;
913 for (
int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
914 i < nblocks[istream]; i += stride) {
915 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
918 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
919 if (threadIdx.x == 0) { *hp = dst; }
922 Gpu::streamSynchronize();
925 m_result_is_ready =
true;
926 reduce_data.markValueCalled();
932 bool m_result_is_ready =
false;
933 void resetResultReadiness () { m_result_is_ready =
false; }
938template <
typename T, std::
integral N>
939T Sum (N n, T
const* v, T init_val)
943 using ReduceTuple =
typename decltype(reduce_data)::Type;
945 ReduceTuple hv = reduce_data.
value(reduce_op);
946 return amrex::get<0>(hv) + init_val;
949template <
typename T, std::
integral N,
typename F>
950requires (!std::same_as<T*,std::decay_t<F>>)
951T Sum (N n, F
const& f, T init_val)
955 using ReduceTuple =
typename decltype(reduce_data)::Type;
957 ReduceTuple hv = reduce_data.
value(reduce_op);
958 return amrex::get<0>(hv) + init_val;
961template <
typename T, std::
integral N>
962T Min (N n, T
const* v, T init_val)
966 using ReduceTuple =
typename decltype(reduce_data)::Type;
968 ReduceTuple hv = reduce_data.
value(reduce_op);
969 return std::min(amrex::get<0>(hv),init_val);
972template <
typename T, std::
integral N,
typename F>
973requires (!std::same_as<T*,std::decay_t<F>>)
974T Min (N n, F
const& f, T init_val)
978 using ReduceTuple =
typename decltype(reduce_data)::Type;
980 ReduceTuple hv = reduce_data.
value(reduce_op);
981 return std::min(amrex::get<0>(hv),init_val);
984template <
typename T, std::
integral N>
985T Max (N n, T
const* v, T init_val)
989 using ReduceTuple =
typename decltype(reduce_data)::Type;
991 ReduceTuple hv = reduce_data.
value(reduce_op);
992 return std::max(amrex::get<0>(hv),init_val);
995template <
typename T, std::
integral N,
typename F>
996requires (!std::same_as<T*,std::decay_t<F>>)
997T Max (N n, F
const& f, T init_val)
1001 using ReduceTuple =
typename decltype(reduce_data)::Type;
1003 ReduceTuple hv = reduce_data.
value(reduce_op);
1004 return std::max(amrex::get<0>(hv),init_val);
1007template <
typename T, std::
integral N>
1012 using ReduceTuple =
typename decltype(reduce_data)::Type;
1016 auto hv = reduce_data.
value(reduce_op);
1017 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1020template <
typename T, std::
integral N,
typename F>
1021requires (!std::same_as<T*,std::decay_t<F>>)
1022std::pair<T,T> MinMax (N n, F
const& f)
1026 using ReduceTuple =
typename decltype(reduce_data)::Type;
1031 auto hv = reduce_data.
value(reduce_op);
1032 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1035template <
typename T, std::
integral N,
typename P>
1036bool AnyOf (N n, T
const* v, P
const& pred)
1042 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1044#ifdef AMREX_USE_SYCL
1045 const int num_ints = std::max(Gpu::Device::warp_size,
int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1046 const std::size_t shared_mem_bytes = num_ints*
sizeof(
int);
1047 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1049 int* has_any = &(
static_cast<int*
>(gh.sharedMemory())[num_ints-1]);
1050 if (gh.threadIdx() == 0) { *has_any = *dp; }
1056 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
1057 i < n && !r; i += stride)
1059 r = pred(v[i]) ? 1 : 0;
1062 r = Gpu::blockReduce<Gpu::Device::warp_size>
1064 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1068 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
1070 __shared__
int has_any;
1071 if (threadIdx.x == 0) { has_any = *dp; }
1077 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
1078 i < n && !r; i += stride)
1080 r = pred(v[i]) ? 1 : 0;
1082 r = Gpu::blockReduce<Gpu::Device::warp_size>
1084 if (threadIdx.x == 0 && r) *dp = 1;
1091template <
typename P,
int dim>
1099 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1101#ifdef AMREX_USE_SYCL
1102 const int num_ints = std::max(Gpu::Device::warp_size,
int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1103 const std::size_t shared_mem_bytes = num_ints*
sizeof(
int);
1104 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1106 int* has_any = &(
static_cast<int*
>(gh.sharedMemory())[num_ints-1]);
1107 if (gh.threadIdx() == 0) { *has_any = *dp; }
1113 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
1114 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
1115 icell < indexer.
numPts() && !r;
1118 auto iv = indexer.
intVect(icell);
1119 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1121 r = Gpu::blockReduce<Gpu::Device::warp_size>
1123 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1130 __shared__
int has_any;
1131 if (threadIdx.x == 0) { has_any = *dp; }
1137 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
1138 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
1139 icell < indexer.
numPts() && !r;
1142 auto iv = indexer.
intVect(icell);
1143 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1145 r = Gpu::blockReduce<Gpu::Device::warp_size>
1147 if (threadIdx.x == 0 && r) *dp = 1;
1158template <
typename... Ts>
1162 using Type = GpuTuple<Ts...>;
1164 template <
typename... Ps>
1165 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
1166 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1167 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1169 reduce_op.resetResultReadiness();
1170 for (
auto& t : m_tuple) {
1171 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1175 ~ReduceData () =
default;
1176 ReduceData (ReduceData<Ts...>
const&) =
delete;
1177 ReduceData (ReduceData<Ts...> &&) =
delete;
1178 void operator= (ReduceData<Ts...>
const&) =
delete;
1179 void operator= (ReduceData<Ts...> &&) =
delete;
1181 Type value () {
return m_fn_value(); }
1183 template <
typename... Ps>
1184 Type value (ReduceOps<Ps...>& reduce_op)
1186 return reduce_op.value(*
this);
1189 Vector<Type>& reference () {
return m_tuple; }
1191 Type& reference (
int tid)
1193 if (m_tuple.size() == 1) {
1197 return m_tuple[tid];
1202 Vector<Type> m_tuple;
1203 std::function<Type()> m_fn_value;
1206namespace Reduce::detail {
1210 template <
typename F,
int dim>
1212 auto call_f_intvect (F
const& f, IntVectND<dim> iv)
noexcept ->
1213 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
1215 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
1220 template <
typename F,
typename T,
int dim>
1222 auto call_f_intvect_n (F
const& f, IntVectND<dim> iv, T n)
noexcept ->
1223 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
1225 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
1229template <
typename... Ps>
1236 template <
typename D,
typename F,
int dim>
1237 requires (std::same_as<std::decay_t<
decltype(
1238 Reduce::detail::call_f_intvect(std::declval<F const&>(), IntVectND<dim>()))>,
1241 static void call_f_box (BoxND<dim>
const& box,
typename D::Type & r, F
const& f)
noexcept
1243 using ReduceTuple =
typename D::Type;
1245 [&] (IntVectND<dim> iv) {
1246 auto pr = Reduce::detail::call_f_intvect(f, iv);
1247 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
1251 template <
typename D,
typename F,
int dim>
1252 requires (std::same_as<std::decay_t<
decltype(
1253 std::declval<F const&>()(std::declval<BoxND<dim>
const&>()))>,
1256 static void call_f_box (BoxND<dim>
const& box,
typename D::Type & r, F
const& f)
noexcept
1258 using ReduceTuple =
typename D::Type;
1259 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1264 template <FabArrayType MF,
typename D,
typename F>
1265 requires (IsCallable<F, int, int, int, int>::value)
1266 void eval (MF
const& mf, IntVect
const& nghost, D & reduce_data, F
const& f)
1268 using ReduceTuple =
typename D::Type;
1274 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1275 for (MFIter mfi(mf,
true); mfi.isValid(); ++mfi) {
1276 Box const& b = mfi.growntilebox(nghost);
1277 const int li = mfi.LocalIndex();
1280 for (
int k = lo.z; k <= hi.z; ++k) {
1281 for (
int j = lo.y; j <= hi.y; ++j) {
1282 for (
int i = lo.x; i <= hi.x; ++i) {
1283 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1286 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1287 reduce_data.reference(OpenMP::get_thread_num()), rr);
1291 template <FabArrayType MF,
typename D,
typename F>
1292 requires (IsCallable<F, int, int, int, int, int>::value)
1293 void eval (MF
const& mf, IntVect
const& nghost,
int ncomp, D & reduce_data, F
const& f)
1295 using ReduceTuple =
typename D::Type;
1301 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1302 for (MFIter mfi(mf,
true); mfi.isValid(); ++mfi) {
1303 Box const& b = mfi.growntilebox(nghost);
1304 const int li = mfi.LocalIndex();
1307 for (
int n = 0; n < ncomp; ++n) {
1308 for (
int k = lo.z; k <= hi.z; ++k) {
1309 for (
int j = lo.y; j <= hi.y; ++j) {
1310 for (
int i = lo.x; i <= hi.x; ++i) {
1311 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1314 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1315 reduce_data.reference(OpenMP::get_thread_num()), rr);
1319 template <
typename D,
typename F,
int dim>
1320 void eval (BoxND<dim>
const& box, D & reduce_data, F&& f)
1322 using ReduceTuple =
typename D::Type;
1324 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1325 call_f_box<D>(box, rr, std::forward<F>(f));
1326 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1327 reduce_data.reference(OpenMP::get_thread_num()), rr);
1330 template <std::
integral N,
typename D,
typename F,
int dim>
1331 void eval (BoxND<dim>
const& box, N ncomp, D & reduce_data, F
const& f)
1333 using ReduceTuple =
typename D::Type;
1335 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1337 [&] (IntVectND<dim> iv,
int n) {
1338 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
1339 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
1341 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1342 reduce_data.reference(OpenMP::get_thread_num()), rr);
1345 template <std::
integral N,
typename D,
typename F>
1346 void eval (N n, D & reduce_data, F
const& f)
1348 using ReduceTuple =
typename D::Type;
1350 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1351 for (N i = 0; i < n; ++i) {
1352 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1354 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1355 reduce_data.reference(OpenMP::get_thread_num()), rr);
1358 template <
typename D>
1359 typename D::Type value (D & reduce_data)
1361 auto& rrv = reduce_data.reference();
1362 if (! m_result_is_ready) {
1363 using ReduceTuple =
typename D::Type;
1364 if (rrv.size() > 1) {
1365 for (
int i = 1, N = rrv.size(); i < N; ++i) {
1366 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1369 m_result_is_ready =
true;
1375 template <
typename... T>
friend class ReduceData;
1376 bool m_result_is_ready =
false;
1377 void resetResultReadiness () { m_result_is_ready =
false; }
1382template <
typename T, std::
integral N,
typename F>
1383requires (!std::same_as<T*,std::decay_t<F>>)
1384T Sum (N n, F
const& f, T init_val)
1388#pragma omp parallel for reduction(+:r)
1390 for (N i = 0; i < n; ++i) {
1396template <
typename T, std::
integral N>
1397T
Sum (N n, T
const* v, T init_val)
1399 return Sum(n, [=] (N i) -> T {
return v[i]; }, init_val);
1402template <
typename T, std::
integral N,
typename F>
1403requires (!std::same_as<T*,std::decay_t<F>>)
1404T Min (N n, F
const& f, T init_val)
1408#pragma omp parallel for reduction(min:r)
1410 for (N i = 0; i < n; ++i) {
1411 r = std::min(r,f(i));
1416template <
typename T, std::
integral N>
1417T
Min (N n, T
const* v, T init_val)
1419 return Reduce::Min(n, [=] (N i) -> T {
return v[i]; }, init_val);
1422template <
typename T, std::
integral N,
typename F>
1423requires (!std::same_as<T*,std::decay_t<F>>)
1424T Max (N n, F
const& f, T init_val)
1428#pragma omp parallel for reduction(max:r)
1430 for (N i = 0; i < n; ++i) {
1431 r = std::max(r,f(i));
1436template <
typename T, std::
integral N>
1437T
Max (N n, T
const* v, T init_val)
1439 return Reduce::Max(n, [=] (N i) -> T {
return v[i]; }, init_val);
1442template <
typename T, std::
integral N,
typename F>
1443requires (!std::same_as<T*,std::decay_t<F>>)
1444std::pair<T,T> MinMax (N n, F
const& f)
1446 T r_min = std::numeric_limits<T>::max();
1447 T r_max = std::numeric_limits<T>::lowest();
1449#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1451 for (N i = 0; i < n; ++i) {
1453 r_min = std::min(r_min,tmp);
1454 r_max = std::max(r_max,tmp);
1456 return std::make_pair(r_min,r_max);
1459template <
typename T, std::
integral N>
1460std::pair<T,T>
MinMax (N n, T
const* v)
1462 return Reduce::MinMax<T>(n, [=] (N i) -> T {
return v[i]; });
1465template <
typename T, std::
integral N,
typename P>
1466bool AnyOf (N n, T
const* v, P&& pred)
1468 return std::any_of(v, v+n, std::forward<P>(pred));
1471template <
typename P,
int dim>
1472bool AnyOf (BoxND<dim>
const& box, P
const& pred)
1474 for (
auto iv : box.iterator()) {
1475 if (Reduce::detail::call_f_intvect(pred, iv)) {
return true; }
1488template <
typename... Ts,
typename... Ps>
1490constexpr GpuTuple<Ts...>
1494 Reduce::detail::for_each_init<0,
decltype(r), Ps...>(r);
1502template <
typename... Ts,
typename... Ps>
1504constexpr GpuTuple<Ts...>
1508 Reduce::detail::for_each_init<0,
decltype(r), Ps...>(r);
1513template <
typename Ops,
typename Ts>
1516template <
typename... Ops,
typename... Ts>
1517class ReducerImpl<TypeList<Ops...>, TypeList<Ts...>>
1520 static_assert(
sizeof...(Ops) > 0);
1521 static_assert(
sizeof...(Ts) > 0);
1522 static_assert(
sizeof...(Ops) ==
sizeof...(Ts));
1525 : m_reduce_data(m_reduce_op)
1529 using Result_t = GpuTuple<Ts...>;
1530 ReduceOps<Ops...> m_reduce_op;
1531 ReduceData<Ts...> m_reduce_data;
1599template <
typename Ops,
typename Ts>
1601 :
public ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>
1615 void operator= (
Reducer const&) =
delete;
1616 void operator= (
Reducer &&) =
delete;
1632 template <
typename F,
int dim>
1637 this->m_reduce_op.
eval(box, this->m_reduce_data, std::forward<F>(f));
1655 template <
typename F,
int dim>
1660 this->m_reduce_op.eval(box, ncomp, this->m_reduce_data, std::forward<F>(f));
1682 template <FabArrayType MF,
typename F>
1684 void eval (MF
const& mf,
IntVect const& nghost, F && f)
1686 this->m_reduce_op.eval(mf, nghost, this->m_reduce_data, std::forward<F>(f));
1711 template <FabArrayType MF,
typename F>
1713 void eval (MF
const& mf,
IntVect const& nghost,
int ncomp, F && f)
1715 this->m_reduce_op.eval(mf, nghost, ncomp, this->m_reduce_data, std::forward<F>(f));
1730 template <
typename N,
typename F>
1732 void eval (N n, F && f)
1734 this->m_reduce_op.eval(n, this->m_reduce_data, std::forward<F>(f));
1749 return this->m_reduce_data.value(this->m_reduce_op);
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_MAX_STREAMS
Definition AMReX_GpuDevice.H:21
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:136
GPU-compatible tuple.
Definition AMReX_Tuple.H:98
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:710
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:837
Cell-Based or Node-Based Indices.
Definition AMReX_IndexType.H:36
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:88
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:172
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
T * data() noexcept
Definition AMReX_PODVector.H:666
Definition AMReX_Reduce.H:438
~ReduceData()
Definition AMReX_Reduce.H:460
int maxStreamIndex() const
Definition AMReX_Reduce.H:500
Type value()
Definition AMReX_Reduce.H:473
void updateMaxStreamIndex(gpuStream_t const &s)
Definition AMReX_Reduce.H:501
int & nBlocks(gpuStream_t const &s)
Definition AMReX_Reduce.H:496
ReduceData(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:443
void markValueCalled() noexcept
Definition AMReX_Reduce.H:505
Type * devicePtr(gpuStream_t const &s)
Definition AMReX_Reduce.H:489
Type value(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:481
Type * devicePtr()
Definition AMReX_Reduce.H:488
GpuArray< int, 8 > & nBlocks()
Definition AMReX_Reduce.H:495
ReduceData(ReduceData< Ts... > const &)=delete
Type * hostPtr()
Definition AMReX_Reduce.H:493
int maxBlocks() const
Definition AMReX_Reduce.H:498
ReduceData(ReduceData< Ts... > &&)=delete
Definition AMReX_Reduce.H:597
void eval(BoxND< dim > const &box, N ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:788
D::Type value(D &reduce_data)
Definition AMReX_Reduce.H:844
void eval(BoxND< dim > const &box, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:782
void eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:731
void eval(N n, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:794
Class for local reductions (e.g., sum, min and max).
Definition AMReX_Reduce.H:1602
Result_t getResult()
Get the final reduction result.
Definition AMReX_Reduce.H:1747
typename Base::Result_t Result_t
Reduction result type, GpuTuple<U...>, where U... are the types in Ts.
Definition AMReX_Reduce.H:1606
void eval(BoxND< dim > const &box, F &&f)
Reduction over a Box.
Definition AMReX_Reduce.H:1635
amrex_long Long
Definition AMReX_INT.H:30
T Min(N n, T const *v, T init_val=std::numeric_limits< T >::max())
Compute the minimum of an array of values.
Definition AMReX_Reduce.H:962
bool AnyOf(N n, T const *v, P const &pred)
Test whether any element in an array satisfies a unary predicate.
Definition AMReX_Reduce.H:1036
std::pair< T, T > MinMax(N n, T const *v)
Compute the minimum and maximum of an array of values.
Definition AMReX_Reduce.H:1008
T Max(N n, T const *v, T init_val=std::numeric_limits< T >::lowest())
Compute the maximum of an array of values.
Definition AMReX_Reduce.H:985
T Sum(N n, T const *v, T init_val=0)
Compute the sum of an array of values.
Definition AMReX_Reduce.H:939
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1359
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1289
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
void Sum(T &v, MPI_Comm comm)
Definition AMReX_ParallelReduce.H:221
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:419
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:356
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:553
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:452
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:397
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:505
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:347
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79
__host__ __device__ constexpr GpuTuple< Ts... > IdentityTuple(GpuTuple< Ts... >, ReduceOps< Ps... >) noexcept
Return a GpuTuple containing the identity element for each operation in ReduceOps....
Definition AMReX_Reduce.H:1491
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
typename ToTypeList< T >::type ToTypeList_t
Definition AMReX_TypeList.H:233
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:25
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:45
const int[]
Definition AMReX_BLProfiler.cpp:1664
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
Definition AMReX_Box.H:2170
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2187
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2211
Fixed-size array that can be used on GPU.
Definition AMReX_Array.H:43
Definition AMReX_Tuple.H:127
Definition AMReX_GpuMemory.H:57
T * dataPtr()
Definition AMReX_GpuMemory.H:91
T dataValue() const
Definition AMReX_GpuMemory.H:93
Definition AMReX_GpuLaunch.H:120
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuControl.H:127
Definition AMReX_GpuReduce.H:284
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:208
Definition AMReX_Functional.H:14
Definition AMReX_Reduce.H:375
__host__ __device__ void local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:396
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:399
__device__ void parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:387
Definition AMReX_Reduce.H:404
__device__ void parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:416
__host__ __device__ void local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:425
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:428
Definition AMReX_Reduce.H:341
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:366
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:362
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:353
Definition AMReX_Reduce.H:307
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:332
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:319
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:328
Definition AMReX_Reduce.H:277
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:290
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:299
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:302
Struct for holding types.
Definition AMReX_TypeList.H:13