3#include <AMReX_Config.H>
36template <
typename T,
typename N,
37 std::enable_if_t<std::is_integral_v<N>,
int> = 0>
38T
Sum (N n, T
const* v, T init_val = 0);
56template <
typename T,
typename N,
typename F,
57 std::enable_if_t<std::is_integral_v<N> &&
58 !std::is_same_v<T*,std::decay_t<F>>,
int> = 0>
59T
Sum (N n,
F const& f, T init_val = 0);
75template <
typename T,
typename N,
76 std::enable_if_t<std::is_integral_v<N>,
int> = 0>
77T
Min (N n, T
const* v, T init_val = std::numeric_limits<T>::max());
95template <
typename T,
typename N,
typename F,
96 std::enable_if_t<std::is_integral_v<N> &&
97 !std::is_same_v<T*,std::decay_t<F>>,
int> = 0>
98T
Min (N n,
F const& f, T init_val = std::numeric_limits<T>::max());
114template <
typename T,
typename N,
115 std::enable_if_t<std::is_integral_v<N>,
int> = 0>
116T
Max (N n, T
const* v, T init_val = std::numeric_limits<T>::lowest());
134template <
typename T,
typename N,
typename F,
135 std::enable_if_t<std::is_integral_v<N> &&
136 !std::is_same_v<T*,std::decay_t<F>>,
int> = 0>
137T
Max (N n,
F const& f, T init_val = std::numeric_limits<T>::lowest());
152template <
typename T,
typename N,
153 std::enable_if_t<std::is_integral_v<N>,
int> = 0>
154std::pair<T,T>
MinMax (N n, T
const* v);
171template <
typename T,
typename N,
typename F,
172 std::enable_if_t<std::is_integral_v<N> &&
173 !std::is_same_v<T*,std::decay_t<F>>,
int> = 0>
174std::pair<T,T>
MinMax (N n,
F const& f);
191template <
typename T,
typename N,
typename P,
192 std::enable_if_t<std::is_integral_v<N>,
int> = 0>
193bool AnyOf (N n, T
const* v, P
const& pred);
208template <
typename P,
int dim>
214namespace Reduce::detail {
218 template <std::
size_t I,
typename T,
typename P>
220 void for_each_parallel (T& d, T
const& s,
Gpu::Handler const& h)
222 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
225 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
227 void for_each_parallel (T& d, T
const& s, Gpu::Handler
const& h)
229 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
230 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
233 template <std::
size_t I,
typename T,
typename P>
235 void for_each_parallel (T& d, T
const& s)
237 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
240 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
242 void for_each_parallel (T& d, T
const& s)
244 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
245 for_each_parallel<I+1,T,P1,Ps...>(d, s);
250 template <std::
size_t I,
typename T,
typename P>
252 void for_each_local (T& d, T
const& s)
254 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
257 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
259 void for_each_local (T& d, T
const& s)
261 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
262 for_each_local<I+1,T,P1,Ps...>(d, s);
265 template <std::
size_t I,
typename T,
typename P>
267 constexpr void for_each_init (T& t)
269 P().init(amrex::get<I>(t));
272 template <std::size_t I,
typename T,
typename P,
typename P1,
typename... Ps>
274 constexpr void for_each_init (T& t)
276 P().init(amrex::get<I>(t));
277 for_each_init<I+1,T,P1,Ps...>(t);
288 template <
typename T>
292 if (h.threadIdx() == 0) { d += r; }
295 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
298 T r = Gpu::blockReduceSum<MT>(s);
299 if (threadIdx.x == 0) { d += r; }
304 template <
typename T>
308 template <
typename T>
309 constexpr void init (T& t)
const noexcept { t = 0; }
317 template <
typename T>
321 if (h.threadIdx() == 0) { d =
amrex::min(d,r); }
324 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
327 T r = Gpu::blockReduceMin<MT>(s);
328 if (threadIdx.x == 0) { d =
amrex::min(d,r); }
333 template <
typename T>
337 template <
typename T>
338 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
339 init (T& t)
const noexcept { t = std::numeric_limits<T>::max(); }
341 template <
typename T>
342 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
343 init (T& t)
const noexcept { t = T::max(); }
351 template <
typename T>
355 if (h.threadIdx() == 0) { d =
amrex::max(d,r); }
358 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
361 T r = Gpu::blockReduceMax<MT>(s);
362 if (threadIdx.x == 0) { d =
amrex::max(d,r); }
367 template <
typename T>
371 template <
typename T>
372 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
373 init (T& t)
const noexcept { t = std::numeric_limits<T>::lowest(); }
375 template <
typename T>
376 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
377 init (T& t)
const noexcept { t = T::lowest(); }
385 template <
typename T>
387 std::enable_if_t<std::is_integral_v<T>>
390 if (h.threadIdx() == 0) { d = d && r; }
393 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
395 std::enable_if_t<std::is_integral_v<T>>
397 T r = Gpu::blockReduceLogicalAnd<MT>(s);
398 if (threadIdx.x == 0) { d = d && r; }
403 template <
typename T>
405 std::enable_if_t<std::is_integral_v<T>>
408 template <
typename T>
409 constexpr std::enable_if_t<std::is_integral_v<T>>
410 init (T& t)
const noexcept { t =
true; }
418 template <
typename T>
420 std::enable_if_t<std::is_integral_v<T>>
423 if (h.threadIdx() == 0) { d = d || r; }
426 template <
typename T,
int MT=AMREX_GPU_MAX_THREADS>
428 std::enable_if_t<std::is_integral_v<T>>
430 T r = Gpu::blockReduceLogicalOr<MT>(s);
431 if (threadIdx.x == 0) { d = d || r; }
436 template <
typename T>
438 std::enable_if_t<std::is_integral_v<T>>
441 template <
typename T>
442 constexpr std::enable_if_t<std::is_integral_v<T>>
443 init (T& t)
const noexcept { t =
false; }
446template <
typename... Ps>
class ReduceOps;
451template <
typename... Ts>
457 template <
typename... Ps>
459 : m_max_blocks(
Gpu::
Device::maxBlocksPerLaunch()),
462 * m_max_blocks * sizeof(
Type)))),
463 m_fn_value([&reduce_op,this] () ->
Type { return this->
value(reduce_op); })
465 reduce_op.resetResultReadiness();
466 static_assert(std::is_trivially_copyable<Type>(),
467 "ReduceData::Type must be trivially copyable");
468 static_assert(std::is_trivially_destructible<Type>(),
469 "ReduceData::Type must be trivially destructible");
471 new (m_host_tuple) Type();
477 !m_used_external_stream || m_value_called,
478 "ReduceData used on an external GPU stream must call value() before destruction.");
490 Type r = m_fn_value();
491 m_value_called =
true;
495 template <
typename... Ps>
499 m_value_called =
true;
505 return m_device_tuple+streamIndexChecked(s)*m_max_blocks;
517 m_max_stream_index = std::max(m_max_stream_index,streamIndexChecked(s));
528 if (m_stream_index_zero_set) {
530 "ReduceData cannot be reused across different external GPU streams "
531 "or between an external GPU stream and AMReX stream 0.");
533 m_stream_index_zero = s;
534 m_stream_index_zero_set =
true;
541 int m_max_stream_index = 0;
542 Type* m_host_tuple =
nullptr;
543 Type* m_device_tuple =
nullptr;
544 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
546 bool m_stream_index_zero_set =
false;
547 bool m_used_external_stream =
false;
548 bool m_value_called =
false;
549 std::function<Type()> m_fn_value;
553namespace Reduce::detail {
557 template <
typename F,
int dim>
559 auto call_f_intvect_box (F
const& f, IntVectND<dim> iv, IndexTypeND<dim>)
noexcept ->
560 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
562 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
565 template <
typename F,
int dim>
567 auto call_f_intvect_box (F
const& f, IntVectND<dim> iv, IndexTypeND<dim> t)
noexcept ->
568 decltype(f(BoxND<dim>(iv, iv, t)))
570 return f(BoxND<dim>(iv, iv, t));
574 template <
typename F,
typename T,
int dim>
576 auto call_f_intvect_n (F
const& f, IntVectND<dim> iv, T n)
noexcept ->
577 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
579 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
584 struct iterate_box {};
585 struct iterate_box_comp {};
587 template <
typename I,
typename F,
typename T,
typename... Ps,
588 std::enable_if_t<std::is_same_v<iterate_box,I>,
int> = 0>
590 void mf_call_f (F
const& f,
int ibox,
int i,
int j,
int k,
int, T& r)
noexcept
592 auto const& pr = f(ibox,i,j,k);
593 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
596 template <
typename I,
typename F,
typename T,
typename... Ps,
597 std::enable_if_t<std::is_same_v<iterate_box_comp,I>,
int> = 0>
599 void mf_call_f (F
const& f,
int ibox,
int i,
int j,
int k,
int ncomp, T& r)
noexcept
601 for (
int n = 0; n < ncomp; ++n) {
602 auto const& pr = f(ibox,i,j,k,n);
603 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
610template <
typename... Ps>
618 template <
typename I,
typename MF,
typename D,
typename F>
619 void eval_mf (I, MF
const& mf,
IntVect const& nghost,
int ncomp, D& reduce_data,
F const& f)
621 using ReduceTuple =
typename D::Type;
622 const int nboxes = mf.local_size();
624 auto const& parforinfo = mf.getParForInfo(nghost);
625 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
627 const int nblocks = nblocks_per_box * nboxes;
628 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
630 auto const& stream = Gpu::gpuStream();
631 auto pdst = reduce_data.devicePtr(stream);
632 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
634 reduce_data.nBlocks(stream) = nblocks_ec;
635 reduce_data.updateMaxStreamIndex(stream);
639 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
640 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
643 Dim1 blockIdx {gh.blockIdx()};
644 Dim1 threadIdx{gh.threadIdx()};
646 amrex::launch_global<AMREX_GPU_MAX_THREADS>
647 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
652 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
653 ReduceTuple& dst =
pdst[blockIdx.x];
654 if (threadIdx.x == 0) {
657 for (
int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
658 int ibox = iblock / nblocks_per_box;
659 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
662 if (icell < indexer.
numPts()) {
663 auto [i, j, k] = indexer(icell);
664 Reduce::detail::mf_call_f<I,
F, ReduceTuple, Ps...>
665 (f, ibox, i, j, k, ncomp, r);
669 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
671 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
678 template <
typename I,
int dim,
typename D,
typename F>
679 void eval_box (I,
BoxND<dim> const& box,
int ncomp, D& reduce_data,
F const& f)
681 using ReduceTuple =
typename D::Type;
682 auto const& stream = Gpu::gpuStream();
683 auto dp = reduce_data.devicePtr(stream);
684 int& nblocks = reduce_data.nBlocks(stream);
687 constexpr int nitems_per_thread = 4;
688 Long nblocks_ec = (box.
numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
689 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
690 nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
691 reduce_data.updateMaxStreamIndex(stream);
694 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
695 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
698 Dim1 blockIdx {gh.blockIdx()};
699 Dim1 threadIdx{gh.threadIdx()};
700 Dim1 gridDim {gh.gridDim()};
702 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
707 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
708 ReduceTuple& dst = *(dp+blockIdx.x);
709 if (threadIdx.x == 0 &&
static_cast<int>(blockIdx.x) >= nblocks) {
712 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
713 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
717 auto iv = indexer.
intVect(icell);
719 if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
720 auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
721 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
723 for (
int n = 0; n < ncomp; ++n) {
724 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
725 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
730 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
732 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
735 nblocks = std::max(nblocks,
static_cast<int>(nblocks_ec));
740 template <
typename MF,
typename D,
typename F>
741 std::enable_if_t<IsFabArray<MF>::value
742#ifndef AMREX_USE_CUDA
748 using ReduceTuple =
typename D::Type;
749 const int nboxes = mf.local_size();
752 }
else if (!mf.isFusingCandidate()) {
755 const int li = mfi.LocalIndex();
756 this->eval(b, reduce_data,
759 return f(li, i, j, k);
763 eval_mf(Reduce::detail::iterate_box{},
764 mf, nghost, 0, reduce_data, std::forward<F>(f));
768 template <
typename MF,
typename D,
typename F>
769 std::enable_if_t<IsFabArray<MF>::value
770#ifndef AMREX_USE_CUDA
774 eval (MF
const& mf,
IntVect const& nghost,
int ncomp, D& reduce_data, F&& f)
776 using ReduceTuple =
typename D::Type;
778 const int nboxes = mf.local_size();
782 }
else if (!mf.isFusingCandidate()) {
785 const int li = mfi.LocalIndex();
786 this->eval(b, ncomp, reduce_data,
789 return f(li, i, j, k, n);
793 eval_mf(Reduce::detail::iterate_box_comp{},
794 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
798 template <
typename D,
typename F,
int dim>
801 eval_box(Reduce::detail::iterate_box{}, box, 0, reduce_data, f);
804 template <
typename N,
typename D,
typename F,
int dim,
805 typename M=std::enable_if_t<std::is_integral_v<N>> >
808 eval_box(Reduce::detail::iterate_box_comp{}, box, ncomp, reduce_data, f);
811 template <
typename N,
typename D,
typename F,
812 typename M=std::enable_if_t<std::is_integral_v<N>> >
813 void eval (N n, D & reduce_data, F
const& f)
815 if (n <= 0) {
return; }
816 using ReduceTuple =
typename D::Type;
817 auto const& stream = Gpu::gpuStream();
818 auto dp = reduce_data.devicePtr(stream);
819 int& nblocks = reduce_data.nBlocks(stream);
820 constexpr int nitems_per_thread = 4;
821 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
822 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
823 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
824 reduce_data.updateMaxStreamIndex(stream);
827 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
828 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
831 Dim1 blockIdx {gh.blockIdx()};
832 Dim1 threadIdx{gh.threadIdx()};
833 Dim1 gridDim {gh.gridDim()};
835 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
840 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
841 ReduceTuple& dst = *(dp+blockIdx.x);
842 if (threadIdx.x == 0 &&
static_cast<int>(blockIdx.x) >= nblocks) {
845 for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
846 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
851 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
854 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
856 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
862 template <
typename D>
863 typename D::Type
value (D & reduce_data)
865 auto hp = reduce_data.hostPtr();
867 if (m_result_is_ready) {
868 reduce_data.markValueCalled();
872 using ReduceTuple =
typename D::Type;
873 auto const& stream = Gpu::gpuStream();
874 auto dp = reduce_data.devicePtr();
875 auto const& nblocks = reduce_data.nBlocks();
876#if defined(AMREX_USE_SYCL)
877 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
878 const int N = nblocks[0];
880 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
883 Gpu::dtoh_memcpy_async(tmp.
data(), dp,
sizeof(ReduceTuple)*N);
884 Gpu::streamSynchronize();
885 for (
int i = 1; i < N; ++i) {
886 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
893 int maxblocks = reduce_data.maxBlocks();
896 constexpr std::size_t shared_mem_bytes =
sizeof(
unsigned long long)*Gpu::Device::warp_size;
897#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
900 auto presult = dtmp.
data();
904 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
908 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
910 for (
int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
911 auto dp_stream = dp+istream*maxblocks;
912 for (
int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
913 i < nblocks[istream]; i += stride) {
914 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
917 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
918 if (gh.threadIdx() == 0) { *presult = dst; }
920#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
921 Gpu::dtoh_memcpy_async(hp, dtmp.
data(),
sizeof(ReduceTuple));
924 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
928 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
930 for (
int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
931 auto dp_stream = dp+istream*maxblocks;
932 for (
int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
933 i < nblocks[istream]; i += stride) {
934 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
937 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
938 if (threadIdx.x == 0) { *hp = dst; }
941 Gpu::streamSynchronize();
944 m_result_is_ready =
true;
945 reduce_data.markValueCalled();
951 bool m_result_is_ready =
false;
952 void resetResultReadiness () { m_result_is_ready =
false; }
957template <
typename T,
typename N,
958 std::enable_if_t<std::is_integral_v<N>,
int> FOO>
959T Sum (N n, T
const* v, T init_val)
963 using ReduceTuple =
typename decltype(reduce_data)::Type;
965 ReduceTuple hv = reduce_data.
value(reduce_op);
966 return amrex::get<0>(hv) + init_val;
969template <
typename T,
typename N,
typename F,
970 std::enable_if_t<std::is_integral_v<N> &&
971 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
972T Sum (N n, F
const& f, T init_val)
976 using ReduceTuple =
typename decltype(reduce_data)::Type;
978 ReduceTuple hv = reduce_data.
value(reduce_op);
979 return amrex::get<0>(hv) + init_val;
982template <
typename T,
typename N,
983 std::enable_if_t<std::is_integral_v<N>,
int> FOO>
984T Min (N n, T
const* v, T init_val)
988 using ReduceTuple =
typename decltype(reduce_data)::Type;
990 ReduceTuple hv = reduce_data.
value(reduce_op);
991 return std::min(amrex::get<0>(hv),init_val);
994template <
typename T,
typename N,
typename F,
995 std::enable_if_t<std::is_integral_v<N> &&
996 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
997T Min (N n, F
const& f, T init_val)
1001 using ReduceTuple =
typename decltype(reduce_data)::Type;
1003 ReduceTuple hv = reduce_data.
value(reduce_op);
1004 return std::min(amrex::get<0>(hv),init_val);
1007template <
typename T,
typename N,
1008 std::enable_if_t<std::is_integral_v<N>,
int> FOO>
1009T Max (N n, T
const* v, T init_val)
1013 using ReduceTuple =
typename decltype(reduce_data)::Type;
1015 ReduceTuple hv = reduce_data.
value(reduce_op);
1016 return std::max(amrex::get<0>(hv),init_val);
1019template <
typename T,
typename N,
typename F,
1020 std::enable_if_t<std::is_integral_v<N> &&
1021 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
1022T Max (N n, F
const& f, T init_val)
1026 using ReduceTuple =
typename decltype(reduce_data)::Type;
1028 ReduceTuple hv = reduce_data.
value(reduce_op);
1029 return std::max(amrex::get<0>(hv),init_val);
1032template <
typename T,
typename N,
1033 std::enable_if_t<std::is_integral_v<N>,
int> FOO>
1038 using ReduceTuple =
typename decltype(reduce_data)::Type;
1042 auto hv = reduce_data.
value(reduce_op);
1043 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1046template <
typename T,
typename N,
typename F,
1047 std::enable_if_t<std::is_integral_v<N> &&
1048 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
1053 using ReduceTuple =
typename decltype(reduce_data)::Type;
1058 auto hv = reduce_data.
value(reduce_op);
1059 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
1062template <
typename T,
typename N,
typename P,
1063 std::enable_if_t<std::is_integral_v<N>,
int> FOO>
1064bool AnyOf (N n, T
const* v, P
const& pred)
1070 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1072#ifdef AMREX_USE_SYCL
1073 const int num_ints = std::max(Gpu::Device::warp_size,
int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1074 const std::size_t shared_mem_bytes = num_ints*
sizeof(
int);
1075 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1077 int* has_any = &(
static_cast<int*
>(gh.sharedMemory())[num_ints-1]);
1078 if (gh.threadIdx() == 0) { *has_any = *dp; }
1084 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
1085 i < n && !r; i += stride)
1087 r = pred(v[i]) ? 1 : 0;
1090 r = Gpu::blockReduce<Gpu::Device::warp_size>
1092 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1096 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
1098 __shared__
int has_any;
1099 if (threadIdx.x == 0) { has_any = *dp; }
1105 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
1106 i < n && !r; i += stride)
1108 r = pred(v[i]) ? 1 : 0;
1110 r = Gpu::blockReduce<Gpu::Device::warp_size>
1112 if (threadIdx.x == 0 && r) *dp = 1;
1119template <
typename P,
int dim>
1127 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
1129#ifdef AMREX_USE_SYCL
1130 const int num_ints = std::max(Gpu::Device::warp_size,
int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
1131 const std::size_t shared_mem_bytes = num_ints*
sizeof(
int);
1132 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
1134 int* has_any = &(
static_cast<int*
>(gh.sharedMemory())[num_ints-1]);
1135 if (gh.threadIdx() == 0) { *has_any = *dp; }
1141 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
1142 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
1143 icell < indexer.
numPts() && !r;
1146 auto iv = indexer.
intVect(icell);
1147 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1149 r = Gpu::blockReduce<Gpu::Device::warp_size>
1151 if (gh.threadIdx() == 0 && r) { *dp = 1; }
1158 __shared__
int has_any;
1159 if (threadIdx.x == 0) { has_any = *dp; }
1165 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
1166 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
1167 icell < indexer.
numPts() && !r;
1170 auto iv = indexer.
intVect(icell);
1171 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
1173 r = Gpu::blockReduce<Gpu::Device::warp_size>
1175 if (threadIdx.x == 0 && r) *dp = 1;
1186template <
typename... Ts>
1190 using Type = GpuTuple<Ts...>;
1192 template <
typename... Ps>
1193 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
1194 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1195 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1197 reduce_op.resetResultReadiness();
1198 for (
auto& t : m_tuple) {
1199 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1203 ~ReduceData () =
default;
1204 ReduceData (ReduceData<Ts...>
const&) =
delete;
1205 ReduceData (ReduceData<Ts...> &&) =
delete;
1206 void operator= (ReduceData<Ts...>
const&) =
delete;
1207 void operator= (ReduceData<Ts...> &&) =
delete;
1209 Type value () {
return m_fn_value(); }
1211 template <
typename... Ps>
1212 Type value (ReduceOps<Ps...>& reduce_op)
1214 return reduce_op.value(*
this);
1217 Vector<Type>& reference () {
return m_tuple; }
1219 Type& reference (
int tid)
1221 if (m_tuple.size() == 1) {
1225 return m_tuple[tid];
1230 Vector<Type> m_tuple;
1231 std::function<Type()> m_fn_value;
1234namespace Reduce::detail {
1238 template <
typename F,
int dim>
1240 auto call_f_intvect (F
const& f, IntVectND<dim> iv)
noexcept ->
1241 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
1243 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
1248 template <
typename F,
typename T,
int dim>
1250 auto call_f_intvect_n (F
const& f, IntVectND<dim> iv, T n)
noexcept ->
1251 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
1253 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
1257template <
typename... Ps>
1264 template <
typename D,
typename F,
int dim>
1266 static auto call_f_box (BoxND<dim>
const& box,
typename D::Type & r, F
const& f)
1267 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<
decltype(
1268 Reduce::detail::call_f_intvect(f, IntVectND<dim>(0))
1269 )>,
typename D::Type>>
1271 using ReduceTuple =
typename D::Type;
1273 [&] (IntVectND<dim> iv) {
1274 auto pr = Reduce::detail::call_f_intvect(f, iv);
1275 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
1279 template <
typename D,
typename F,
int dim>
1281 static auto call_f_box (BoxND<dim>
const& box,
typename D::Type & r, F
const& f)
1282 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<
decltype(f(box))>,
1285 using ReduceTuple =
typename D::Type;
1286 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1291 template <
typename MF,
typename D,
typename F>
1292 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1293 eval (MF
const& mf, IntVect
const& nghost, D & reduce_data, F
const& f)
1295 using ReduceTuple =
typename D::Type;
1301 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1302 for (MFIter mfi(mf,
true); mfi.isValid(); ++mfi) {
1303 Box const& b = mfi.growntilebox(nghost);
1304 const int li = mfi.LocalIndex();
1307 for (
int k = lo.z; k <= hi.z; ++k) {
1308 for (
int j = lo.y; j <= hi.y; ++j) {
1309 for (
int i = lo.x; i <= hi.x; ++i) {
1310 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1313 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1314 reduce_data.reference(OpenMP::get_thread_num()), rr);
1318 template <
typename MF,
typename D,
typename F>
1319 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1320 eval (MF
const& mf, IntVect
const& nghost,
int ncomp, D & reduce_data, F
const& f)
1322 using ReduceTuple =
typename D::Type;
1328 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1329 for (MFIter mfi(mf,
true); mfi.isValid(); ++mfi) {
1330 Box const& b = mfi.growntilebox(nghost);
1331 const int li = mfi.LocalIndex();
1334 for (
int n = 0; n < ncomp; ++n) {
1335 for (
int k = lo.z; k <= hi.z; ++k) {
1336 for (
int j = lo.y; j <= hi.y; ++j) {
1337 for (
int i = lo.x; i <= hi.x; ++i) {
1338 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1341 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1342 reduce_data.reference(OpenMP::get_thread_num()), rr);
1346 template <
typename D,
typename F,
int dim>
1347 void eval (BoxND<dim>
const& box, D & reduce_data, F&& f)
1349 using ReduceTuple =
typename D::Type;
1351 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1352 call_f_box<D>(box, rr, std::forward<F>(f));
1353 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1354 reduce_data.reference(OpenMP::get_thread_num()), rr);
1357 template <
typename N,
typename D,
typename F,
int dim,
1358 typename M=std::enable_if_t<std::is_integral_v<N>> >
1359 void eval (BoxND<dim>
const& box, N ncomp, D & reduce_data, F
const& f)
1361 using ReduceTuple =
typename D::Type;
1363 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1365 [&] (IntVectND<dim> iv,
int n) {
1366 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
1367 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
1369 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1370 reduce_data.reference(OpenMP::get_thread_num()), rr);
1373 template <
typename N,
typename D,
typename F,
1374 typename M=std::enable_if_t<std::is_integral_v<N>> >
1375 void eval (N n, D & reduce_data, F
const& f)
1377 using ReduceTuple =
typename D::Type;
1379 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1380 for (N i = 0; i < n; ++i) {
1381 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1383 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1384 reduce_data.reference(OpenMP::get_thread_num()), rr);
1387 template <
typename D>
1388 typename D::Type value (D & reduce_data)
1390 auto& rrv = reduce_data.reference();
1391 if (! m_result_is_ready) {
1392 using ReduceTuple =
typename D::Type;
1393 if (rrv.size() > 1) {
1394 for (
int i = 1, N = rrv.size(); i < N; ++i) {
1395 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1398 m_result_is_ready =
true;
1404 template <
typename... T>
friend class ReduceData;
1405 bool m_result_is_ready =
false;
1406 void resetResultReadiness () { m_result_is_ready =
false; }
1411template <
typename T,
typename N,
typename F,
1412 std::enable_if_t<std::is_integral_v<N> &&
1413 !std::is_same_v<T*,std::decay_t<F>>,
int> >
1414T
Sum (N n, F
const& f, T init_val)
1418#pragma omp parallel for reduction(+:r)
1420 for (N i = 0; i < n; ++i) {
1426template <
typename T,
typename N,
1427 std::enable_if_t<std::is_integral_v<N>,
int> >
1428T
Sum (N n, T
const* v, T init_val)
1430 return Sum(n, [=] (N i) -> T {
return v[i]; }, init_val);
1433template <
typename T,
typename N,
typename F,
1434 std::enable_if_t<std::is_integral_v<N> &&
1435 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
1436T
Min (N n, F
const& f, T init_val)
1440#pragma omp parallel for reduction(min:r)
1442 for (N i = 0; i < n; ++i) {
1443 r = std::min(r,f(i));
1448template <
typename T,
typename N,
1449 std::enable_if_t<std::is_integral_v<N>,
int> >
1450T
Min (N n, T
const* v, T init_val)
1452 return Reduce::Min(n, [=] (N i) -> T {
return v[i]; }, init_val);
1455template <
typename T,
typename N,
typename F,
1456 std::enable_if_t<std::is_integral_v<N> &&
1457 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
1458T
Max (N n, F
const& f, T init_val)
1462#pragma omp parallel for reduction(max:r)
1464 for (N i = 0; i < n; ++i) {
1465 r = std::max(r,f(i));
1470template <
typename T,
typename N,
1471 std::enable_if_t<std::is_integral_v<N>,
int> >
1472T
Max (N n, T
const* v, T init_val)
1474 return Reduce::Max(n, [=] (N i) -> T {
return v[i]; }, init_val);
1477template <
typename T,
typename N,
typename F,
1478 std::enable_if_t<std::is_integral_v<N> &&
1479 !std::is_same_v<T*,std::decay_t<F>>,
int> FOO>
1480std::pair<T,T>
MinMax (N n, F
const& f)
1482 T r_min = std::numeric_limits<T>::max();
1483 T r_max = std::numeric_limits<T>::lowest();
1485#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1487 for (N i = 0; i < n; ++i) {
1489 r_min = std::min(r_min,tmp);
1490 r_max = std::max(r_max,tmp);
1492 return std::make_pair(r_min,r_max);
1495template <
typename T,
typename N,
typename M>
1496std::pair<T,T>
MinMax (N n, T
const* v)
1498 return Reduce::MinMax<T>(n, [=] (N i) -> T {
return v[i]; });
1501template <
typename T,
typename N,
typename P,
1502 std::enable_if_t<std::is_integral_v<N>,
int> >
1503bool AnyOf (N n, T
const* v, P&& pred)
1505 return std::any_of(v, v+n, std::forward<P>(pred));
1508template <
typename P,
int dim>
1509bool AnyOf (BoxND<dim>
const& box, P
const& pred)
1511 for (
auto iv : box.iterator()) {
1512 if (Reduce::detail::call_f_intvect(pred, iv)) {
return true; }
1525template <
typename... Ts,
typename... Ps>
1527constexpr GpuTuple<Ts...>
1531 Reduce::detail::for_each_init<0,
decltype(r), Ps...>(r);
1539template <
typename... Ts,
typename... Ps>
1541constexpr GpuTuple<Ts...>
1545 Reduce::detail::for_each_init<0,
decltype(r), Ps...>(r);
1550template <
typename Ops,
typename Ts>
1553template <
typename... Ops,
typename... Ts>
1554class ReducerImpl<TypeList<Ops...>, TypeList<Ts...>>
1557 static_assert(
sizeof...(Ops) > 0);
1558 static_assert(
sizeof...(Ts) > 0);
1559 static_assert(
sizeof...(Ops) ==
sizeof...(Ts));
1562 : m_reduce_data(m_reduce_op)
1566 using Result_t = GpuTuple<Ts...>;
1567 ReduceOps<Ops...> m_reduce_op;
1568 ReduceData<Ts...> m_reduce_data;
1636template <
typename Ops,
typename Ts>
1638 :
public ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>
1652 void operator= (
Reducer const&) =
delete;
1653 void operator= (
Reducer &&) =
delete;
1669 template <
typename F,
int dim>
1670 std::enable_if_t<IsCallable<F, int, int, int>::value ||
1674 this->m_reduce_op.eval(box, this->m_reduce_data, std::forward<F>(f));
1692 template <
typename F,
int dim>
1693 std::enable_if_t<IsCallable<F, int, int, int, int>::value ||
1697 this->m_reduce_op.eval(box, ncomp, this->m_reduce_data, std::forward<F>(f));
1719 template <
typename MF,
typename F>
1720 std::enable_if_t<IsFabArray<MF>::value &&
1724 this->m_reduce_op.eval(mf, nghost, this->m_reduce_data, std::forward<F>(f));
1749 template <
typename MF,
typename F>
1750 std::enable_if_t<IsFabArray<MF>::value &&
1754 this->m_reduce_op.eval(mf, nghost, ncomp, this->m_reduce_data, std::forward<F>(f));
1769 template <
typename N,
typename F>
1770 std::enable_if_t<IsCallable<F, N>::value>
1773 this->m_reduce_op.eval(n, this->m_reduce_data, std::forward<F>(f));
1788 return this->m_reduce_data.value(this->m_reduce_op);
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_MAX_STREAMS
Definition AMReX_GpuDevice.H:21
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:36
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:49
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:356
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:135
GPU-compatible tuple.
Definition AMReX_Tuple.H:98
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:709
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:836
Cell-Based or Node-Based Indices.
Definition AMReX_IndexType.H:36
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:85
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:169
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
T * data() noexcept
Definition AMReX_PODVector.H:666
Definition AMReX_Reduce.H:453
~ReduceData()
Definition AMReX_Reduce.H:475
int maxStreamIndex() const
Definition AMReX_Reduce.H:515
Type value()
Definition AMReX_Reduce.H:488
void updateMaxStreamIndex(gpuStream_t const &s)
Definition AMReX_Reduce.H:516
int & nBlocks(gpuStream_t const &s)
Definition AMReX_Reduce.H:511
ReduceData(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:458
void markValueCalled() noexcept
Definition AMReX_Reduce.H:520
Type * devicePtr(gpuStream_t const &s)
Definition AMReX_Reduce.H:504
Type value(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:496
Type * devicePtr()
Definition AMReX_Reduce.H:503
GpuArray< int, 8 > & nBlocks()
Definition AMReX_Reduce.H:510
ReduceData(ReduceData< Ts... > const &)=delete
Type * hostPtr()
Definition AMReX_Reduce.H:508
int maxBlocks() const
Definition AMReX_Reduce.H:513
ReduceData(ReduceData< Ts... > &&)=delete
Definition AMReX_Reduce.H:612
D::Type value(D &reduce_data)
Definition AMReX_Reduce.H:863
void eval(BoxND< dim > const &box, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:799
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:746
void eval(BoxND< dim > const &box, N ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:806
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, int ncomp, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:774
void eval(N n, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:813
Class for local reductions (e.g., sum, min and max).
Definition AMReX_Reduce.H:1639
std::enable_if_t< IsCallable< F, int, int, int >::value||IsCallable< F, IntVectND< dim > >::value > eval(BoxND< dim > const &box, F &&f)
Reduction over a Box.
Definition AMReX_Reduce.H:1672
std::enable_if_t< IsFabArray< MF >::value &&IsCallable< F, int, int, int, int, int >::value > eval(MF const &mf, IntVect const &nghost, int ncomp, F &&f)
Reduction over a MultiFab-like object.
Definition AMReX_Reduce.H:1752
std::enable_if_t< IsCallable< F, N >::value > eval(N n, F &&f)
Reduction over a 1D index range.
Definition AMReX_Reduce.H:1771
Result_t getResult()
Get the final reduction result.
Definition AMReX_Reduce.H:1786
typename Base::Result_t Result_t
Reduction result type, GpuTuple<U...>, where U... are the types in Ts.
Definition AMReX_Reduce.H:1643
std::enable_if_t< IsFabArray< MF >::value &&IsCallable< F, int, int, int, int >::value > eval(MF const &mf, IntVect const &nghost, F &&f)
Reduction over a MultiFab-like object.
Definition AMReX_Reduce.H:1722
std::enable_if_t< IsCallable< F, int, int, int, int >::value||IsCallable< F, IntVectND< dim >, int >::value > eval(BoxND< dim > const &box, int ncomp, F &&f)
Reduction over a Box plus component index.
Definition AMReX_Reduce.H:1695
amrex_long Long
Definition AMReX_INT.H:30
T Min(N n, T const *v, T init_val=std::numeric_limits< T >::max())
Compute the minimum of an array of values.
Definition AMReX_Reduce.H:984
T Max(N n, T const *v, T init_val=std::numeric_limits< T >::lowest())
Compute the maximum of an array of values.
Definition AMReX_Reduce.H:1009
std::pair< T, T > MinMax(N n, T const *v)
Compute the minimum and maximum of an array of values.
Definition AMReX_Reduce.H:1034
bool AnyOf(N n, T const *v, P const &pred)
Test whether any element in an array satisfies a unary predicate.
Definition AMReX_Reduce.H:1064
T Sum(N n, T const *v, T init_val=0)
Compute the sum of an array of values.
Definition AMReX_Reduce.H:959
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1331
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1280
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:845
Arena * The_Arena()
Definition AMReX_Arena.cpp:805
void Sum(T &v, MPI_Comm comm)
Definition AMReX_ParallelReduce.H:221
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:420
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:357
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:556
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:455
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:400
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:508
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:350
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
__host__ __device__ constexpr GpuTuple< Ts... > IdentityTuple(GpuTuple< Ts... >, ReduceOps< Ps... >) noexcept
Return a GpuTuple containing the identity element for each operation in ReduceOps....
Definition AMReX_Reduce.H:1528
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
typename ToTypeList< T >::type ToTypeList_t
Definition AMReX_TypeList.H:233
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:44
const int[]
Definition AMReX_BLProfiler.cpp:1664
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136
Definition AMReX_Box.H:2152
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Definition AMReX_Box.H:2169
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2193
Fixed-size array that can be used on GPU.
Definition AMReX_Array.H:43
Definition AMReX_Tuple.H:125
Definition AMReX_GpuMemory.H:56
T dataValue() const
Definition AMReX_GpuMemory.H:92
T * dataPtr()
Definition AMReX_GpuMemory.H:90
Definition AMReX_GpuLaunch.H:119
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuControl.H:131
Definition AMReX_GpuReduce.H:287
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:213
Definition AMReX_Functional.H:14
Definition AMReX_Reduce.H:382
constexpr std::enable_if_t< std::is_integral_v< T > > init(T &t) const noexcept
Definition AMReX_Reduce.H:410
__device__ std::enable_if_t< std::is_integral_v< T > > parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:396
__host__ __device__ std::enable_if_t< std::is_integral_v< T > > local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:406
Definition AMReX_Reduce.H:415
constexpr std::enable_if_t< std::is_integral_v< T > > init(T &t) const noexcept
Definition AMReX_Reduce.H:443
__host__ __device__ std::enable_if_t< std::is_integral_v< T > > local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:439
__device__ std::enable_if_t< std::is_integral_v< T > > parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:429
Definition AMReX_Reduce.H:348
constexpr std::enable_if_t< std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:373
constexpr std::enable_if_t<!std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:377
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:369
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:360
Definition AMReX_Reduce.H:314
constexpr std::enable_if_t<!std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:343
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:326
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:335
constexpr std::enable_if_t< std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:339
Definition AMReX_Reduce.H:284
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:297
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:306
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:309
Struct for holding types.
Definition AMReX_TypeList.H:13