1 #ifndef AMREX_REDUCE_H_
2 #define AMREX_REDUCE_H_
3 #include <AMReX_Config.H>
17 namespace Reduce::detail {
21 template <std::size_t I, typename T, typename P>
25 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
28 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
32 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
36 template <std::size_t I, typename T, typename P>
40 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
43 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
47 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
53 template <std::size_t I, typename T, typename P>
57 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
60 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
64 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
68 template <std::size_t I, typename T, typename P>
72 P().init(amrex::get<I>(t));
75 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
79 P().init(amrex::get<I>(t));
93 if (h.threadIdx() == 0) { d += r; }
96 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
99 T r = Gpu::blockReduceSum<MT>(s);
100 if (threadIdx.x == 0) { d += r; }
105 template <typename T>
109 template <typename T>
110 constexpr void init (T& t) const noexcept { t = 0; }
116 #ifdef AMREX_USE_SYCL
117 template <typename T>
124 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
127 T r = Gpu::blockReduceMin<MT>(s);
133 template <typename T>
137 template <typename T>
138 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
141 template <typename T>
142 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
149 #ifdef AMREX_USE_SYCL
150 template <typename T>
157 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
160 T r = Gpu::blockReduceMax<MT>(s);
166 template <typename T>
170 template <typename T>
171 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
172 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
174 template <typename T>
175 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
176 init (T& t) const noexcept { t = T::lowest(); }
182 #ifdef AMREX_USE_SYCL
183 template <typename T>
185 std::enable_if_t<std::is_integral<T>::value>
187 T r = Gpu::blockReduceLogicalAnd(s,h);
188 if (h.threadIdx() == 0) { d = d && r; }
191 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
193 std::enable_if_t<std::is_integral<T>::value>
195 T r = Gpu::blockReduceLogicalAnd<MT>(s);
196 if (threadIdx.x == 0) { d = d && r; }
201 template <typename T>
203 std::enable_if_t<std::is_integral_v<T>>
206 template <typename T>
207 constexpr std::enable_if_t<std::is_integral_v<T>>
208 init (T& t) const noexcept { t = true; }
214 #ifdef AMREX_USE_SYCL
215 template <typename T>
217 std::enable_if_t<std::is_integral<T>::value>
219 T r = Gpu::blockReduceLogicalOr(s,h);
220 if (h.threadIdx() == 0) { d = d || r; }
223 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
225 std::enable_if_t<std::is_integral<T>::value>
227 T r = Gpu::blockReduceLogicalOr<MT>(s);
228 if (threadIdx.x == 0) { d = d || r; }
233 template <typename T>
235 std::enable_if_t<std::is_integral_v<T>>
238 template <typename T>
239 constexpr std::enable_if_t<std::is_integral_v<T>>
240 init (T& t) const noexcept { t = false; }
243 template <typename... Ps> class ReduceOps;
247 template <typename... Ts>
253 template <typename... Ps>
261 reduce_op.resetResultReadiness();
262 static_assert(std::is_trivially_copyable<Type>(),
263 "ReduceData::Type must be trivially copyable");
264 static_assert(std::is_trivially_destructible<Type>(),
265 "ReduceData::Type must be trivially destructible");
267 new (m_host_tuple) Type();
286 template <typename... Ps>
289 return reduce_op.value(*this);
311 int m_max_stream_index = 0;
313 Type* m_device_tuple = nullptr;
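// Usage sketch (illustrative, not part of this header): a ReduceData is bound to a
// ReduceOps instance, with one tuple slot per reduction operation. The functor handed
// to ReduceOps::eval returns a GpuTuple matching ReduceData's value types; "box" and
// the Array4 "a" below are assumed inputs for the example.
//
//   ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
//   ReduceData<Real, Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   reduce_op.eval(box, reduce_data,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple {
//           return { a(i,j,k), a(i,j,k) };   // first slot feeds the sum, second the max
//       });
//   ReduceTuple hv = reduce_data.value(reduce_op);   // blocking; finishes the reduction
//   Real the_sum = amrex::get<0>(hv);
//   Real the_max = amrex::get<1>(hv);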
318 namespace Reduce::detail {
319 template <typename F>
322 noexcept -> decltype(f(0,0,0))
327 template <typename F>
330 noexcept -> decltype(f(Box()))
341 template <typename I, typename F, typename T, typename... Ps,
342 std::enable_if_t<std::is_same<iterate_box,I>::value, int> = 0>
344 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
346 auto const& pr = f(ibox,i,j,k);
350 template <typename I, typename F, typename T, typename... Ps,
351 std::enable_if_t<std::is_same<iterate_box_comp,I>::value, int> = 0>
353 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
355 for (int n = 0; n < ncomp; ++n) {
356 auto const& pr = f(ibox,i,j,k,n);
362 template <typename... Ps>
368 template <typename I, typename MF, typename D, typename F>
369 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
371 using ReduceTuple = typename D::Type;
372 const int nboxes = mf.local_size();
374 auto const& parforinfo = mf.getParForInfo(nghost,AMREX_GPU_MAX_THREADS);
375 auto par_for_blocks = parforinfo.getBlocks();
376 const int nblocks = par_for_blocks.first[nboxes];
377 const int block_0_size = par_for_blocks.first[1];
378 const int* dp_nblocks = par_for_blocks.second;
379 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
382 auto pdst = reduce_data.devicePtr(stream);
383 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
385 reduce_data.nBlocks(stream) = nblocks_ec;
386 reduce_data.updateMaxStreamIndex(stream);
388 #ifdef AMREX_USE_SYCL
390 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
391 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
394 Dim1 blockIdx {gh.blockIdx()};
395 Dim1 threadIdx{gh.threadIdx()};
397 amrex::launch_global<AMREX_GPU_MAX_THREADS>
398 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
404 ReduceTuple& dst = pdst[blockIdx.x];
405 if (threadIdx.x == 0) {
408 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
413 icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
415 ibox = iblock / block_0_size;
416 icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
420 if (icell < indexer.numPts()) {
421 auto [i, j, k] = indexer(icell);
423 (f, ibox, i, j, k, ncomp, r);
426 #ifdef AMREX_USE_SYCL
435 template <typename MF, typename D, typename F>
436 std::enable_if_t<IsFabArray<MF>::value
437 #ifndef AMREX_USE_CUDA
443 using ReduceTuple = typename D::Type;
444 const int nboxes = mf.local_size();
447 } else if (!mf.isFusingCandidate()) {
450 const int li = mfi.LocalIndex();
451 this->eval(b, reduce_data,
454 return f(li, i, j, k);
459 mf, nghost, 0, reduce_data, std::forward<F>(f));
463 template <typename MF, typename D, typename F>
464 std::enable_if_t<IsFabArray<MF>::value
465 #ifndef AMREX_USE_CUDA
469 eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
471 using ReduceTuple = typename D::Type;
473 const int nboxes = mf.local_size();
477 } else if (!mf.isFusingCandidate()) {
480 const int li = mfi.LocalIndex();
481 this->eval(b, ncomp, reduce_data,
484 return f(li, i, j, k, n);
489 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
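// Usage sketch (illustrative): the MultiFab overloads iterate all local boxes grown by
// nghost; the functor receives the local box number plus (i,j,k) (and a component index
// for the ncomp overload). "mf" is an assumed MultiFab and "ma" its MultiArray4.
//
//   auto const& ma = mf.const_arrays();
//   reduce_op.eval(mf, IntVect(0), reduce_data,
//       [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) -> ReduceTuple {
//           return { ma[box_no](i,j,k) };
//       });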
493 template <typename D, typename F>
494 void eval (Box const& box, D & reduce_data, F const& f)
496 using ReduceTuple = typename D::Type;
498 auto dp = reduce_data.devicePtr(stream);
499 int& nblocks = reduce_data.nBlocks(stream);
500 int ncells = box.numPts();
503 const auto lenxy = len.x*len.y;
504 const auto lenx = len.x;
506 constexpr int nitems_per_thread = 4;
507 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
508 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
509 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
510 reduce_data.updateMaxStreamIndex(stream);
511 #ifdef AMREX_USE_SYCL
513 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
514 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
517 Dim1 blockIdx {gh.blockIdx()};
518 Dim1 threadIdx{gh.threadIdx()};
519 Dim1 gridDim {gh.gridDim()};
521 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
527 ReduceTuple& dst = *(dp+blockIdx.x);
528 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
531 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
532 icell < ncells; icell += stride) {
533 int k = icell / lenxy;
534 int j = (icell - k*lenxy) / lenx;
535 int i = (icell - k*lenxy) - j*lenx;
542 #ifdef AMREX_USE_SYCL
548 nblocks = std::max(nblocks, nblocks_ec);
551 template <typename N, typename D, typename F,
552 typename M=std::enable_if_t<std::is_integral<N>::value> >
553 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
555 using ReduceTuple = typename D::Type;
557 auto dp = reduce_data.devicePtr(stream);
558 int& nblocks = reduce_data.nBlocks(stream);
559 int ncells = box.numPts();
562 const auto lenxy = len.x*len.y;
563 const auto lenx = len.x;
564 constexpr int nitems_per_thread = 4;
565 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
566 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
567 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
568 reduce_data.updateMaxStreamIndex(stream);
569 #ifdef AMREX_USE_SYCL
571 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
572 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
575 Dim1 blockIdx {gh.blockIdx()};
576 Dim1 threadIdx{gh.threadIdx()};
577 Dim1 gridDim {gh.gridDim()};
579 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
585 ReduceTuple& dst = *(dp+blockIdx.x);
586 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
589 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
590 icell < ncells; icell += stride) {
591 int k = icell / lenxy;
592 int j = (icell - k*lenxy) / lenx;
593 int i = (icell - k*lenxy) - j*lenx;
597 for (N n = 0; n < ncomp; ++n) {
598 auto pr = f(i,j,k,n);
602 #ifdef AMREX_USE_SYCL
608 nblocks = std::max(nblocks, nblocks_ec);
611 template <typename N, typename D, typename F,
612 typename M=std::enable_if_t<std::is_integral<N>::value> >
613 void eval (N n, D & reduce_data, F const& f)
615 if (n <= 0) { return; }
616 using ReduceTuple = typename D::Type;
618 auto dp = reduce_data.devicePtr(stream);
619 int& nblocks = reduce_data.nBlocks(stream);
620 constexpr int nitems_per_thread = 4;
621 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
622 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
623 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
624 reduce_data.updateMaxStreamIndex(stream);
625 #ifdef AMREX_USE_SYCL
627 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
628 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
631 Dim1 blockIdx {gh.blockIdx()};
632 Dim1 threadIdx{gh.threadIdx()};
633 Dim1 gridDim {gh.gridDim()};
635 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
641 ReduceTuple& dst = *(dp+blockIdx.x);
642 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
645 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
646 i < n; i += stride) {
650 #ifdef AMREX_USE_SYCL
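// Usage sketch (illustrative): this overload reduces over a plain index range, e.g. a
// device array "p" of length "npts" (both assumed here):
//
//   reduce_op.eval(npts, reduce_data,
//       [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple { return { p[i] }; });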
659 template <typename D>
660 typename D::Type value (D & reduce_data)
662 auto hp = reduce_data.hostPtr();
664 if (m_result_is_ready) {
668 using ReduceTuple =
typename D::Type;
670 auto dp = reduce_data.devicePtr();
671 auto const& nblocks = reduce_data.nBlocks();
672 #if defined(AMREX_USE_SYCL)
673 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
674 const int N = nblocks[0];
681 for (int i = 1; i < N; ++i) {
689 int maxblocks = reduce_data.maxBlocks();
690 #ifdef AMREX_USE_SYCL
692 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
693 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
696 auto presult = dtmp.data();
700 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
706 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
707 auto dp_stream = dp+istream*maxblocks;
708 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
709 i < nblocks[istream]; i += stride) {
714 if (gh.threadIdx() == 0) { *presult = dst; }
716 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
720 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
726 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
727 auto dp_stream = dp+istream*maxblocks;
728 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
729 i < nblocks[istream]; i += stride) {
734 if (threadIdx.x == 0) { *hp = dst; }
740 m_result_is_ready = true;
745 bool m_result_is_ready = false;
753 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
754 T Sum (N n, T const* v, T init_val = 0)
758 using ReduceTuple = typename decltype(reduce_data)::Type;
760 ReduceTuple hv = reduce_data.value(reduce_op);
761 return amrex::get<0>(hv) + init_val;
764 template <typename T, typename N, typename F,
765 typename M=std::enable_if_t<std::is_integral<N>::value> >
766 T Sum (N n, F const& f, T init_val = 0)
770 using ReduceTuple = typename decltype(reduce_data)::Type;
772 ReduceTuple hv = reduce_data.value(reduce_op);
773 return amrex::get<0>(hv) + init_val;
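// Usage sketch (illustrative): the free functions wrap the ReduceOps/ReduceData pattern
// for a single operation. "p" is an assumed device pointer of length n:
//
//   Real total  = Reduce::Sum<Real>(n, [=] AMREX_GPU_DEVICE (int i) -> Real { return p[i]; });
//   Real total2 = Reduce::Sum(n, p);   // pointer overload above; T deduced from p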
776 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
781 using ReduceTuple = typename decltype(reduce_data)::Type;
783 ReduceTuple hv = reduce_data.value(reduce_op);
784 return std::min(amrex::get<0>(hv),init_val);
787 template <typename T, typename N, typename F,
788 typename M=std::enable_if_t<std::is_integral<N>::value> >
793 using ReduceTuple = typename decltype(reduce_data)::Type;
795 ReduceTuple hv = reduce_data.value(reduce_op);
796 return std::min(amrex::get<0>(hv),init_val);
799 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
800 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
804 using ReduceTuple = typename decltype(reduce_data)::Type;
806 ReduceTuple hv = reduce_data.value(reduce_op);
807 return std::max(amrex::get<0>(hv),init_val);
810 template <typename T, typename N, typename F,
811 typename M=std::enable_if_t<std::is_integral<N>::value> >
812 T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
816 using ReduceTuple = typename decltype(reduce_data)::Type;
818 ReduceTuple hv = reduce_data.value(reduce_op);
819 return std::max(amrex::get<0>(hv),init_val);
822 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
827 using ReduceTuple = typename decltype(reduce_data)::Type;
831 auto hv = reduce_data.value(reduce_op);
832 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
835 template <typename T, typename N, typename F,
836 typename M=std::enable_if_t<std::is_integral<N>::value> >
841 using ReduceTuple = typename decltype(reduce_data)::Type;
846 auto hv = reduce_data.value(reduce_op);
847 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
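// Usage sketch (illustrative): MinMax performs both reductions in one pass and returns
// the pair. "p" is an assumed device pointer of length n:
//
//   std::pair<Real,Real> mm = Reduce::MinMax<Real>(n,
//       [=] AMREX_GPU_DEVICE (int i) -> Real { return p[i]; });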
850 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
851 bool AnyOf (N n, T const* v, P const& pred)
857 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
859 #ifdef AMREX_USE_SYCL
860 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
861 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
862 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
864 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
865 if (gh.threadIdx() == 0) { *has_any = *dp; }
871 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
872 i < n && !r; i += stride)
874 r = pred(v[i]) ? 1 : 0;
877 r = Gpu::blockReduce<Gpu::Device::warp_size>
879 if (gh.threadIdx() == 0 && r) { *dp = 1; }
883 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
885 __shared__ int has_any;
886 if (threadIdx.x == 0) { has_any = *dp; }
892 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
893 i < n && !r; i += stride)
895 r = pred(v[i]) ? 1 : 0;
897 r = Gpu::blockReduce<Gpu::Device::warp_size>
899 if (threadIdx.x == 0 && r) *dp = 1;
906 template <typename P>
912 int ncells = box.numPts();
915 const auto lenxy = len.x*len.y;
916 const auto lenx = len.x;
918 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
920 #ifdef AMREX_USE_SYCL
921 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
922 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
923 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
925 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
926 if (gh.threadIdx() == 0) { *has_any = *dp; }
932 for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
933 icell < ncells && !r; icell += stride) {
934 int k = icell / lenxy;
935 int j = (icell - k*lenxy) / lenx;
936 int i = (icell - k*lenxy) - j*lenx;
940 r = pred(i,j,k) ? 1 : 0;
942 r = Gpu::blockReduce<Gpu::Device::warp_size>
944 if (gh.threadIdx() == 0 && r) { *dp = 1; }
951 __shared__ int has_any;
952 if (threadIdx.x == 0) { has_any = *dp; }
958 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
959 icell < ncells && !r; icell += stride) {
960 int k = icell / lenxy;
961 int j = (icell - k*lenxy) / lenx;
962 int i = (icell - k*lenxy) - j*lenx;
966 r = pred(i,j,k) ? 1 : 0;
968 r = Gpu::blockReduce<Gpu::Device::warp_size>
970 if (threadIdx.x == 0 && r) *dp = 1;
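// Usage sketch (illustrative): the Box overload of AnyOf uses an early-exit flag rather
// than a full reduction. "a" and "threshold" are assumed inputs:
//
//   bool found = Reduce::AnyOf(box,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> bool {
//           return a(i,j,k) > threshold;
//       });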
981 template <typename... Ts>
985 using Type = GpuTuple<Ts...>;
987 template <typename... Ps>
988 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
990 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
992 reduce_op.resetResultReadiness();
993 for (auto& t : m_tuple) {
998 ~ReduceData () = default;
999 ReduceData (ReduceData<Ts...> const&) = delete;
1000 ReduceData (ReduceData<Ts...> &&) = delete;
1001 void operator= (ReduceData<Ts...> const&) = delete;
1002 void operator= (ReduceData<Ts...> &&) = delete;
1004 Type value () { return m_fn_value(); }
1006 template <typename... Ps>
1007 Type value (ReduceOps<Ps...>& reduce_op)
1009 return reduce_op.value(*this);
1012 Vector<Type>& reference () { return m_tuple; }
1014 Type& reference (int tid)
1016 if (m_tuple.size() == 1) {
1020 return m_tuple[tid];
1025 Vector<Type> m_tuple;
1026 std::function<Type()> m_fn_value;
1029 template <typename... Ps>
1034 template <typename D, typename F>
1036 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1037 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
1040 using ReduceTuple = typename D::Type;
1043 for (int k = lo.z; k <= hi.z; ++k) {
1044 for (int j = lo.y; j <= hi.y; ++j) {
1045 for (int i = lo.x; i <= hi.x; ++i) {
1050 template <typename D, typename F>
1052 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1053 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
1056 using ReduceTuple = typename D::Type;
1062 template <typename MF, typename D, typename F>
1063 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1064 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1066 using ReduceTuple = typename D::Type;
1067 #ifdef AMREX_USE_OMP
1068 #pragma omp parallel
1070 for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) {
1071 Box const& b = mfi.growntilebox(nghost);
1072 const int li = mfi.LocalIndex();
1076 for (int k = lo.z; k <= hi.z; ++k) {
1077 for (int j = lo.y; j <= hi.y; ++j) {
1078 for (int i = lo.x; i <= hi.x; ++i) {
1084 template <typename MF, typename D, typename F>
1085 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1086 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1088 using ReduceTuple = typename D::Type;
1089 #ifdef AMREX_USE_OMP
1090 #pragma omp parallel
1092 for (MFIter mfi(mf, true); mfi.isValid(); ++mfi) {
1093 Box const& b = mfi.growntilebox(nghost);
1094 const int li = mfi.LocalIndex();
1098 for (int n = 0; n < ncomp; ++n) {
1099 for (int k = lo.z; k <= hi.z; ++k) {
1100 for (int j = lo.y; j <= hi.y; ++j) {
1101 for (int i = lo.x; i <= hi.x; ++i) {
1107 template <typename D, typename F>
1108 void eval (Box const& box, D & reduce_data, F&& f)
1111 call_f<D>(box, rr, std::forward<F>(f));
1114 template <typename N, typename D, typename F,
1115 typename M=std::enable_if_t<std::is_integral_v<N>> >
1116 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
1118 using ReduceTuple = typename D::Type;
1122 for (N n = 0; n < ncomp; ++n) {
1123 for (int k = lo.z; k <= hi.z; ++k) {
1124 for (int j = lo.y; j <= hi.y; ++j) {
1125 for (int i = lo.x; i <= hi.x; ++i) {
1130 template <typename N, typename D, typename F,
1131 typename M=std::enable_if_t<std::is_integral_v<N>> >
1132 void eval (N n, D & reduce_data, F const& f)
1134 using ReduceTuple = typename D::Type;
1136 for (N i = 0; i < n; ++i) {
1141 template <typename D>
1142 typename D::Type value (D & reduce_data)
1144 auto& rrv = reduce_data.reference();
1145 if (! m_result_is_ready) {
1146 using ReduceTuple =
typename D::Type;
1147 if (rrv.size() > 1) {
1148 for (int i = 1, N = rrv.size(); i < N; ++i) {
1152 m_result_is_ready = true;
1157 bool m_result_is_ready = false;
1159 void resetResultReadiness () { m_result_is_ready = false; }
1164 template <typename T, typename N, typename F,
1165 typename M=std::enable_if_t<std::is_integral_v<N>> >
1166 T Sum (N n, F const& f, T init_val = 0)
1169 #ifdef AMREX_USE_OMP
1170 #pragma omp parallel for reduction(+:r)
1172 for (N i = 0; i < n; ++i) {
1178 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1179 T Sum (N n, T const* v, T init_val = 0)
1181 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1184 template <typename T, typename N, typename F,
1185 typename M=std::enable_if_t<std::is_integral_v<N>> >
1189 #ifdef AMREX_USE_OMP
1190 #pragma omp parallel for reduction(min:r)
1192 for (N i = 0; i < n; ++i) {
1198 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1201 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1204 template <typename T, typename N, typename F,
1205 typename M=std::enable_if_t<std::is_integral_v<N>> >
1206 T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
1209 #ifdef AMREX_USE_OMP
1210 #pragma omp parallel for reduction(max:r)
1212 for (N i = 0; i < n; ++i) {
1218 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1219 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
1221 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
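// Usage sketch (illustrative): without GPU support these overloads reduce on the host,
// using the OpenMP reduction pragmas above when OpenMP is enabled. "hp" is an assumed
// host pointer of length n:
//
//   Real vmax = Reduce::Max(n, hp);              // T deduced from hp
//   Real vsum = Reduce::Sum(n, hp, Real(0.0));   // explicit initial value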
1224 template <typename T, typename N, typename F,
1225 typename M=std::enable_if_t<std::is_integral_v<N>> >
1226 std::pair<T,T> MinMax (N n, F const& f)
1229 T r_max = std::numeric_limits<T>::lowest();
1230 #ifdef AMREX_USE_OMP
1231 #pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1233 for (N i = 0; i < n; ++i) {
1238 return std::make_pair(r_min,r_max);
1241 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1242 std::pair<T,T> MinMax (N n, T const* v)
1244 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1247 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1248 bool AnyOf (N n, T const* v, P&& pred)
1250 return std::any_of(v, v+n, std::forward<P>(pred));
1253 template <typename P>
1254 bool AnyOf (Box const& box, P const& pred)
1258 for (int k = lo.z; k <= hi.z; ++k) {
1259 for (int j = lo.y; j <= hi.y; ++j) {
1260 for (int i = lo.x; i <= hi.x; ++i) {
1261 if (pred(i,j,k)) { return true; }
1274 template <typename... Ts, typename... Ps>
1276 constexpr GpuTuple<Ts...>
1288 template <typename... Ts, typename... Ps>
1290 constexpr GpuTuple<Ts...>
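// Usage sketch (illustrative): the IdentityTuple overloads return a GpuTuple holding
// the identity element of each ReduceOp in the TypeList (e.g. 0 for ReduceOpSum,
// lowest() for ReduceOpMax). The exact overload used below is an assumption:
//
//   auto t = IdentityTuple(GpuTuple<Real,Real>{}, TypeList<ReduceOpSum,ReduceOpMax>{});
//   // amrex::get<0>(t) == 0; amrex::get<1>(t) == std::numeric_limits<Real>::lowest()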