#include <AMReX_Config.H>
namespace Reduce::detail {

    // Combine component I of tuple s into tuple d using reduce operation P,
    // cooperatively across the thread block (the overloads taking a
    // Gpu::Handler are used by the SYCL backend).
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
        for_each_parallel<I+1, T, P1, Ps...>(d, s, h);
    }

    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_parallel<I+1, T, P1, Ps...>(d, s);
    }

    // Combine component I of tuple s into tuple d within a single thread.
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_local<I+1, T, P1, Ps...>(d, s);
    }

    // Set component I of tuple t to the identity element of operation P.
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
        for_each_init<I+1, T, P1, Ps...>(t);
    }

} // namespace Reduce::detail
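// Illustrative note (added; not part of the original header): for a tuple
// GpuTuple<Real,Real> reduced with <ReduceOpSum,ReduceOpMin>, the recursion
// above unrolls to
//
//     ReduceOpSum().local_update(amrex::get<0>(d), amrex::get<0>(s));
//     ReduceOpMin().local_update(amrex::get<1>(d), amrex::get<1>(s));
//
// i.e. tuple component I is always combined with the I-th reduce operation.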
struct ReduceOpSum
{
    // Block-wide update: reduce s across the block, then fold the block
    // result into d from thread 0 (the Gpu::Handler overload is the SYCL path).
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceSum(s, h);
        if (h.threadIdx() == 0) { d += r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceSum<MT>(s);
        if (threadIdx.x == 0) { d += r; }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d += s; }

    template <typename T>
    constexpr void init (T& t) const noexcept { t = 0; }
};

struct ReduceOpMin
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMin(s, h);
        if (h.threadIdx() == 0) { d = amrex::min(d, r); }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMin<MT>(s);
        if (threadIdx.x == 0) { d = amrex::min(d, r); }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::min(d, s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::max(); }
};

struct ReduceOpMax
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMax(s, h);
        if (h.threadIdx() == 0) { d = amrex::max(d, r); }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMax<MT>(s);
        if (threadIdx.x == 0) { d = amrex::max(d, r); }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::max(d, s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::lowest(); }
};

struct ReduceOpLogicalAnd
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalAnd(s,h);
        if (h.threadIdx() == 0) { d = d && r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalAnd<MT>(s);
        if (threadIdx.x == 0) { d = d && r; }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d && s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = true; }
};

struct ReduceOpLogicalOr
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalOr(s,h);
        if (h.threadIdx() == 0) { d = d || r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalOr<MT>(s);
        if (threadIdx.x == 0) { d = d || r; }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d || s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = false; }
};
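// Illustrative note (added; not part of the original header): init() writes
// the identity element of each operation, e.g.
//
//     Real x; ReduceOpMin().init(x);          // x == std::numeric_limits<Real>::max()
//     Real y; ReduceOpMax().init(y);          // y == std::numeric_limits<Real>::lowest()
//     int  b; ReduceOpLogicalAnd().init(b);   // b == 1 (true)
//
// so a freshly initialized tuple can absorb partial results via
// local_update()/parallel_update() without biasing the reduction.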
template <typename... Ps> class ReduceOps;
template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
    // (member-initializer list allocating the pinned host tuple and the
    //  per-stream device tuples is not shown here)
    {
        reduce_op.resetResultReadiness();
        static_assert(std::is_trivially_copyable<Type>(),
                      "ReduceData::Type must be trivially copyable");
        static_assert(std::is_trivially_destructible<Type>(),
                      "ReduceData::Type must be trivially destructible");
        new (m_host_tuple) Type();
    }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    // (devicePtr()/hostPtr() accessors, maxBlocks(), nBlocks() and the
    //  per-stream bookkeeping are not shown here)

private:
    int m_max_stream_index = 0;
    Type* m_host_tuple = nullptr;
    Type* m_device_tuple = nullptr;
};
namespace Reduce::detail {

    // Invoke the user functor either cell-wise, f(i,j,k), or box-wise,
    // f(Box), depending on which call is well formed.
    template <typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f (F const& f, int i, int j, int k, IndexType)
        noexcept -> decltype(f(0,0,0))
    {
        return f(i,j,k);
    }

    template <typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f (F const& f, int i, int j, int k, IndexType t)
        noexcept -> decltype(f(Box()))
    {
        return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
                     IntVect(AMREX_D_DECL(i,j,k)),
                     t));
    }

    // Tag types selecting whether eval_mf iterates over cells only or over
    // cells and components.
    struct iterate_box {};
    struct iterate_box_comp {};

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box,I>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
    {
        auto const& pr = f(ibox,i,j,k);
        Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
    }

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box_comp,I>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
    {
        for (int n = 0; n < ncomp; ++n) {
            auto const& pr = f(ibox,i,j,k,n);
            Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
        }
    }

} // namespace Reduce::detail
template <typename... Ps>
class ReduceOps
{
public:

    // Fused reduction over all local boxes of a FabArray: each GPU block
    // strides over (box, cell-block) pairs and accumulates into its slot of
    // the per-stream ReduceData buffer.
    template <typename I, typename MF, typename D, typename F>
    void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();

        auto const& parforinfo = mf.getParForInfo(nghost);
        auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
        AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
        const int nblocks = nblocks_per_box * nboxes;
        const BoxIndexer* dp_boxes = parforinfo.getBoxes();

        auto const& stream = Gpu::gpuStream();
        auto pdst = reduce_data.devicePtr(stream);
        int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
        AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
        reduce_data.nBlocks(stream) = nblocks_ec;
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
#else
        amrex::launch_global<AMREX_GPU_MAX_THREADS>
            <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
            ([=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = pdst[blockIdx.x];
            if (threadIdx.x == 0) {
                dst = r;
            }
            for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                int ibox = iblock / nblocks_per_box;
                auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
                BoxIndexer const& indexer = dp_boxes[ibox];
                if (icell < indexer.numPts()) {
                    auto [i, j, k] = indexer(icell);
                    Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
                        (f, ibox, i, j, k, ncomp, r);
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
                {
                    return f(li, i, j, k);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box{},
                    mf, nghost, 0, reduce_data, std::forward<F>(f));
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, ncomp, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) -> ReduceTuple
                {
                    return f(li, i, j, k, n);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box_comp{},
                    mf, nghost, ncomp, reduce_data, std::forward<F>(f));
        }
    }
    template <typename D, typename F>
    void eval (Box const& box, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        int ncells = box.numPts();
        const auto lo  = amrex::lbound(box);
        const auto len = amrex::length(box);
        const auto lenxy = len.x*len.y;
        const auto lenx  = len.x;
        auto ixtype = box.ixType();
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            // Only initialize this block's slot if no previous launch on this
            // stream has already written to it.
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 icell < ncells; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        int ncells = box.numPts();
        const auto lo  = amrex::lbound(box);
        const auto len = amrex::length(box);
        const auto lenxy = len.x*len.y;
        const auto lenx  = len.x;
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 icell < ncells; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                for (N n = 0; n < ncomp; ++n) {
                    auto pr = f(i,j,k,n);
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D & reduce_data, F const& f)
    {
        if (n <= 0) { return; }
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 i < n; i += stride) {
                auto pr = f(i);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }
    // Combine the per-block, per-stream partial results and return the final
    // tuple.  The result is cached so value() can be called repeatedly.
    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto hp = reduce_data.hostPtr();

        if (m_result_is_ready) {
            return *hp;
        }

        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr();
        auto const& nblocks = reduce_data.nBlocks();
#if defined(AMREX_USE_SYCL)
        if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
            const int N = nblocks[0];
            if (N == 0) {
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
            } else {
                // (declaration of the host staging buffer `tmp` with N
                //  elements is not shown here)
                Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
                Gpu::streamSynchronize();
                for (int i = 1; i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
                }
                *hp = tmp[0];
            }
        } else
#endif
        {
            int maxblocks = reduce_data.maxBlocks();
#ifdef AMREX_USE_SYCL
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            // (declaration of the device staging buffer `dtmp` holding a
            //  single ReduceTuple is not shown here)
            auto presult = dtmp.data();
#else
            auto presult = hp;
#endif
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
                if (gh.threadIdx() == 0) { *presult = dst; }
            });
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
#endif
#else
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
            [=] AMREX_GPU_DEVICE () noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
                if (threadIdx.x == 0) { *hp = dst; }
            });
#endif
            Gpu::streamSynchronize();
        }

        m_result_is_ready = true;
        return *hp;
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
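// Illustrative usage sketch (added; not part of the original header). The
// function name `sum_and_min_of_fab` and its arguments are hypothetical; the
// ReduceOps/ReduceData/eval/value calls follow the interface defined above.
inline std::pair<Real,Real>
sum_and_min_of_fab (Box const& bx, Array4<Real const> const& a)
{
    ReduceOps<ReduceOpSum, ReduceOpMin> reduce_op;
    ReduceData<Real, Real> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;

    reduce_op.eval(bx, reduce_data,
    [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
    {
        return {a(i,j,k), a(i,j,k)};   // component 0 summed, component 1 minimized
    });

    ReduceTuple hv = reduce_data.value(reduce_op);   // synchronizes the GPU stream
    return {amrex::get<0>(hv), amrex::get<1>(hv)};
}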
// Free functions in namespace Reduce (GPU implementations).
namespace Reduce {

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}
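// Illustrative usage sketch (added; not part of the original header): `dv`
// is a hypothetical Gpu::DeviceVector<Real> already filled on the device;
// the pointer overload reads v[i] inside a GPU kernel, so v must be
// device-accessible.
//
//     Real const* p = dv.data();
//     Real total    = Reduce::Sum(dv.size(), p);
//     Real total_sq = Reduce::Sum<Real>(dv.size(),
//         [=] AMREX_GPU_DEVICE (Long i) { return p[i]*p[i]; });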
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        return {v[i],v[i]};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        T tmp = f(i);
        return {tmp,tmp};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}
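// Illustrative usage sketch (added; not part of the original header): `p` is
// a hypothetical device pointer holding n valid elements.
//
//     std::pair<Real,Real> mm = Reduce::MinMax(n, p);
//     // mm.first == minimum, mm.second == maximum over p[0..n)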
template <typename T, typename N, typename P,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    Gpu::DeviceScalar<int> ds(0);
    int* dp = ds.dataPtr();
    auto ec = Gpu::ExecutionConfig(n);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
    {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        // (work-group barrier not shown here)

        if (!(*has_any))
        {
            int r = false;
            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) *dp = 1;
        }
    });
#endif
    return ds.dataValue();
}
template <typename P>
bool AnyOf (Box const& box, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    Gpu::DeviceScalar<int> ds(0);
    int* dp = ds.dataPtr();
    int ncells = box.numPts();
    const auto lo  = amrex::lbound(box);
    const auto len = amrex::length(box);
    const auto lenxy = len.x*len.y;
    const auto lenx  = len.x;
    auto ec = Gpu::ExecutionConfig(ncells);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
    {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        // (work-group barrier not shown here)

        if (!(*has_any))
        {
            int r = false;
            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                 icell < ncells && !r; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                r = pred(i,j,k) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 icell < ncells && !r; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                r = pred(i,j,k) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) *dp = 1;
        }
    });
#endif
    return ds.dataValue();
}

} // namespace Reduce
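// Illustrative usage sketch (added; not part of the original header): `a` is
// a hypothetical Array4<Real const> and `bx` a Box; the predicate is
// evaluated on the device.
//
//     bool has_negative = Reduce::AnyOf(bx,
//         [=] AMREX_GPU_DEVICE (int i, int j, int k) {
//             return a(i,j,k) < Real(0.0);
//         });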
// Host (non-GPU) implementations of ReduceData, ReduceOps and the Reduce::
// free functions follow.

template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        for (auto& t : m_tuple) {
            Reduce::detail::for_each_init<0, Type, Ps...>(t);
        }
    }

    ~ReduceData () = default;
    ReduceData (ReduceData<Ts...> const&) = delete;
    ReduceData (ReduceData<Ts...> &&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    Type value () { return m_fn_value(); }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    Vector<Type>& reference () { return m_tuple; }

    Type& reference (int tid)
    {
        if (m_tuple.size() == 1) {
            // No OpenMP, or already inside a parallel region when this
            // ReduceData was constructed.
            return m_tuple[0];
        } else {
            return m_tuple[tid];
        }
    }

private:
    Vector<Type> m_tuple;
    std::function<Type()> m_fn_value;
};
template <typename... Ps>
class ReduceOps
{
private:

    // Apply f cell-wise, f(i,j,k), over the box ...
    template <typename D, typename F>
    AMREX_FORCE_INLINE
    static auto call_f (Box const& box, typename D::Type & r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
                                                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        const auto lo = amrex::lbound(box);
        const auto hi = amrex::ubound(box);
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
        }}}
    }

    // ... or box-wise, f(box), depending on which call is well formed.
    template <typename D, typename F>
    AMREX_FORCE_INLINE
    static auto call_f (Box const& box, typename D::Type & r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
                                                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
    }

public:

    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        {
            ReduceTuple rr;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                const auto lo = amrex::lbound(b);
                const auto hi = amrex::ubound(b);
                for (int k = lo.z; k <= hi.z; ++k) {
                for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
                }}}
            }
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
                reduce_data.reference(OpenMP::get_thread_num()), rr);
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        {
            ReduceTuple rr;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                const auto lo = amrex::lbound(b);
                const auto hi = amrex::ubound(b);
                for (int n = 0; n < ncomp; ++n) {
                for (int k = lo.z; k <= hi.z; ++k) {
                for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
                }}}}
            }
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
                reduce_data.reference(OpenMP::get_thread_num()), rr);
        }
    }

    template <typename D, typename F>
    void eval (Box const& box, D & reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        call_f<D>(box, rr, std::forward<F>(f));
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        const auto lo = amrex::lbound(box);
        const auto hi = amrex::ubound(box);
        for (N n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
        }}}}
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        for (N i = 0; i < n; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
        }
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }

    // Fold the per-thread tuples into slot 0 and return it; the result is
    // cached so value() can be called repeatedly.
    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto& rrv = reduce_data.reference();
        if (! m_result_is_ready) {
            using ReduceTuple = typename D::Type;
            if (rrv.size() > 1) {
                for (int i = 1, N = rrv.size(); i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
                }
            }
            m_result_is_ready = true;
        }
        return rrv[0];
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
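// Illustrative usage sketch (added; not part of the original header): the
// host classes above expose the same interface as the GPU versions, so code
// written against ReduceOps/ReduceData compiles unchanged. `n` and `f` are
// hypothetical; f(i) returns a Real.
//
//     ReduceOps<ReduceOpSum> reduce_op;
//     ReduceData<Real> reduce_data(reduce_op);
//     using ReduceTuple = typename decltype(reduce_data)::Type;
//     reduce_op.eval(n, reduce_data, [=] (Long i) -> ReduceTuple { return f(i); });
//     Real total = amrex::get<0>(reduce_data.value(reduce_op));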
// Host (non-GPU) Reduce:: free functions.
namespace Reduce {

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(+:r)
#endif
    for (N i = 0; i < n; ++i) {
        r += f(i);
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::min(r,f(i));
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(max:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::max(r,f(i));
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    T r_min = std::numeric_limits<T>::max();
    T r_max = std::numeric_limits<T>::lowest();
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
#endif
    for (N i = 0; i < n; ++i) {
        T tmp = f(i);
        r_min = std::min(r_min,tmp);
        r_max = std::max(r_max,tmp);
    }
    return std::make_pair(r_min,r_max);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
}
template <typename T, typename N, typename P,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P&& pred)
{
    return std::any_of(v, v+n, std::forward<P>(pred));
}

template <typename P>
bool AnyOf (Box const& box, P const& pred)
{
    const auto lo = amrex::lbound(box);
    const auto hi = amrex::ubound(box);
    for (int k = lo.z; k <= hi.z; ++k) {
    for (int j = lo.y; j <= hi.y; ++j) {
    for (int i = lo.x; i <= hi.x; ++i) {
        if (pred(i,j,k)) { return true; }
    }}}
    return false;
}

} // namespace Reduce
// Return a GpuTuple containing the identity element for each operation in
// ReduceOps, e.g. 0 for ReduceOpSum and the largest representable value for
// ReduceOpMin.
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}

// Overload taking the reduce operations as a TypeList.
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}
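// Illustrative usage sketch (added; not part of the original header):
//
//     auto id = IdentityTuple(GpuTuple<Real,Real>{},
//                             ReduceOps<ReduceOpSum,ReduceOpMax>{});
//     // amrex::get<0>(id) == 0
//     // amrex::get<1>(id) == std::numeric_limits<Real>::lowest()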