#ifndef AMREX_REDUCE_H_
#define AMREX_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_Arena.H>
#include <AMReX_Box.H>
#include <AMReX_Gpu.H>
#include <AMReX_MFIter.H>
#include <AMReX_OpenMP.H>

#include <algorithm>
#include <functional>
#include <limits>

namespace amrex {

namespace Reduce::detail {
    // Apply parallel_update of each reduce operation in the pack to the
    // corresponding component of the tuples d (destination) and s (source).
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
        for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
    }

    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_parallel<I+1,T,P1,Ps...>(d, s);
    }

    // Apply local_update of each reduce operation in the pack.
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_local<I+1,T,P1,Ps...>(d, s);
    }

    // Initialize each tuple component with the identity of its reduce operation.
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
        for_each_init<I+1,T,P1,Ps...>(t);
    }
}
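// Illustrative sketch (not part of the original header): how the for_each_*
// helpers walk a GpuTuple and a matching pack of reduce operations in
// lockstep.  The tuple type and the two-operation pack are assumptions
// chosen purely for illustration.
//
//   GpuTuple<Real,Real> acc;
//   Reduce::detail::for_each_init<0, decltype(acc), ReduceOpSum, ReduceOpMin>(acc);
//   // acc is now {0, std::numeric_limits<Real>::max()}
//   GpuTuple<Real,Real> contrib{1.5, 2.0};
//   Reduce::detail::for_each_local<0, decltype(acc), ReduceOpSum, ReduceOpMin>(acc, contrib);
//   // component 0 was accumulated with +=, component 1 with amrex::min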
struct ReduceOpSum
{
#ifdef AMREX_USE_GPU
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceSum(s, h);
        if (h.threadIdx() == 0) { d += r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceSum<MT>(s);
        if (threadIdx.x == 0) { d += r; }
    }
#endif
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d += s; }

    template <typename T>
    constexpr void init (T& t) const noexcept { t = 0; }
};
struct ReduceOpMin
{
#ifdef AMREX_USE_GPU
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMin(s, h);
        if (h.threadIdx() == 0) { d = amrex::min(d,r); }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMin<MT>(s);
        if (threadIdx.x == 0) { d = amrex::min(d,r); }
    }
#endif
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::max(); }
};
struct ReduceOpMax
{
#ifdef AMREX_USE_GPU
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMax(s, h);
        if (h.threadIdx() == 0) { d = amrex::max(d,r); }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMax<MT>(s);
        if (threadIdx.x == 0) { d = amrex::max(d,r); }
    }
#endif
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::lowest(); }
};
struct ReduceOpLogicalAnd
{
#ifdef AMREX_USE_GPU
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalAnd(s, h);
        if (h.threadIdx() == 0) { d = d && r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalAnd<MT>(s);
        if (threadIdx.x == 0) { d = d && r; }
    }
#endif
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d && s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = true; }
};
struct ReduceOpLogicalOr
{
#ifdef AMREX_USE_GPU
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalOr(s, h);
        if (h.threadIdx() == 0) { d = d || r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalOr<MT>(s);
        if (threadIdx.x == 0) { d = d || r; }
    }
#endif
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d || s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = false; }
};
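// Sketch (added for illustration, not in the original header): the informal
// interface the ReduceOp* structs above satisfy.  A user-defined operation
// plugged into ReduceOps would be expected to provide the same members; the
// name HypotheticalOp and its body are assumptions.
//
//   struct HypotheticalOp {
//       // fold a thread-local partial result s into d (host or device)
//       template <typename T>
//       AMREX_GPU_HOST_DEVICE void local_update (T& d, T const& s) const noexcept { d = d + s; }
//       // on GPU builds, also: combine across a thread block and commit from thread 0
//       // template <typename T> AMREX_GPU_DEVICE void parallel_update (T& d, T const& s, ...) const noexcept;
//       // set the identity element of the operation
//       template <typename T>
//       constexpr void init (T& t) const noexcept { t = 0; }
//   };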
template <typename... Ps> class ReduceOps;
#ifdef AMREX_USE_GPU

template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
          m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
          m_device_tuple((Type*)(The_Arena()->alloc(AMREX_GPU_MAX_STREAMS
                                                    * m_max_blocks * sizeof(Type)))),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        static_assert(std::is_trivially_copyable<Type>(),
                      "ReduceData::Type must be trivially copyable");
        static_assert(std::is_trivially_destructible<Type>(),
                      "ReduceData::Type must be trivially destructible");

        new (m_host_tuple) Type();
    }

    ~ReduceData () {
        m_host_tuple->~Type();
        The_Pinned_Arena()->free(m_host_tuple);
        The_Arena()->free(m_device_tuple);
    }

    Type value () { return m_fn_value(); }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

private:
    int m_max_blocks;
    int m_max_stream_index = 0;
    Type* m_host_tuple = nullptr;
    Type* m_device_tuple = nullptr;
    std::function<Type()> m_fn_value;
};
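// Typical usage sketch (added; it mirrors the pattern shown in the AMReX
// documentation rather than code from this header).  The Array4 "a" and the
// box "bx" are assumptions for illustration.
//
//   ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
//   ReduceData<Real, Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   reduce_op.eval(bx, reduce_data,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
//       {
//           return { a(i,j,k), a(i,j,k) };
//       });
//   ReduceTuple hv = reduce_data.value(reduce_op);
//   Real sum  = amrex::get<0>(hv);
//   Real vmax = amrex::get<1>(hv);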
namespace Reduce::detail {

    // f is callable with cell indices: forward the components of iv.
    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept
        -> decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    // f is callable with a BoxND: call it with a single-cell box of the given index type.
    template <typename F, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept
        -> decltype(f(BoxND<dim>(iv, iv, t)))
    {
        return f(BoxND<dim>(iv, iv, t));
    }

    template <typename F, typename T, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept
        -> decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
    {
        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }
    struct iterate_box {};
    struct iterate_box_comp {};

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box,I>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
    {
        auto const& pr = f(ibox,i,j,k);
        Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
    }

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box_comp,I>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
    {
        for (int n = 0; n < ncomp; ++n) {
            auto const& pr = f(ibox,i,j,k,n);
            Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
        }
    }
}
template <typename... Ps>
class ReduceOps
{
public:

    // Reduce over all cells of all local boxes of a FabArray-like container.
    template <typename I, typename MF, typename D, typename F>
    void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes > 0) {
            auto const& parforinfo = mf.getParForInfo(nghost);
            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
            const int nblocks = nblocks_per_box * nboxes;
            const BoxIndexer* dp_boxes = parforinfo.getBoxes();

            auto const& stream = Gpu::gpuStream();
            auto pdst = reduce_data.devicePtr(stream);
            int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
            reduce_data.nBlocks(stream) = nblocks_ec;
            reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
            // device reduce needs local (shared) memory
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                Dim1 blockIdx {gh.blockIdx()};
                Dim1 threadIdx{gh.threadIdx()};
#else
            amrex::launch_global<AMREX_GPU_MAX_THREADS>
                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
            ([=] AMREX_GPU_DEVICE () noexcept
            {
#endif
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple& dst = pdst[blockIdx.x];
                if (threadIdx.x == 0) {
                    dst = r;
                }
                for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                    int ibox = iblock / nblocks_per_box;
                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
                    BoxIndexer const& indexer = dp_boxes[ibox];
                    if (icell < indexer.numPts()) {
                        auto [i, j, k] = indexer(icell);
                        Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
                            (f, ibox, i, j, k, ncomp, r);
                    }
                }
#ifdef AMREX_USE_SYCL
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
            });
#else
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
            });
#endif
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, reduce_data,
                           [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
                           {
                               return f(li, i, j, k);
                           });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box{},
                    mf, nghost, 0, reduce_data, std::forward<F>(f));
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, ncomp, reduce_data,
                           [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) -> ReduceTuple
                           {
                               return f(li, i, j, k, n);
                           });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box_comp{},
                    mf, nghost, ncomp, reduce_data, std::forward<F>(f));
        }
    }
    template <typename D, typename F, int dim>
    void eval (BoxND<dim> const& box, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        const BoxIndexerND<dim> indexer(box);
        const auto ixtype = box.ixType();
        constexpr int nitems_per_thread = 4;
        Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local (shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 icell < indexer.numPts(); icell += stride)
            {
                auto iv = indexer.intVect(icell);
                auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
        });
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
        });
#endif
        nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
    }
    template <typename N, typename D, typename F, int dim,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (BoxND<dim> const& box, N ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        const BoxIndexerND<dim> indexer(box);
        constexpr int nitems_per_thread = 4;
        Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local (shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 icell < indexer.numPts(); icell += stride)
            {
                auto iv = indexer.intVect(icell);
                for (N n = 0; n < ncomp; ++n) {
                    auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
        });
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
        });
#endif
        nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
    }
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D& reduce_data, F const& f)
    {
        if (n <= 0) { return; }
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local (shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 i < n; i += stride)
            {
                auto pr = f(i);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
        });
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
        });
#endif
        nblocks = std::max(nblocks, nblocks_ec);
    }
    template <typename D>
    typename D::Type value (D& reduce_data)
    {
        auto hp = reduce_data.hostPtr();

        if (m_result_is_ready) {
            return *hp;
        }

        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr();
        auto const& nblocks = reduce_data.nBlocks();
#if defined(AMREX_USE_SYCL)
        if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
            const int N = nblocks[0];
            if (N == 0) {
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
            } else {
                Gpu::PinnedVector<ReduceTuple> tmp(N);
                Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
                Gpu::streamSynchronize();
                for (int i = 1; i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
                }
                *hp = tmp[0];
            }
        } else
#endif
        {
            int maxblocks = reduce_data.maxBlocks();
#ifdef AMREX_USE_SYCL
            // device reduce needs local (shared) memory
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            Gpu::DeviceVector<ReduceTuple> dtmp(1);
            auto presult = dtmp.data();
#else
            auto presult = hp;
#endif
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
                if (gh.threadIdx() == 0) { *presult = dst; }
            });
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
#endif
#else
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
            [=] AMREX_GPU_DEVICE () noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
                if (threadIdx.x == 0) { *hp = dst; }
            });
#endif
            Gpu::streamSynchronize();
        }

        m_result_is_ready = true;
        return *hp;
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
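// Usage sketch for the FabArray overloads above (added for illustration; the
// MultiFab "mf" and its MultiArray4 "ma" are assumptions).  Each local box
// contributes through the (box_no,i,j,k) callable, partial results are
// combined on the device, and value() copies the final tuple to the host.
//
//   ReduceOps<ReduceOpSum> reduce_op;
//   ReduceData<Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   auto const& ma = mf.const_arrays();
//   reduce_op.eval(mf, IntVect(0), reduce_data,
//       [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) -> ReduceTuple
//       {
//           return { ma[box_no](i,j,k) };
//       });
//   Real sum = amrex::get<0>(reduce_data.value(reduce_op));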
namespace Reduce {

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}
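// Usage sketch (added): the convenience wrappers reduce a plain index range
// without an explicit ReduceOps/ReduceData pair.  The device vector "v" and
// pointer "p" are assumptions for illustration.
//
//   Gpu::DeviceVector<Real> v(n);
//   Real const* p = v.data();
//   Real total  = Reduce::Sum(n, p);                                          // pointer overload
//   Real total2 = Reduce::Sum<Real>(n,
//       [=] AMREX_GPU_DEVICE (int i) -> Real { return 2.0*p[i]; });           // functor overload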
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        return {v[i],v[i]};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        auto tmp = f(i);
        return {tmp,tmp};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}
template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P const& pred)
{
    Gpu::DeviceScalar<int> ds(0);
    int* dp = ds.dataPtr();
    auto ec = Gpu::ExecutionConfig(n);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
    {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        gh.item->barrier(sycl::access::fence_space::local_space);

        if (!(*has_any))
        {
            int r = false;
            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) { *dp = 1; }
        }
    });
#endif
    return ds.dataValue();
}
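// Usage sketch (added): AnyOf short-circuits once a block has observed a
// match via the shared has_any flag above.  The data and predicate are
// assumptions for illustration.
//
//   Gpu::DeviceVector<Real> v(n);
//   Real const* p = v.data();
//   bool has_negative = Reduce::AnyOf(n, p,
//       [=] AMREX_GPU_DEVICE (Real x) { return x < 0.0; });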
template <typename P, int dim>
bool AnyOf (BoxND<dim> const& box, P const& pred)
{
    Gpu::DeviceScalar<int> ds(0);
    int* dp = ds.dataPtr();
    auto ec = Gpu::ExecutionConfig(box.numPts());
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
    const BoxIndexerND<dim> indexer(box);

#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
    {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        gh.item->barrier(sycl::access::fence_space::local_space);

        if (!(*has_any))
        {
            int r = false;
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
                 icell < indexer.numPts() && !r; icell += stride)
            {
                auto iv = indexer.intVect(icell);
                r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                     stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 icell < indexer.numPts() && !r; icell += stride)
            {
                auto iv = indexer.intVect(icell);
                r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) { *dp = 1; }
        }
    });
#endif
    return ds.dataValue();
}

}
#else

template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        for (auto& t : m_tuple) {
            Reduce::detail::for_each_init<0, Type, Ps...>(t);
        }
    }

    ~ReduceData () = default;
    ReduceData (ReduceData<Ts...> const&) = delete;
    ReduceData (ReduceData<Ts...> &&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    Type value () { return m_fn_value(); }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    Vector<Type>& reference () { return m_tuple; }

    Type& reference (int tid)
    {
        if (m_tuple.size() == 1) {
            // No OpenMP, or already inside a parallel region when this was constructed
            return m_tuple[0];
        } else {
            return m_tuple[tid];
        }
    }

private:
    Vector<Type> m_tuple;
    std::function<Type()> m_fn_value;
};
namespace Reduce::detail {

    template <typename F, int dim>
    AMREX_FORCE_INLINE
    auto call_f_intvect (F const& f, IntVectND<dim> iv) noexcept
        -> decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
    }

    template <typename F, typename T, int dim>
    AMREX_FORCE_INLINE
    auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept
        -> decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
    {
        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
    }
}
template <typename... Ps>
class ReduceOps
{
private:

    // f is callable with cell indices: accumulate over every cell of the box.
    template <typename D, typename F, int dim>
    AMREX_FORCE_INLINE
    static auto call_f_box (BoxND<dim> const& box, typename D::Type& r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(
                        Reduce::detail::call_f_intvect(f, IntVectND<dim>(0)))>,
                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        for (auto iv : box.iterator()) {
            auto pr = Reduce::detail::call_f_intvect(f, iv);
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
        }
    }

    // f is callable with the whole box: accumulate its single result.
    template <typename D, typename F, int dim>
    AMREX_FORCE_INLINE
    static auto call_f_box (BoxND<dim> const& box, typename D::Type& r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(box))>,
                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
    }

public:
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        {
            ReduceTuple rr;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                const auto lo = amrex::lbound(b);
                const auto hi = amrex::ubound(b);
                for (int k = lo.z; k <= hi.z; ++k) {
                for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
                }}}
            }
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
                reduce_data.reference(OpenMP::get_thread_num()), rr);
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        {
            ReduceTuple rr;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                const auto lo = amrex::lbound(b);
                const auto hi = amrex::ubound(b);
                for (int n = 0; n < ncomp; ++n) {
                for (int k = lo.z; k <= hi.z; ++k) {
                for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
                }}}}
            }
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
                reduce_data.reference(OpenMP::get_thread_num()), rr);
        }
    }
    template <typename D, typename F, int dim>
    void eval (BoxND<dim> const& box, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        call_f_box<D>(box, rr, std::forward<F>(f));
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }
    template <typename N, typename D, typename F, int dim,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (BoxND<dim> const& box, N ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        for (N n = 0; n < ncomp; ++n) {
            for (auto iv : box.iterator()) {
                auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
            }
        }
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        for (N i = 0; i < n; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
        }
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }
    template <typename D>
    typename D::Type value (D& reduce_data)
    {
        auto& rrv = reduce_data.reference();
        if (! m_result_is_ready) {
            using ReduceTuple = typename D::Type;
            if (rrv.size() > 1) {
                for (int i = 1, N = rrv.size(); i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
                }
            }
            m_result_is_ready = true;
        }
        return rrv[0];
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
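// Host-build usage sketch (added): the same ReduceOps/ReduceData calls shown
// for the GPU build work here; each OpenMP thread accumulates into its own
// tuple (reduce_data.reference(tid)) and value() folds them together.  The
// pointer "p" and the count "n" are assumptions for illustration.
//
//   ReduceOps<ReduceOpMin> reduce_op;
//   ReduceData<Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   reduce_op.eval(n, reduce_data, [=] (int i) -> ReduceTuple { return p[i]; });
//   Real vmin = amrex::get<0>(reduce_data.value(reduce_op));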
namespace Reduce {

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(+:r)
#endif
    for (N i = 0; i < n; ++i) {
        r += f(i);
    }
    return r;
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::min(r,f(i));
    }
    return r;
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(max:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::max(r,f(i));
    }
    return r;
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    T r_min = std::numeric_limits<T>::max();
    T r_max = std::numeric_limits<T>::lowest();
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
#endif
    for (N i = 0; i < n; ++i) {
        T tmp = f(i);
        r_min = std::min(r_min,tmp);
        r_max = std::max(r_max,tmp);
    }
    return std::make_pair(r_min,r_max);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
}
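// Usage sketch (added): the host wrappers use OpenMP reduction clauses when
// available and fall back to a serial loop otherwise.  The vector "v" is an
// assumption for illustration.
//
//   std::vector<Real> v(n);
//   Real const* p = v.data();
//   std::pair<Real,Real> mm = Reduce::MinMax(n, p);
//   // mm.first is the minimum, mm.second the maximum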
template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P&& pred)
{
    return std::any_of(v, v+n, std::forward<P>(pred));
}
template <typename P, int dim>
bool AnyOf (BoxND<dim> const& box, P const& pred)
{
    for (auto iv : box.iterator()) {
        if (Reduce::detail::call_f_intvect(pred, iv)) { return true; }
    }
    return false;
}

}

#endif
//! Return a GpuTuple containing the identity element for each operation in
//! ReduceOps, e.g. 0, lowest and max for ReduceOpSum, ReduceOpMax and
//! ReduceOpMin, respectively.
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}

//! Return a GpuTuple containing the identity element for each reduce
//! operation in the TypeList.
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}
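// Usage sketch (added): IdentityTuple builds the neutral starting tuple for a
// given set of operations, which is convenient when accumulating partial
// results by hand before handing them to a reduction.
//
//   auto r = IdentityTuple(GpuTuple<Real,Real>{}, ReduceOps<ReduceOpSum,ReduceOpMin>{});
//   // r == {0, std::numeric_limits<Real>::max()}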
namespace Reduce {

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T* v, T init_val = 0)
{
    return Sum(n, (T const*)v, init_val);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T* v, T init_val = std::numeric_limits<T>::max())
{
    return Min(n, (T const*)v, init_val);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T* v, T init_val = std::numeric_limits<T>::lowest())
{
    return Max(n, (T const*)v, init_val);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T* v)
{
    return MinMax(n, (T const*)v);
}

template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T* v, P const& pred)
{
    return AnyOf(n, (T const*)v, pred);
}

}

}

#endif