#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;
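// Usage sketch (illustrative, not part of the interface above): these device-side
// reductions are meant to be called from inside a kernel that was launched with
// reduction support enabled, so that a Gpu::Handler describing the active threads
// is available. Assuming a device pointer `dsum` initialized to zero, a call might
// look like the following (the ParallelFor overload taking Gpu::KernelInfo and a
// Gpu::Handler lambda argument is assumed here):
//
//   amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), n,
//       [=] AMREX_GPU_DEVICE (int i, Gpu::Handler const& handler) noexcept
//       {
//           Gpu::deviceReduceSum(dsum, a[i], handler);
//       });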
#if defined(AMREX_USE_SYCL)

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
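// Note on blockReduce below: each sub-group (warp) first reduces its own values
// with warp_reduce, lane 0 of every warp then writes its partial result into the
// local memory carried by the Gpu::Handler, and warp 0 finally reduces those
// partials. x0 is the padding value used by lanes of warp 0 that do not hold a
// valid partial result, so it must not perturb the reduction (e.g. 0 for sums,
// the thread's own value for min/max).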
template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // Barrier before writing to local memory, in case this reduction is called
    // more than once in the same kernel and the buffer is being reused.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}
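// blockReduce_partial below handles blocks in which only some threads are active
// (handler.numActiveThreads < block size): warps that are entirely active are
// reduced with shuffles and issue one atomic per warp, while threads of the
// trailing, partially active warp fall back to one atomic each.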
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // full warp
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
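// The wrappers below all follow the same pattern: blockReduce with the matching
// binary functor (amrex::Plus, Minimum, Maximum, LogicalAnd, LogicalOr) produces
// one aggregate per block, and thread 0 of the block folds that aggregate into
// *dest with the corresponding Gpu::Atomic operation.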
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceSum(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMin(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMax(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}
#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
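// The CUDA/HIP implementation mirrors the SYCL one above: warp-level shuffles
// (__shfl_down_sync / __shfl_down) feed a block-level reduction, with optional
// specializations that use CUB or rocPRIM block primitives when the block size
// is known at compile time.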
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i], offset);
    }
    return y;
}
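// The warpReduce functor below provides two operator() overloads selected by
// SFINAE: arithmetic types are shuffled directly with shuffle_down, while other
// types whose size is a multiple of sizeof(unsigned int) go through
// multi_shuffle_down, which moves them between lanes one word at a time.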
template <int warpSize, typename T, typename F>
struct warpReduce
{
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }

    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = multi_shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // Synchronize before reusing the shared buffer, in case this reduction is
    // called more than once in the same kernel.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
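// Note on blockReduce above: the shared array holds one partial result per warp,
// and warpSize entries are sufficient because a block can contain at most
// warpSize warps (blockDim.x <= warpSize*warpSize, i.e. 1024 threads for a warp
// size of 32). blockReduce_partial below is the counterpart for blocks that are
// only partially active, as in the SYCL branch.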
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // full warp
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
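// Each reduction below comes in two flavors: a generic version that only assumes
// the warp size (Gpu::Device::warp_size) and reduces via Gpu::blockReduce, and a
// BLOCKDIMX-templated version that can hand the work to cub::BlockReduce or
// rocprim::block_reduce when the block size is a compile-time constant. The
// deviceReduce*_full variants combine the per-block aggregate into *dest with a
// single atomic issued by thread 0.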
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // Synchronize in case temp_storage is reused by an earlier reduction in the same kernel.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
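// The deviceReduce* entry points below dispatch on blockDim.x: fully active
// blocks of 128 or 256 threads, presumably the launch sizes most commonly used
// by AMReX kernels, take the compile-time BLOCKDIMX path; any other full block
// takes the generic path; and partially active blocks fall back to
// blockReduce_partial with the matching atomic functor.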
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}

#else
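// Host fallback: without device support, each "reduction" is just an update of
// *dest, serialized with a named OpenMP critical section when threading is
// enabled. The Gpu::Handler argument is accepted but unused.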
template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducesum)
#endif
    *dest += source;
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceSum_full(dest, source);
}
template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemin)
#endif
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMin_full(dest, source);
}
template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemax)
#endif
    *dest = std::max(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMax_full(dest, source);
}
AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceand)
#endif
    *dest = (*dest) && source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}
AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceor)
#endif
    *dest = (*dest) || source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}

#endif

#endif