#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

// Headers providing the macros, functors, handler type, and atomics used below.
#include <AMReX_Extension.H>
#include <AMReX_Functional.H>
#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuAtomic.H>

#include <algorithm>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif
//
// Public interface: device-wide reductions into *dest; definitions follow per backend.
//

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
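//
// Usage sketch: the interface above is intended to be called from a device
// lambda that receives a Gpu::Handler, e.g. a ParallelFor launched with
// Gpu::KernelInfo().setReduction(true).  Here `dp` (a device pointer
// initialized to zero) and `fab` (an Array4) are placeholders for this
// example only:
//
//     amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), box,
//     [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
//     {
//         Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
//     });
//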
#if defined(AMREX_USE_SYCL)

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
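// Note: this is a standard shuffle-based tree reduction.  Each pass combines
// a lane's value with the value held `offset` lanes above it, so after
// log2(warpSize) passes lane 0 holds the reduction of the whole sub-group.
// For example, with warpSize = 8 and F = amrex::Plus<int>, lanes holding
// {1,1,1,1,1,1,1,1} leave lane 0 with 8 after the offsets 4, 2 and 1.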
template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // The barrier before writing to shared memory is needed because this
    // reduction may be called more than once in the same kernel.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}
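// Note: blockReduce is a two-stage reduction.  Each sub-group (warp) is first
// reduced with warp_reduce, and lane 0 of each warp stores its partial result
// in the shared-memory scratch space provided by h.local.  The first warp
// then reduces those partials, seeding unused lanes with the identity value
// x0, so the final result ends up on thread 0 of the work-group.  This
// assumes at most warpSize partial results, i.e. a work-group of at most
// warpSize*warpSize threads.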
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // fully active warp
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
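// Note: blockReduce_partial handles blocks in which only the first
// handler.numActiveThreads threads carry valid data.  Fully active warps are
// reduced with warp_reduce and issue a single atomic per warp; threads of the
// trailing, partially active warp apply the atomic operation individually.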
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}
#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}
// Shuffle a value wider than one register between lanes by moving it
// one 32-bit word at a time.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i], offset);
    }
    return y;
}
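// Usage sketch: multi_shuffle_down lets warpReduce handle value types that
// are not arithmetic, as long as sizeof(T) is a multiple of
// sizeof(unsigned int).  A hypothetical two-component accumulator,
//
//     struct Sum2 { float a; float b; };   // illustration only, not in AMReX
//
// would be moved between lanes one 32-bit word at a time, so no specialized
// shuffle intrinsic is needed for the struct itself.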
template <int warpSize, typename T, typename F>
struct warpReduce
{
    // For arithmetic types, use the shuffle intrinsic directly.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = Gpu::shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }

    // For non-arithmetic types, shuffle the value word by word.
    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = Gpu::multi_shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // The __syncthreads before writing to shared memory is needed because
    // this reduction may be called more than once in the same kernel.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
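// Note: this is the same two-stage scheme as the SYCL blockReduce above, with
// __shared__ T shared[warpSize] providing one slot per warp.  Since shared[]
// has warpSize slots, the scheme assumes blockDim.x <= warpSize*warpSize
// (1024 threads for a warp size of 32).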
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // fully active warp
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
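// Note: as in the SYCL version, fully active warps reduce locally and issue
// one atomic per warp, while threads of the trailing, partially active warp
// fall back to one atomic per thread.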
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // The __syncthreads before using temp_storage is needed because this
    // reduction may be called more than once in the same kernel.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}
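// Note: when the block size is known at compile time (BLOCKDIMX), the
// reduction is delegated to cub::BlockReduce or rocprim::block_reduce, which
// manage their own shared-memory temp storage.  With rocprim the reduced
// value is guaranteed to be valid only on thread 0, which is sufficient here
// because only thread 0 consumes the aggregate in deviceReduce*_full below.
// Without CUB or rocPRIM the generic shuffle-based overload is used instead.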
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
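// Note: blocks of 128 and 256 threads are the common AMReX launch
// configurations, so they get the compile-time BLOCKDIMX specializations;
// any other block size falls back to the generic shuffle-based path, and
// partially active blocks go through blockReduce_partial with atomics.  The
// Min/Max/LogicalAnd/LogicalOr dispatchers below follow the same pattern.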
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}
template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemin)
#endif
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemax)
#endif
    *dest = std::max(*dest, source);
}
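// Note: on the host (no GPU backend) the "full" reductions reduce to a single
// update of *dest.  The named omp critical sections keep the update safe when
// the caller runs inside an OpenMP parallel region, while letting different
// reduction types proceed independently.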
AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceand)
#endif
    *dest = (*dest) && source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}
AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceor)
#endif
    *dest = (*dest) || source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}
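// Note: the host versions keep the same (dest, source, Gpu::Handler const&)
// call signature as the device versions, so kernel code written against the
// public interface at the top of this header also compiles for CPU-only
// builds.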