#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#if defined(CCCL_MAJOR_VERSION) && (CCCL_MAJOR_VERSION >= 3)
#define AMREX_CUDA_CCCL_VER_GE_3 1
#endif
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T* dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T* dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T* dest, T source, Gpu::Handler const& h) noexcept;
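// --- Illustrative usage (hedged sketch, not part of this header) ---
// deviceReduceSum/Min/Max are meant to be called from device code that has a
// Gpu::Handler, e.g. a ParallelFor launched with a Gpu::KernelInfo marked as a
// reduction. The names below (sum_on_device, a, dsum, dp) are hypothetical.
//
//   amrex::Real sum_on_device (amrex::Box const& bx,
//                              amrex::Array4<amrex::Real const> const& a)
//   {
//       amrex::Gpu::DeviceScalar<amrex::Real> dsum(0.0);
//       amrex::Real* dp = dsum.dataPtr();
//       amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), bx,
//           [=] AMREX_GPU_DEVICE (int i, int j, int k,
//                                 amrex::Gpu::Handler const& h) noexcept
//           {
//               amrex::Gpu::deviceReduceSum(dp, a(i,j,k), h);
//           });
//       return dsum.dataValue();
//   }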
#if defined(AMREX_USE_SYCL)

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x, y);
        }
        return x;
    }
};
template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid  = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // Barrier before writing to shared memory: this function may be called
    // more than once in a kernel, so the buffer must not be overwritten while
    // a previous call could still be reading it.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // this warp is fully active
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x); // ragged warp: one atomic per active thread
    }
}
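//
// blockReduce_partial handles the case where only the first
// handler.numActiveThreads threads of the block carry valid data (e.g. the
// tail block of a launch). Warps that lie entirely inside the active range
// are reduced with shuffles and issue one atomic per warp; threads in the
// partially active warp fall back to one atomic each, so no block-wide
// barrier is needed.
//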
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int* dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int* dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T* dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T* dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T* dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int* dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int* dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}
#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}
// For types wider than a single register, shuffle the object word by word.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i], offset);
    }
    return y;
}
template <int warpSize, typename T, typename F>
struct warpReduce
{
    // Overload for arithmetic T: values move directly through warp shuffles.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept;

    // Overload for non-arithmetic T: values move through multi_shuffle_down.
    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept;
};
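// A minimal sketch of what each operator() above does (an assumption based on
// the standard shuffle-based warp reduction; the actual bodies are elided in
// this listing):
//
//   for (int offset = warpSize/2; offset > 0; offset /= 2) {
//       T y = shuffle_down(x, offset);   // multi_shuffle_down in the
//       x = F()(x, y);                   // non-arithmetic overload
//   }
//   return x;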
template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid  = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // Sync before writing to shared memory in case the buffer is still being
    // read by a previous call to this function in the same kernel.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // this warp is fully active
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x); // ragged warp: one atomic per active thread
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // Sync before reusing the temp storage in case this is called more than
    // once in a kernel.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}
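//
// Note: with both cub::BlockReduce and rocprim::block_reduce the reduced value
// is only guaranteed to be valid on thread 0 of the block, which is why the
// deviceReduce*_full wrappers below apply the atomic update from
// threadIdx.x == 0 only.
//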
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T* dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T* dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T* dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int* dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int* dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int* dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int* dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T* dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
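//
// Note on the dispatch above: full blocks route to the cub/rocprim-based
// *_full specializations when blockDim.x is the compile-time-known 128 or 256,
// and otherwise to the generic shuffle-based implementation; partially active
// blocks (e.g. the tail of a launch) go through blockReduce_partial, which
// needs no block-wide synchronization and finishes with an atomic per warp
// (or per thread in the ragged warp).
//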
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T* dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T* dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int* dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int* dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}
#else   // host build (neither SYCL nor CUDA/HIP)

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin_full (T* dest, T source) noexcept
{
#pragma omp critical (gpureduce_reducemin)
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax_full (T* dest, T source) noexcept
{
#pragma omp critical (gpureduce_reducemax)
    *dest = std::max(*dest, source);
}
AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int* dest, int source) noexcept
{
#pragma omp critical (gpureduce_reduceand)
    *dest = (*dest) && source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int* dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}
AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int* dest, int source) noexcept
{
#pragma omp critical (gpureduce_reduceor)
    *dest = (*dest) || source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int* dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}
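// --- Hedged sketch (assumption, not shown in this listing) ---
// The host fallbacks for deviceReduceSum presumably follow the same pattern as
// the Min/Max/LogicalAnd/LogicalOr versions above: an OpenMP critical section
// guarding a plain update of *dest. The critical-section name below is a guess
// by analogy with the ones above.
//
//   template <typename T>
//   AMREX_FORCE_INLINE
//   void deviceReduceSum_full (T* dest, T source) noexcept
//   {
//   #pragma omp critical (gpureduce_reducesum)
//       { *dest += source; }
//   }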