#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#if defined(CCCL_MAJOR_VERSION) && (CCCL_MAJOR_VERSION >= 3)
#define AMREX_CUDA_CCCL_VER_GE_3 1
#endif
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif
template <typename T>
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

template <typename T>
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;
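// Usage sketch (not part of this header): these handler-based overloads are meant to be
// called from device lambdas that receive a Gpu::Handler, e.g. via ParallelFor with
// Gpu::KernelInfo().setReduction(true). The box `bx`, array `fab`, and device pointer
// `dsum` below are hypothetical names for illustration; `dsum` is assumed to point to
// zero-initialized device memory.
//
//   amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), bx,
//   [=] AMREX_GPU_DEVICE (int i, int j, int k, amrex::Gpu::Handler const& handler)
//   {
//       amrex::Gpu::deviceReduceSum(dsum, fab(i,j,k), handler);
//   });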
template <int warpSize, typename T, typename F>
struct warpReduce
{
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x, y);
        }
        return x;
    }
};
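// The halving-offset loop above is a standard tree reduction within a sub-group. For
// example, with warpSize = 8 and lane values {1,2,3,4,5,6,7,8}, the offsets 4, 2, 1
// combine lane i with lane i+offset, so after three steps lane 0 holds 1+2+...+8 = 36
// (when F is a plus functor); the other lanes hold partial results that callers ignore.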
template <int warpSize, typename T, typename WARPREDUCE>
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}
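// The block reduction above is the usual two-stage scheme: (1) each sub-group (warp)
// reduces its own values with shuffles, (2) lane 0 of every warp publishes its partial
// result to local/shared memory, and (3) the first warp reads those partials back
// (threads beyond `numwarps` substitute the identity value x0) and reduces them once
// more, so thread 0 ends up with the block-wide result. For example, a 256-thread
// work-group with 32-lane sub-groups produces 8 partials, which fit in a single warp.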
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // this warp is fully active
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
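// For partially filled blocks (e.g. the last block of a 1D launch), only the warps that
// lie entirely below handler.numActiveThreads can safely do a shuffle reduction; each of
// those issues a single atomic per warp. Threads in the straggler warp fall back to one
// atomic per thread, which keeps the result correct at a small extra cost.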
// (Fragments of the SYCL deviceReduce*_full helpers, which forward to Gpu::blockReduce:)
    return Gpu::blockReduce<Gpu::Device::warp_size>
    return Gpu::blockReduce<Gpu::Device::warp_size>
    return Gpu::blockReduce<Gpu::Device::warp_size>
    return Gpu::blockReduce<Gpu::Device::warp_size>
    return Gpu::blockReduce<Gpu::Device::warp_size>
template <typename T>
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T>>(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T>>(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T>>(),
             Gpu::AtomicMax<T>(), h);
    }
}
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int>>(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int>>(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}
#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

// (declared in namespace detail; used as detail::shuffle_down below)
template <typename T>
T shuffle_down (T x, int offset) noexcept
{
    AMREX_HIP_OR_CUDA(return __shfl_down(x, offset);,
                      return __shfl_down_sync(0xffffffff, x, offset));
}
// Types larger than one word are shuffled word by word.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i], offset);
    }
    return y;
}
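// Sketch of why the word-wise copy works (illustrative, not from this header): a POD
// type such as the hypothetical `struct MinLoc { float val; int idx; };` is 8 bytes,
// i.e. nwords == 2, so the two 32-bit halves of the neighbouring lane's value are
// fetched with two shuffle_down calls and reassembled into `y`. Any trivially copyable
// type whose size is a multiple of sizeof(unsigned int) can be moved this way, which is
// what lets warpReduce below accept non-arithmetic value types.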
template <int warpSize, typename T, typename F>
struct warpReduce
{
    // Arithmetic types can be shuffled directly.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value, int> = 0>
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::shuffle_down(x, offset);
            x = F()(x, y);
        }
        return x;
    }

    // Non-arithmetic (but trivially copyable) types go word by word.
    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value, int> = 0>
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::multi_shuffle_down(x, offset);
            x = F()(x, y);
        }
        return x;
    }
};
template <int warpSize, typename T, typename WARPREDUCE>
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // this warp is fully active
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
// (generic blockReduceSum forwards to Gpu::blockReduce:)
    return Gpu::blockReduce<Gpu::Device::warp_size>
template <int BLOCKDIMX, typename T>
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads(); // temp_storage may be reused by successive reductions
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads(); // temp_storage may be reused by successive reductions
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#endif
}
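// Hypothetical usage sketch (not from this header): inside a kernel launched with
// exactly 256 threads per block, every thread contributes one value and thread 0
// receives the block total, which it can then publish with an atomic:
//
//   __global__ void sum_kernel (amrex::Real const* in, amrex::Real* out) {
//       amrex::Real v = in[blockIdx.x*256 + threadIdx.x];
//       amrex::Real total = amrex::Gpu::blockReduceSum<256>(v);
//       if (threadIdx.x == 0) { amrex::Gpu::Atomic::AddNoRet(out, total); }
//   }
//
// Note that with cub::BlockReduce only thread 0 holds the valid aggregate.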
template <int BLOCKDIMX, typename T>
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// (generic blockReduceMin forwards to Gpu::blockReduce:)
    return Gpu::blockReduce<Gpu::Device::warp_size>
template <int BLOCKDIMX, typename T>
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#endif
}
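// The AMREX_CUDA_CCCL_VER_GE_3 branches above and in blockReduceMax below are presumably
// there because newer CCCL (major version 3) deprecates the old cub::Min()/cub::Max()
// functors in favor of cuda::minimum<T>/cuda::maximum<T>, while older toolkits still
// provide only the cub functors.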
template <int BLOCKDIMX, typename T>
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// (generic blockReduceMax forwards to Gpu::blockReduce:)
    return Gpu::blockReduce<Gpu::Device::warp_size>
template <int BLOCKDIMX, typename T>
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#endif
}
template <int BLOCKDIMX, typename T>
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// (generic blockReduceLogicalAnd forwards to Gpu::blockReduce:)
    return Gpu::blockReduce<Gpu::Device::warp_size>
template <int BLOCKDIMX>
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
template <int BLOCKDIMX>
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// (generic blockReduceLogicalOr forwards to Gpu::blockReduce:)
    return Gpu::blockReduce<Gpu::Device::warp_size>
template <int BLOCKDIMX>
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
template <int BLOCKDIMX>
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}
template <typename T>
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T>>(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
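// Why the explicit 128/256 dispatch above: cub::BlockReduce and rocprim::block_reduce
// need the block size as a compile-time template parameter, so these two common launch
// sizes get dedicated instantiations, and any other blockDim.x falls back to the generic
// warp-shuffle deviceReduceSum_full<T>. The same pattern is repeated for Min, Max,
// LogicalAnd, and LogicalOr below.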
template <typename T>
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T>>(),
             Gpu::AtomicMin<T>(), handler);
    }
}
template <typename T>
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T>>(),
             Gpu::AtomicMax<T>(), handler);
    }
}
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int>>(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int>>(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}
#pragma omp critical (gpureduce_reducemin)
    *dest = std::min(*dest, source);

#pragma omp critical (gpureduce_reducemax)
    *dest = std::max(*dest, source);

#pragma omp critical (gpureduce_reduceand)
    *dest = (*dest) && source;

#pragma omp critical (gpureduce_reduceor)
    *dest = (*dest) || source;
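// A minimal sketch of the host fallback pattern suggested by the fragments above
// (assumed for illustration, not copied verbatim from this header): without GPU support,
// each deviceReduce* call simply updates *dest inside a named OpenMP critical section.
//
//   template <typename T>
//   AMREX_FORCE_INLINE
//   void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
//   {
//   #ifdef AMREX_USE_OMP
//   #pragma omp critical (gpureduce_reducesum)
//   #endif
//       *dest += source;
//   }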