1#ifndef AMREX_GPU_REDUCE_H_
2#define AMREX_GPU_REDUCE_H_
3#include <AMReX_Config.H>
13#if defined(AMREX_USE_CUDA)
15#if defined(CCCL_MAJOR_VERSION) && defined(CCCL_MINOR_VERSION) && ((CCCL_MAJOR_VERSION >= 3) || ((CCCL_MAJOR_VERSION >= 2) && (CCCL_MINOR_VERSION >= 8)))
16#define AMREX_CUDA_CCCL_VER_GE_2_8 1
18#elif defined(AMREX_USE_HIP)
19#include <rocprim/rocprim.hpp>
28 void deviceReduceSum (T * dest, T source, Gpu::Handler
const& h)
noexcept;
32 void deviceReduceMin (T * dest, T source, Gpu::Handler
const& h)
noexcept;
36 void deviceReduceMax (T * dest, T source, Gpu::Handler
const& h)
noexcept;
53template <
int warpSize,
typename T,
typename F>
57 T
operator() (T
x, sycl::sub_group
const& sg)
const noexcept
60 T
y = sycl::shift_group_left(sg,
x,
offset);
67template <
int warpSize,
typename T,
typename WARPREDUCE>
69T
blockReduce (T
x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler
const& h)
71 T* shared = (T*)h.local;
72 int tid = h.item->get_local_id(0);
73 sycl::sub_group
const& sg = h.item->get_sub_group();
74 int lane = sg.get_local_id()[0];
75 int wid = sg.get_group_id()[0];
76 int numwarps = sg.get_group_range()[0];
77 x = warp_reduce(
x, sg);
82 h.item->barrier(sycl::access::fence_space::local_space);
83 if (lane == 0) { shared[wid] =
x; }
84 h.item->barrier(sycl::access::fence_space::local_space);
85 bool b = (tid == 0) || (tid < numwarps);
86 x = b ? shared[lane] : x0;
87 if (wid == 0) {
x = warp_reduce(
x, sg); }
91template <
int warpSize,
typename T,
typename WARPREDUCE,
typename ATOMICOP>
94 Gpu::Handler
const& handler)
96 sycl::sub_group
const& sg = handler.item->get_sub_group();
97 int wid = sg.get_group_id()[0];
98 if ((wid+1)*warpSize <= handler.numActiveThreads) {
99 x = warp_reduce(
x, sg);
100 if (sg.get_local_id()[0] == 0) { atomic_op(dest,
x); }
111 return Gpu::blockReduce<Gpu::Device::warp_size>
128 return Gpu::blockReduce<Gpu::Device::warp_size>
145 return Gpu::blockReduce<Gpu::Device::warp_size>
161 return Gpu::blockReduce<Gpu::Device::warp_size>
176 return Gpu::blockReduce<Gpu::Device::warp_size>
189void deviceReduceSum (T * dest, T source, Gpu::Handler
const& h)
noexcept
191 if (h.isFullBlock()) {
194 Gpu::blockReduce_partial<Gpu::Device::warp_size>
196 Gpu::AtomicAdd<T>(), h);
202void deviceReduceMin (T * dest, T source, Gpu::Handler
const& h)
noexcept
204 if (h.isFullBlock()) {
207 Gpu::blockReduce_partial<Gpu::Device::warp_size>
209 Gpu::AtomicMin<T>(), h);
215void deviceReduceMax (T * dest, T source, Gpu::Handler
const& h)
noexcept
217 if (h.isFullBlock()) {
220 Gpu::blockReduce_partial<Gpu::Device::warp_size>
222 Gpu::AtomicMax<T>(), h);
229 if (h.isFullBlock()) {
232 Gpu::blockReduce_partial<Gpu::Device::warp_size>
234 Gpu::AtomicLogicalAnd<int>(), h);
241 if (h.isFullBlock()) {
244 Gpu::blockReduce_partial<Gpu::Device::warp_size>
246 Gpu::AtomicLogicalOr<int>(), h);
250#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
257T shuffle_down (T x,
int offset)
noexcept
260 __shfl_down_sync(0xffffffff, x,
offset));
265requires (
sizeof(T)%
sizeof(
unsigned int) == 0)
267T multi_shuffle_down (T x,
int offset)
noexcept
269 constexpr int nwords = (
sizeof(T) +
sizeof(
unsigned int) - 1) /
sizeof(
unsigned int);
271 auto py =
reinterpret_cast<unsigned int*
>(&
y);
272 auto px =
reinterpret_cast<unsigned int*
>(&
x);
273 for (
int i = 0; i < nwords; ++i) {
274 py[i] = shuffle_down(px[i],
offset);
282template <
int warpSize,
typename T,
typename F>
288 requires (std::is_arithmetic_v<T>)
291 T
y = detail::shuffle_down(
x,
offset);
299 requires (!std::is_arithmetic_v<T>)
302 T
y = detail::multi_shuffle_down(
x,
offset);
309template <
int warpSize,
typename T,
typename WARPREDUCE>
313 __shared__ T shared[warpSize];
314 int lane = threadIdx.x % warpSize;
315 int wid = threadIdx.x / warpSize;
322 if (lane == 0) { shared[wid] =
x; }
324 bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
325 x = b ? shared[lane] : x0;
326 if (wid == 0) {
x = warp_reduce(
x); }
330template <
int warpSize,
typename T,
typename WARPREDUCE,
typename ATOMICOP>
335 int warp = (
int)threadIdx.x / warpSize;
338 if (threadIdx.x % warpSize == 0) { atomic_op(dest,
x); }
349 return Gpu::blockReduce<Gpu::Device::warp_size>
362template <
int BLOCKDIMX,
typename T>
366#if defined(AMREX_USE_CUDA)
367 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
368 __shared__
typename BlockReduce::TempStorage temp_storage;
374 return BlockReduce(temp_storage).Sum(source);
375#elif defined(AMREX_USE_HIP)
376 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
377 __shared__
typename BlockReduce::storage_type temp_storage;
379 BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
386template <
int BLOCKDIMX,
typename T>
390 T aggregate = blockReduceSum<BLOCKDIMX>(source);
399 return Gpu::blockReduce<Gpu::Device::warp_size>
412template <
int BLOCKDIMX,
typename T>
416#if defined(AMREX_USE_CUDA)
417 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
418 __shared__
typename BlockReduce::TempStorage temp_storage;
425#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
426 return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
428 return BlockReduce(temp_storage).Reduce(source, cub::Min());
430#elif defined(AMREX_USE_HIP)
431 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
432 __shared__
typename BlockReduce::storage_type temp_storage;
434 BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
441template <
int BLOCKDIMX,
typename T>
445 T aggregate = blockReduceMin<BLOCKDIMX>(source);
454 return Gpu::blockReduce<Gpu::Device::warp_size>
467template <
int BLOCKDIMX,
typename T>
471#if defined(AMREX_USE_CUDA)
472 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
473 __shared__
typename BlockReduce::TempStorage temp_storage;
479#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
480 return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
482 return BlockReduce(temp_storage).Reduce(source, cub::Max());
484#elif defined(AMREX_USE_HIP)
485 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
486 __shared__
typename BlockReduce::storage_type temp_storage;
488 BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
495template <
int BLOCKDIMX,
typename T>
499 T aggregate = blockReduceMax<BLOCKDIMX>(source);
507 return Gpu::blockReduce<Gpu::Device::warp_size>
519template <
int BLOCKDIMX>
523#if defined(AMREX_USE_CUDA)
524 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
525 __shared__
typename BlockReduce::TempStorage temp_storage;
532#elif defined(AMREX_USE_HIP)
533 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
534 __shared__
typename BlockReduce::storage_type temp_storage;
543template <
int BLOCKDIMX>
547 int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
555 return Gpu::blockReduce<Gpu::Device::warp_size>
567template <
int BLOCKDIMX>
571#if defined(AMREX_USE_CUDA)
572 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
573 __shared__
typename BlockReduce::TempStorage temp_storage;
580#elif defined(AMREX_USE_HIP)
581 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
582 __shared__
typename BlockReduce::storage_type temp_storage;
591template <
int BLOCKDIMX>
595 int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
603 if (handler.isFullBlock()) {
604 if (blockDim.x == 128) {
605 deviceReduceSum_full<128,T>(dest, source);
606 }
else if (blockDim.x == 256) {
607 deviceReduceSum_full<256,T>(dest, source);
609 deviceReduceSum_full<T>(dest, source);
612 Gpu::blockReduce_partial<Gpu::Device::warp_size>
622 if (handler.isFullBlock()) {
623 if (blockDim.x == 128) {
624 deviceReduceMin_full<128,T>(dest, source);
625 }
else if (blockDim.x == 256) {
626 deviceReduceMin_full<256,T>(dest, source);
628 deviceReduceMin_full<T>(dest, source);
631 Gpu::blockReduce_partial<Gpu::Device::warp_size>
641 if (handler.isFullBlock()) {
642 if (blockDim.x == 128) {
643 deviceReduceMax_full<128,T>(dest, source);
644 }
else if (blockDim.x == 256) {
645 deviceReduceMax_full<256,T>(dest, source);
647 deviceReduceMax_full<T>(dest, source);
650 Gpu::blockReduce_partial<Gpu::Device::warp_size>
659 if (handler.isFullBlock()) {
660 if (blockDim.x == 128) {
661 deviceReduceLogicalAnd_full<128>(dest, source);
662 }
else if (blockDim.x == 256) {
663 deviceReduceLogicalAnd_full<256>(dest, source);
668 Gpu::blockReduce_partial<Gpu::Device::warp_size>
677 if (handler.isFullBlock()) {
678 if (blockDim.x == 128) {
679 deviceReduceLogicalOr_full<128>(dest, source);
680 }
else if (blockDim.x == 256) {
681 deviceReduceLogicalOr_full<256>(dest, source);
686 Gpu::blockReduce_partial<Gpu::Device::warp_size>
716#pragma omp critical (gpureduce_reducemin)
718 *dest = std::min(*dest, source);
733#pragma omp critical (gpureduce_reducemax)
735 *dest = std::max(*dest, source);
749#pragma omp critical (gpureduce_reduceand)
751 *dest = (*dest) && source;
764#pragma omp critical (gpureduce_reduceor)
766 *dest = (*dest) || source;
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:17
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
__host__ __device__ AMREX_FORCE_INLINE int LogicalAnd(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:461
__host__ __device__ AMREX_FORCE_INLINE int LogicalOr(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:436
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:283
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:419
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:356
Definition AMReX_BaseFwd.H:55
__device__ void deviceReduceSum(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:601
__device__ T blockReduce(T x, WARPREDUCE &&warp_reduce, T x0)
Definition AMReX_GpuReduce.H:311
__device__ void deviceReduceLogicalAnd_full(int *dest, int source) noexcept
Definition AMReX_GpuReduce.H:512
__device__ void blockReduce_partial(T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler)
Definition AMReX_GpuReduce.H:332
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:553
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:452
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:397
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:505
__device__ void deviceReduceMax(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:639
__device__ void deviceReduceMax_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:460
__device__ void deviceReduceLogicalAnd(int *dest, int source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:657
__device__ void deviceReduceLogicalOr(int *dest, int source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:675
__device__ void deviceReduceSum_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:355
__device__ void deviceReduceMin_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:405
__device__ void deviceReduceMin(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:620
__device__ void deviceReduceLogicalOr_full(int *dest, int source) noexcept
Definition AMReX_GpuReduce.H:560
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:347
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_GpuAtomic.H:661
Definition AMReX_GpuAtomic.H:685
Definition AMReX_GpuAtomic.H:693
Definition AMReX_GpuAtomic.H:677
Definition AMReX_GpuAtomic.H:669
Definition AMReX_GpuTypes.H:86
int numActiveThreads
Definition AMReX_GpuTypes.H:96
Definition AMReX_GpuReduce.H:284
__device__ T operator()(T x) const noexcept
Definition AMReX_GpuReduce.H:287
__device__ T operator()(T x) const noexcept
Definition AMReX_GpuReduce.H:298
Definition AMReX_Functional.H:50
Definition AMReX_Functional.H:59
Definition AMReX_Functional.H:41
Definition AMReX_Functional.H:32
Definition AMReX_Functional.H:14