Block-Structured AMR Software Framework
 
AMReX_GpuReduce.H
#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_Functional.H>
#include <AMReX_TypeTraits.H>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#if defined(CCCL_MAJOR_VERSION) && (CCCL_MAJOR_VERSION >= 3)
#define AMREX_CUDA_CCCL_VER_GE_3 1
#endif
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif

//
// Public interface
//
namespace amrex::Gpu {
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
}
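
// Illustrative usage sketch (not part of this header). It assumes the
// amrex::ParallelFor overload that takes a Gpu::KernelInfo and passes a
// Gpu::Handler to the kernel lambda; `bx`, `fab`, and `dsum` are hypothetical
// names for a Box, an Array4, and a device pointer pre-initialized to zero.
//
//     amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
//     [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
//     {
//         Gpu::deviceReduceSum(dsum, fab(i,j,k), handler);
//     });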

//
// Reduce functions based on __shfl_down_sync
//

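// Three implementations follow: a SYCL path built on sub-group
// shift_group_left, a CUDA/HIP path built on warp shuffles (optionally using
// CUB/rocPRIM block reductions), and a host fallback that uses plain
// arithmetic guarded by OpenMP when AMREX_USE_OMP is defined.
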
namespace amrex::Gpu {

#ifdef AMREX_USE_SYCL

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x,y);
        }
        return x;
    }
};

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // full warp
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceSum(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMin(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMax(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}

#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

namespace detail {

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}

// If other sizeof is needed, we can implement it later.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i],offset);
    }
    return y;
}

}

template <int warpSize, typename T, typename F>
struct warpReduce
{
    // Not all arithmetic types can be taken by shuffle_down, but it's good enough.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }

    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::multi_shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
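
// Worked example of the shuffle-down tree, assuming warpSize == 32: the loop
// uses offsets 16, 8, 4, 2, 1, so after the first pass lane L holds
// F(x[L], x[L+16]) for L < 16, and after the final pass lane 0 holds the
// combination of all 32 lanes. Only lane 0's value is the full warp result;
// the other lanes end up with partial combinations.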

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
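
// Worked example, assuming warpSize == 32 and blockDim.x == 256: the block
// contains 8 warps; each warp reduces its own 32 values, lane 0 of warp w
// stores that partial result into shared[w], and threads 0..7 of warp 0 then
// reduce shared[0..7], leaving the block-wide result in thread 0. Threads of
// warp 0 whose index is >= the number of warps contribute the identity x0.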

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // full warp
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest,x);
    }
}
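
// Design note: when a warp is not fully covered by numActiveThreads, its
// inactive lanes do not carry meaningful values for this reduction, so the
// shuffle-based warp reduction is skipped and each active thread in that
// warp applies the atomic operation directly.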

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();

#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// \cond CODEGEN

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

// \endcond

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
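
// The blockDim.x == 128 / 256 branches above exist because cub::BlockReduce
// and rocprim::block_reduce need the block size as a compile-time template
// parameter; these common launch sizes get the library implementations, and
// any other block size falls back to the generic warp-shuffle blockReduce.
// The Min/Max/LogicalAnd/LogicalOr wrappers below use the same dispatch.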

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}

#else

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp atomic
#endif
    *dest += source;
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceSum_full(dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemin)
#endif
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMin_full(dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemax)
#endif
    *dest = std::max(*dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMax_full(dest, source);
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceand)
#endif
    *dest = (*dest) && source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceor)
#endif
    *dest = (*dest) || source;
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}

#endif
}

#endif