AMReX_GpuReduce.H
#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_Functional.H>
#include <AMReX_TypeTraits.H>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#if defined(CCCL_MAJOR_VERSION) && (CCCL_MAJOR_VERSION >= 3)
#define AMREX_CUDA_CCCL_VER_GE_3 1
#endif
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif

//
// Public interface
//
namespace amrex::Gpu {
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
}
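
// Typical usage (a hedged sketch, not something defined in this header): the
// reduction destination lives in device memory and each GPU thread passes its
// local contribution along with the Gpu::Handler it receives from a
// reduction-aware launch, roughly
//
//     Gpu::Buffer<Real> sum_buf({Real(0.0)});
//     Real* dp = sum_buf.data();
//     ParallelFor(Gpu::KernelInfo().setReduction(true), box,
//     [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
//     {
//         Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
//     });
//
// Here sum_buf, dp, box and fab are illustrative placeholders, not part of this file.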

//
// Reduce functions based on __shfl_down_sync
//

namespace amrex::Gpu {

#ifdef AMREX_USE_SYCL

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
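
// The loop above is a shuffle-based tree reduction within one sub-group (warp):
// with a 32-wide sub-group the offsets are 16, 8, 4, 2, 1, so after log2(warpSize)
// steps lane 0 holds F applied to all lanes' values.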

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}
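
// Scheme used above: every sub-group first reduces its own values with shuffles,
// lane 0 of each sub-group deposits its partial result in the local (shared) memory
// provided by the handler, and sub-group 0 then reduces those partials. Only
// thread 0 of the work-group is guaranteed to hold the final block-wide result.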

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // full warp
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}
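
// blockReduce_partial is used when only part of the block carries valid data
// (handler.numActiveThreads is less than the block size): sub-groups that are
// entirely active still use the shuffle reduction and issue one atomic per
// sub-group, while threads of a partially active sub-group each fall back to a
// direct atomic update.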

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceSum(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMin(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMax(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}

#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

namespace detail {

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}
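
// AMREX_HIP_OR_CUDA expands to its first argument when compiling for HIP and to its
// second when compiling for CUDA, where the full warp mask 0xffffffff is passed to
// __shfl_down_sync.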

// If other sizeof is needed, we can implement it later.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i],offset);
    }
    return y;
}
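
// multi_shuffle_down handles types that shuffle_down cannot take directly
// (non-arithmetic types whose size is a multiple of sizeof(unsigned int), e.g. a
// small aggregate of a value and an index) by shuffling the object one
// unsigned-int word at a time.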

}

template <int warpSize, typename T, typename F>
struct warpReduce
{
    // Not all arithmetic types can be taken by shuffle_down, but it's good enough.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }

    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::multi_shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }
};

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
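
// Same two-level scheme as the SYCL version: a warp-level shuffle reduction, one
// partial per warp staged in shared memory, then warp 0 reduces the partials.
// Since shared[] has warpSize entries, this supports up to warpSize*warpSize
// threads per block (1024 for a warp size of 32), and the final result is valid on
// thread 0 only.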

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // full warp
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest,x);
    }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}
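
// When the block size is known at compile time, the BLOCKDIMX overload above
// delegates to cub::BlockReduce (CUDA) or rocprim::block_reduce (HIP); if neither
// is available it falls back to the generic warp-shuffle blockReduceSum. In either
// case the block-wide result is meant to be consumed by thread 0 only, which is how
// deviceReduceSum_full uses it.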

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();

#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_GE_3
    return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
#else
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#endif
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}
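
// The runtime checks on blockDim.x above let blocks of 128 or 256 threads use the
// compile-time BLOCKDIMX specializations (and thus CUB/rocPRIM); any other block
// size takes the generic warp-shuffle path, and partially active blocks go through
// blockReduce_partial. The Min, Max, LogicalAnd and LogicalOr variants below follow
// the same pattern.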

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}

#else

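// Host (non-GPU) fallback: the functions below update *dest directly, guarding the
// update with an OpenMP atomic or a named critical section when OpenMP is enabled;
// the Gpu::Handler argument of the public functions is ignored.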
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp atomic
#endif
    *dest += source;
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceSum_full(dest, source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemin)
#endif
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMin_full(dest, source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemax)
#endif
    *dest = std::max(*dest, source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMax_full(dest, source);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceand)
#endif
    *dest = (*dest) && source;
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceor)
#endif
    *dest = (*dest) || source;
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}

#endif
}

#endif