Block-Structured AMR Software Framework
 
AMReX_GpuReduce.H
1#ifndef AMREX_GPU_REDUCE_H_
2#define AMREX_GPU_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_GpuQualifiers.H>
6#include <AMReX_GpuControl.H>
7#include <AMReX_GpuTypes.H>
8#include <AMReX_GpuAtomic.H>
9#include <AMReX_GpuUtility.H>
10#include <AMReX_Functional.H>
11#include <AMReX_TypeTraits.H>
12
13#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
14#define AMREX_USE_CUB 1
15#endif
16
17#if defined(AMREX_USE_CUB)
18#include <cub/cub.cuh>
19#elif defined(AMREX_USE_HIP)
20#include <rocprim/rocprim.hpp>
21#endif
22
23//
24// Public interface
25//
26namespace amrex::Gpu {
27 template <typename T>
28 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
29 void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;
30
31 template <typename T>
32 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
33 void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;
34
35 template <typename T>
36 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
37 void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;
38
39 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
40 void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;
41
42 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
43 void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
44}
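//
// A usage sketch (not taken from this file): the deviceReduce* functions are meant to
// be called from device code that receives a Gpu::Handler, e.g. an amrex::ParallelFor
// launched with Gpu::KernelInfo().setReduction(true). The box `bx`, the Array4 `fab`,
// and the Gpu::Buffer accumulator below are illustrative assumptions; only the
// Gpu::deviceReduceSum call is defined in this header.
//
//   amrex::Gpu::Buffer<amrex::Real> buf({amrex::Real(0.0)}); // device accumulator, initialized to 0
//   amrex::Real* dsum = buf.data();
//   amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), bx,
//   [=] AMREX_GPU_DEVICE (int i, int j, int k, amrex::Gpu::Handler const& handler) noexcept
//   {
//       amrex::Gpu::deviceReduceSum(dsum, fab(i,j,k), handler); // fold this thread's value into *dsum
//   });
//   amrex::Real const* hsum = buf.copyToHost(); // hsum[0] holds the reduced sum
//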
45
46//
47// Reduce functions based on __shfl_down_sync
48//
49
50namespace amrex::Gpu {
51
52#ifdef AMREX_USE_SYCL
53
54template <int warpSize, typename T, typename F>
55struct warpReduce
56{
58 T operator() (T x, sycl::sub_group const& sg) const noexcept
59 {
60 for (int offset = warpSize/2; offset > 0; offset /= 2) {
61 T y = sycl::shift_group_left(sg, x, offset);
62 x = F()(x,y);
63 }
64 return x;
65 }
66};
67
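// Two-stage block reduction: each sub-group (warp) first reduces its own values,
// lane 0 of every warp writes the partial result to shared/local memory, and the
// first warp then reduces those partials. Only thread 0's return value is the
// block-wide result; other threads return an intermediate value.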
68template <int warpSize, typename T, typename WARPREDUCE>
70T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
71{
72 T* shared = (T*)h.local;
73 int tid = h.item->get_local_id(0);
74 sycl::sub_group const& sg = h.item->get_sub_group();
75 int lane = sg.get_local_id()[0];
76 int wid = sg.get_group_id()[0];
77 int numwarps = sg.get_group_range()[0];
78 x = warp_reduce(x, sg);
79 // __syncthreads() prior to writing to shared memory is necessary
80 // if this reduction call is occurring multiple times in a kernel,
81 // and since we don't know how many times the user is calling it,
82 // we do it always to be safe.
83 h.item->barrier(sycl::access::fence_space::local_space);
84 if (lane == 0) { shared[wid] = x; }
85 h.item->barrier(sycl::access::fence_space::local_space);
86 bool b = (tid == 0) || (tid < numwarps);
87 x = b ? shared[lane] : x0;
88 if (wid == 0) { x = warp_reduce(x, sg); }
89 return x;
90}
91
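// Reduction for blocks in which only the first numActiveThreads threads reach this
// call. Warps whose lanes are all active are combined with warp_reduce, and lane 0
// issues a single atomic update; threads in the trailing, partially active warp
// cannot safely shuffle with non-participating lanes, so each one applies atomic_op
// to its own value instead.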
92template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
94void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
95 Gpu::Handler const& handler)
96{
97 sycl::sub_group const& sg = handler.item->get_sub_group();
98 int wid = sg.get_group_id()[0];
99 if ((wid+1)*warpSize <= handler.numActiveThreads) {
100 x = warp_reduce(x, sg); // full warp
101 if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
102 } else {
103 atomic_op(dest, x);
104 }
105}
106
107// block-wide reduction for thread0
108template <typename T>
110T blockReduceSum (T source, Gpu::Handler const& h) noexcept
111{
112 return Gpu::blockReduce<Gpu::Device::warp_size>
113 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
114}
115
116template <typename T>
118void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
119{
120 T aggregate = blockReduceSum(source, h);
121 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
122}
123
124// block-wide reduction for thread0
125template <typename T>
127T blockReduceMin (T source, Gpu::Handler const& h) noexcept
128{
129 return Gpu::blockReduce<Gpu::Device::warp_size>
130 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
131}
132
133template <typename T>
135void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
136{
137 T aggregate = blockReduceMin(source, h);
138 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
139}
140
141// block-wide reduction for thread0
142template <typename T>
144T blockReduceMax (T source, Gpu::Handler const& h) noexcept
145{
146 return Gpu::blockReduce<Gpu::Device::warp_size>
147 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
148}
149
150template <typename T>
152void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
153{
154 T aggregate = blockReduceMax(source, h);
155 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
156}
157
158// block-wide reduction for thread0
160int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
161{
162 return Gpu::blockReduce<Gpu::Device::warp_size>
163 (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
164}
165
167void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
168{
169 int aggregate = blockReduceLogicalAnd(source, h);
170 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
171}
172
173// block-wide reduction for thread0
175int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
176{
177 return Gpu::blockReduce<Gpu::Device::warp_size>
178 (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
179}
180
182void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
183{
184 int aggregate = blockReduceLogicalOr(source, h);
185 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
186}
187
188template <typename T>
190void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
191{
192 if (h.isFullBlock()) {
193 deviceReduceSum_full(dest, source, h);
194 } else {
195 Gpu::blockReduce_partial<Gpu::Device::warp_size>
196 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
197 Gpu::AtomicAdd<T>(), h);
198 }
199}
200
201template <typename T>
203void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
204{
205 if (h.isFullBlock()) {
206 deviceReduceMin_full(dest, source, h);
207 } else {
208 Gpu::blockReduce_partial<Gpu::Device::warp_size>
209 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
210 Gpu::AtomicMin<T>(), h);
211 }
212}
213
214template <typename T>
216void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
217{
218 if (h.isFullBlock()) {
219 deviceReduceMax_full(dest, source, h);
220 } else {
221 Gpu::blockReduce_partial<Gpu::Device::warp_size>
222 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
223 Gpu::AtomicMax<T>(), h);
224 }
225}
226
228void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
229{
230 if (h.isFullBlock()) {
231 deviceReduceLogicalAnd_full(dest, source, h);
232 } else {
233 Gpu::blockReduce_partial<Gpu::Device::warp_size>
234 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
235 Gpu::AtomicLogicalAnd<int>(), h);
236 }
237}
238
240void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
241{
242 if (h.isFullBlock()) {
243 deviceReduceLogicalOr_full(dest, source, h);
244 } else {
245 Gpu::blockReduce_partial<Gpu::Device::warp_size>
246 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
247 Gpu::AtomicLogicalOr<int>(), h);
248 }
249}
250
251#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
252
253namespace detail {
254
255template <typename T>
257T shuffle_down (T x, int offset) noexcept
258{
259 return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
260 __shfl_down_sync(0xffffffff, x, offset));
261}
262
263// If other sizeof is needed, we can implement it later.
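// multi_shuffle_down: shuffle a type whose size is a multiple of sizeof(unsigned int)
// by shuffling its underlying 32-bit words one at a time.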
264template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
266T multi_shuffle_down (T x, int offset) noexcept
267{
268 constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
269 T y;
270 auto py = reinterpret_cast<unsigned int*>(&y);
271 auto px = reinterpret_cast<unsigned int*>(&x);
272 for (int i = 0; i < nwords; ++i) {
273 py[i] = shuffle_down(px[i],offset);
274 }
275 return y;
276}
277
278}
279
280template <int warpSize, typename T, typename F>
281struct warpReduce
282{
283 // Not all arithmetic types can be taken by shuffle_down, but it's good enough.
284 template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
286 T operator() (T x) const noexcept
287 {
288 for (int offset = warpSize/2; offset > 0; offset /= 2) {
289 T y = detail::shuffle_down(x, offset);
290 x = F()(x,y);
291 }
292 return x;
293 }
294
295 template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
297 T operator() (T x) const noexcept
298 {
299 for (int offset = warpSize/2; offset > 0; offset /= 2) {
300 T y = detail::multi_shuffle_down(x, offset);
301 x = F()(x,y);
302 }
303 return x;
304 }
305};
306
307template <int warpSize, typename T, typename WARPREDUCE>
309T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
310{
311 __shared__ T shared[warpSize];
312 int lane = threadIdx.x % warpSize;
313 int wid = threadIdx.x / warpSize;
314 x = warp_reduce(x);
315 // __syncthreads() prior to writing to shared memory is necessary
316 // if this reduction call is occurring multiple times in a kernel,
317 // and since we don't know how many times the user is calling it,
318 // we do it always to be safe.
319 __syncthreads();
320 if (lane == 0) { shared[wid] = x; }
321 __syncthreads();
322 bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
323 x = b ? shared[lane] : x0;
324 if (wid == 0) { x = warp_reduce(x); }
325 return x;
326}
327
328template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
330void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
331 Gpu::Handler const& handler)
332{
333 int warp = (int)threadIdx.x / warpSize;
334 if ((warp+1)*warpSize <= handler.numActiveThreads) {
335 x = warp_reduce(x); // full warp
336 if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
337 } else {
338 atomic_op(dest,x);
339 }
340}
341
342// Compute the block-wide reduction for thread0
343template <typename T>
345T blockReduceSum (T source) noexcept
346{
347 return Gpu::blockReduce<Gpu::Device::warp_size>
348 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
349}
350
351template <typename T>
353void deviceReduceSum_full (T * dest, T source) noexcept
354{
355 T aggregate = blockReduceSum(source);
356 if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
357}
358
359// Compute the block-wide reduction for thread0
360template <int BLOCKDIMX, typename T>
362T blockReduceSum (T source) noexcept
363{
364#if defined(AMREX_USE_CUB)
365 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
366 __shared__ typename BlockReduce::TempStorage temp_storage;
367 // __syncthreads() prior to writing to shared memory is necessary
368 // if this reduction call is occurring multiple times in a kernel,
369 // and since we don't know how many times the user is calling it,
370 // we do it always to be safe.
371 __syncthreads();
372 return BlockReduce(temp_storage).Sum(source);
373#elif defined(AMREX_USE_HIP)
374 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
375 __shared__ typename BlockReduce::storage_type temp_storage;
376 __syncthreads();
377 BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
378 return source;
379#else
380 return blockReduceSum(source);
381#endif
382}
383
384template <int BLOCKDIMX, typename T>
386void deviceReduceSum_full (T * dest, T source) noexcept
387{
388 T aggregate = blockReduceSum<BLOCKDIMX>(source);
389 if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
390}
391
392// Compute the block-wide reduction for thread0
393template <typename T>
395T blockReduceMin (T source) noexcept
396{
397 return Gpu::blockReduce<Gpu::Device::warp_size>
398 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
399}
400
401template <typename T>
403void deviceReduceMin_full (T * dest, T source) noexcept
404{
405 T aggregate = blockReduceMin(source);
406 if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
407}
408
409// Compute the block-wide reduction for thread0
410template <int BLOCKDIMX, typename T>
412T blockReduceMin (T source) noexcept
413{
414#if defined(AMREX_USE_CUB)
415 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
416 __shared__ typename BlockReduce::TempStorage temp_storage;
417 // __syncthreads() prior to writing to shared memory is necessary
418 // if this reduction call is occurring multiple times in a kernel,
419 // and since we don't know how many times the user is calling it,
420 // we do it always to be safe.
421 __syncthreads();
422 return BlockReduce(temp_storage).Reduce(source, cub::Min());
423#elif defined(AMREX_USE_HIP)
424 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
425 __shared__ typename BlockReduce::storage_type temp_storage;
426 __syncthreads();
427 BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
428 return source;
429#else
430 return blockReduceMin(source);
431#endif
432}
433
434template <int BLOCKDIMX, typename T>
436void deviceReduceMin_full (T * dest, T source) noexcept
437{
438 T aggregate = blockReduceMin<BLOCKDIMX>(source);
439 if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
440}
441
442// Compute the block-wide reduction for thread0
443template <typename T>
445T blockReduceMax (T source) noexcept
446{
447 return Gpu::blockReduce<Gpu::Device::warp_size>
448 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
449}
450
451template <typename T>
453void deviceReduceMax_full (T * dest, T source) noexcept
454{
455 T aggregate = blockReduceMax(source);
456 if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
457}
458
459// Compute the block-wide reduction for thread0
460template <int BLOCKDIMX, typename T>
462T blockReduceMax (T source) noexcept
463{
464#if defined(AMREX_USE_CUB)
465 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
466 __shared__ typename BlockReduce::TempStorage temp_storage;
467 // __syncthreads() prior to writing to shared memory is necessary
468 // if this reduction call is occurring multiple times in a kernel,
469 // and since we don't know how many times the user is calling it,
470 // we do it always to be safe.
471 __syncthreads();
472 return BlockReduce(temp_storage).Reduce(source, cub::Max());
473#elif defined(AMREX_USE_HIP)
474 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
475 __shared__ typename BlockReduce::storage_type temp_storage;
476 __syncthreads();
477 BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
478 return source;
479#else
480 return blockReduceMax(source);
481#endif
482}
483
484template <int BLOCKDIMX, typename T>
486void deviceReduceMax_full (T * dest, T source) noexcept
487{
488 T aggregate = blockReduceMax<BLOCKDIMX>(source);
489 if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
490}
491
492// \cond CODEGEN
493
494// Compute the block-wide reduction for thread0
496int blockReduceLogicalAnd (int source) noexcept
497{
498 return Gpu::blockReduce<Gpu::Device::warp_size>
499 (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
500}
501
503void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
504{
505 int aggregate = blockReduceLogicalAnd(source);
506 if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
507}
508
509// Compute the block-wide reduction for thread0
510template <int BLOCKDIMX>
512int blockReduceLogicalAnd (int source) noexcept
513{
514#if defined(AMREX_USE_CUB)
515 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
516 __shared__ typename BlockReduce::TempStorage temp_storage;
517 // __syncthreads() prior to writing to shared memory is necessary
518 // if this reduction call is occurring multiple times in a kernel,
519 // and since we don't know how many times the user is calling it,
520 // we do it always to be safe.
521 __syncthreads();
522 return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
523#elif defined(AMREX_USE_HIP)
524 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
525 __shared__ typename BlockReduce::storage_type temp_storage;
526 __syncthreads();
527 BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
528 return source;
529#else
530 return blockReduceLogicalAnd(source);
531#endif
532}
533
534template <int BLOCKDIMX>
536void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
537{
538 int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
539 if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
540}
541
542// Compute the block-wide reduction for thread0
544int blockReduceLogicalOr (int source) noexcept
545{
546 return Gpu::blockReduce<Gpu::Device::warp_size>
547 (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
548}
549
551void deviceReduceLogicalOr_full (int * dest, int source) noexcept
552{
553 int aggregate = blockReduceLogicalOr(source);
554 if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
555}
556
557// Compute the block-wide reduction for thread0
558template <int BLOCKDIMX>
560int blockReduceLogicalOr (int source) noexcept
561{
562#if defined(AMREX_USE_CUB)
563 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
564 __shared__ typename BlockReduce::TempStorage temp_storage;
565 // __syncthreads() prior to writing to shared memory is necessary
566 // if this reduction call is occurring multiple times in a kernel,
567 // and since we don't know how many times the user is calling it,
568 // we do it always to be safe.
569 __syncthreads();
570 return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
571#elif defined(AMREX_USE_HIP)
572 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
573 __shared__ typename BlockReduce::storage_type temp_storage;
574 __syncthreads();
575 BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
576 return source;
577#else
578 return blockReduceLogicalOr(source);
579#endif
580}
581
582template <int BLOCKDIMX>
584void deviceReduceLogicalOr_full (int * dest, int source) noexcept
585{
586 int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
587 if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
588}
589
590// \endcond
591
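// Dispatch for the public deviceReduce* entry points: fully active blocks of the
// common sizes 128 and 256 use the CUB/rocPRIM block reduction specialized on
// BLOCKDIMX (when available); other block sizes fall back to the generic
// warp-shuffle blockReduce. Partially active blocks take the blockReduce_partial
// path, which reduces complete warps and uses per-thread atomics in the trailing warp.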
592template <typename T>
594void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
595{
596 if (handler.isFullBlock()) {
597 if (blockDim.x == 128) {
598 deviceReduceSum_full<128,T>(dest, source);
599 } else if (blockDim.x == 256) {
600 deviceReduceSum_full<256,T>(dest, source);
601 } else {
602 deviceReduceSum_full<T>(dest, source);
603 }
604 } else {
605 Gpu::blockReduce_partial<Gpu::Device::warp_size>
606 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
607 Gpu::AtomicAdd<T>(), handler);
608 }
609}
610
611template <typename T>
613void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
614{
615 if (handler.isFullBlock()) {
616 if (blockDim.x == 128) {
617 deviceReduceMin_full<128,T>(dest, source);
618 } else if (blockDim.x == 256) {
619 deviceReduceMin_full<256,T>(dest, source);
620 } else {
621 deviceReduceMin_full<T>(dest, source);
622 }
623 } else {
624 Gpu::blockReduce_partial<Gpu::Device::warp_size>
625 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
626 Gpu::AtomicMin<T>(), handler);
627 }
628}
629
630template <typename T>
632void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
633{
634 if (handler.isFullBlock()) {
635 if (blockDim.x == 128) {
636 deviceReduceMax_full<128,T>(dest, source);
637 } else if (blockDim.x == 256) {
638 deviceReduceMax_full<256,T>(dest, source);
639 } else {
640 deviceReduceMax_full<T>(dest, source);
641 }
642 } else {
643 Gpu::blockReduce_partial<Gpu::Device::warp_size>
644 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
645 Gpu::AtomicMax<T>(), handler);
646 }
647}
648
650void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
651{
652 if (handler.isFullBlock()) {
653 if (blockDim.x == 128) {
654 deviceReduceLogicalAnd_full<128>(dest, source);
655 } else if (blockDim.x == 256) {
656 deviceReduceLogicalAnd_full<256>(dest, source);
657 } else {
658 deviceReduceLogicalAnd_full(dest, source);
659 }
660 } else {
661 Gpu::blockReduce_partial<Gpu::Device::warp_size>
662 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
663 Gpu::AtomicLogicalAnd<int>(), handler);
664 }
665}
666
668void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
669{
670 if (handler.isFullBlock()) {
671 if (blockDim.x == 128) {
672 deviceReduceLogicalOr_full<128>(dest, source);
673 } else if (blockDim.x == 256) {
674 deviceReduceLogicalOr_full<256>(dest, source);
675 } else {
676 deviceReduceLogicalOr_full(dest, source);
677 }
678 } else {
679 Gpu::blockReduce_partial<Gpu::Device::warp_size>
680 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
681 Gpu::AtomicLogicalOr<int>(), handler);
682 }
683}
684
685#else
686
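// Host / non-GPU fallback: there is no thread hierarchy to reduce over, so each call
// folds `source` directly into `*dest`, guarded by an OpenMP atomic or critical
// section when OpenMP is enabled.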
687template <typename T>
689void deviceReduceSum_full (T * dest, T source) noexcept
690{
691#ifdef AMREX_USE_OMP
692#pragma omp atomic
693#endif
694 *dest += source;
695}
696
697template <typename T>
699void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
700{
701 deviceReduceSum_full(dest, source);
702}
703
704template <typename T>
706void deviceReduceMin_full (T * dest, T source) noexcept
707{
708#ifdef AMREX_USE_OMP
709#pragma omp critical (gpureduce_reducemin)
710#endif
711 *dest = std::min(*dest, source);
712}
713
714template <typename T>
716void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
717{
718 deviceReduceMin_full(dest, source);
719}
720
721template <typename T>
723void deviceReduceMax_full (T * dest, T source) noexcept
724{
725#ifdef AMREX_USE_OMP
726#pragma omp critical (gpureduce_reducemax)
727#endif
728 *dest = std::max(*dest, source);
729}
730
731template <typename T>
733void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
734{
735 deviceReduceMax_full(dest, source);
736}
737
739void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
740{
741#ifdef AMREX_USE_OMP
742#pragma omp critical (gpureduce_reduceand)
743#endif
744 *dest = (*dest) && source;
745}
746
748void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
749{
750 deviceReduceLogicalAnd_full(dest, source);
751}
752
754void deviceReduceLogicalOr_full (int * dest, int source) noexcept
755{
756#ifdef AMREX_USE_OMP
757#pragma omp critical (gpureduce_reduceor)
758#endif
759 *dest = (*dest) || source;
760}
761
763void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
764{
765 deviceReduceLogicalOr_full(dest, source);
766}
767
768#endif
769}
770
771#endif