Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuReduce.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_REDUCE_H_
2#define AMREX_GPU_REDUCE_H_
3#include <AMReX_Config.H>
4
6#include <AMReX_GpuControl.H>
7#include <AMReX_GpuTypes.H>
8#include <AMReX_GpuAtomic.H>
9#include <AMReX_GpuUtility.H>
10#include <AMReX_Functional.H>
11#include <AMReX_TypeTraits.H>
12
13#if defined(AMREX_USE_CUDA)
14#include <cub/cub.cuh>
15#if defined(CCCL_MAJOR_VERSION) && defined(CCCL_MINOR_VERSION) && ((CCCL_MAJOR_VERSION >= 3) || ((CCCL_MAJOR_VERSION >= 2) && (CCCL_MINOR_VERSION >= 8)))
16#define AMREX_CUDA_CCCL_VER_GE_2_8 1
17#endif
18#elif defined(AMREX_USE_HIP)
19#include <rocprim/rocprim.hpp>
20#endif
21
22//
23// Public interface
24//
25namespace amrex::Gpu {
26 template <typename T>
28 void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;
29
30 template <typename T>
32 void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;
33
34 template <typename T>
36 void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;
37
39 void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;
40
42 void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
43}
44
45//
46// Reduce functions based on _shfl_down_sync
47//
48
49namespace amrex::Gpu {
50
51#ifdef AMREX_USE_SYCL
52
53template <int warpSize, typename T, typename F>
54struct warpReduce
55{
57 T operator() (T x, sycl::sub_group const& sg) const noexcept
58 {
59 for (int offset = warpSize/2; offset > 0; offset /= 2) {
60 T y = sycl::shift_group_left(sg, x, offset);
61 x = F()(x,y);
62 }
63 return x;
64 }
65};
66
67template <int warpSize, typename T, typename WARPREDUCE>
69T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
70{
71 T* shared = (T*)h.local;
72 int tid = h.item->get_local_id(0);
73 sycl::sub_group const& sg = h.item->get_sub_group();
74 int lane = sg.get_local_id()[0];
75 int wid = sg.get_group_id()[0];
76 int numwarps = sg.get_group_range()[0];
77 x = warp_reduce(x, sg);
78 // __syncthreads() prior to writing to shared memory is necessary
79 // if this reduction call is occurring multiple times in a kernel,
80 // and since we don't know how many times the user is calling it,
81 // we do it always to be safe.
82 h.item->barrier(sycl::access::fence_space::local_space);
83 if (lane == 0) { shared[wid] = x; }
84 h.item->barrier(sycl::access::fence_space::local_space);
85 bool b = (tid == 0) || (tid < numwarps);
86 x = b ? shared[lane] : x0;
87 if (wid == 0) { x = warp_reduce(x, sg); }
88 return x;
89}
90
91template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
93void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
94 Gpu::Handler const& handler)
95{
96 sycl::sub_group const& sg = handler.item->get_sub_group();
97 int wid = sg.get_group_id()[0];
98 if ((wid+1)*warpSize <= handler.numActiveThreads) {
99 x = warp_reduce(x, sg); // full warp
100 if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
101 } else {
102 atomic_op(dest, x);
103 }
104}
105
106// block-wide reduction for thread0
107template <typename T>
109T blockReduceSum (T source, Gpu::Handler const& h) noexcept
110{
111 return Gpu::blockReduce<Gpu::Device::warp_size>
112 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
113}
114
115template <typename T>
117void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
118{
119 T aggregate = blockReduceSum(source, h);
120 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
121}
122
123// block-wide reduction for thread0
124template <typename T>
126T blockReduceMin (T source, Gpu::Handler const& h) noexcept
127{
128 return Gpu::blockReduce<Gpu::Device::warp_size>
129 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
130}
131
132template <typename T>
134void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
135{
136 T aggregate = blockReduceMin(source, h);
137 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
138}
139
140// block-wide reduction for thread0
141template <typename T>
143T blockReduceMax (T source, Gpu::Handler const& h) noexcept
144{
145 return Gpu::blockReduce<Gpu::Device::warp_size>
146 (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
147}
148
149template <typename T>
151void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
152{
153 T aggregate = blockReduceMax(source, h);
154 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
155}
156
157// block-wide reduction for thread0
159int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
160{
161 return Gpu::blockReduce<Gpu::Device::warp_size>
162 (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
163}
164
166void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
167{
168 int aggregate = blockReduceLogicalAnd(source, h);
169 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
170}
171
172// block-wide reduction for thread0
174int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
175{
176 return Gpu::blockReduce<Gpu::Device::warp_size>
177 (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
178}
179
181void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
182{
183 int aggregate = blockReduceLogicalOr(source, h);
184 if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
185}
186
187template <typename T>
189void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
190{
191 if (h.isFullBlock()) {
192 deviceReduceSum_full(dest, source, h);
193 } else {
194 Gpu::blockReduce_partial<Gpu::Device::warp_size>
195 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
196 Gpu::AtomicAdd<T>(), h);
197 }
198}
199
200template <typename T>
202void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
203{
204 if (h.isFullBlock()) {
205 deviceReduceMin_full(dest, source, h);
206 } else {
207 Gpu::blockReduce_partial<Gpu::Device::warp_size>
208 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
209 Gpu::AtomicMin<T>(), h);
210 }
211}
212
213template <typename T>
215void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
216{
217 if (h.isFullBlock()) {
218 deviceReduceMax_full(dest, source, h);
219 } else {
220 Gpu::blockReduce_partial<Gpu::Device::warp_size>
221 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
222 Gpu::AtomicMax<T>(), h);
223 }
224}
225
227void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
228{
229 if (h.isFullBlock()) {
230 deviceReduceLogicalAnd_full(dest, source, h);
231 } else {
232 Gpu::blockReduce_partial<Gpu::Device::warp_size>
233 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
234 Gpu::AtomicLogicalAnd<int>(), h);
235 }
236}
237
239void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
240{
241 if (h.isFullBlock()) {
242 deviceReduceLogicalOr_full(dest, source, h);
243 } else {
244 Gpu::blockReduce_partial<Gpu::Device::warp_size>
245 (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
246 Gpu::AtomicLogicalOr<int>(), h);
247 }
248}
249
250#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
251
253namespace detail {
254
255template <typename T>
257T shuffle_down (T x, int offset) noexcept
258{
259 return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
260 __shfl_down_sync(0xffffffff, x, offset));
261}
262
263// If other sizeof is needed, we can implement it later.
264template <class T>
265requires (sizeof(T)%sizeof(unsigned int) == 0)
267T multi_shuffle_down (T x, int offset) noexcept
268{
269 constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
270 T y;
271 auto py = reinterpret_cast<unsigned int*>(&y);
272 auto px = reinterpret_cast<unsigned int*>(&x);
273 for (int i = 0; i < nwords; ++i) {
274 py[i] = shuffle_down(px[i],offset);
275 }
276 return y;
277}
278
279}
281
282template <int warpSize, typename T, typename F>
284{
285 // Not all arithmetic types can be taken by shuffle_down, but it's good enough.
287 T operator()(T x) const noexcept
288 requires (std::is_arithmetic_v<T>)
289 {
290 for (int offset = warpSize/2; offset > 0; offset /= 2) {
291 T y = detail::shuffle_down(x, offset);
292 x = F()(x,y);
293 }
294 return x;
295 }
296
298 T operator()(T x) const noexcept
299 requires (!std::is_arithmetic_v<T>)
300 {
301 for (int offset = warpSize/2; offset > 0; offset /= 2) {
302 T y = detail::multi_shuffle_down(x, offset);
303 x = F()(x,y);
304 }
305 return x;
306 }
307};
308
309template <int warpSize, typename T, typename WARPREDUCE>
311T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
312{
313 __shared__ T shared[warpSize];
314 int lane = threadIdx.x % warpSize;
315 int wid = threadIdx.x / warpSize;
316 x = warp_reduce(x);
317 // __syncthreads() prior to writing to shared memory is necessary
318 // if this reduction call is occurring multiple times in a kernel,
319 // and since we don't know how many times the user is calling it,
320 // we do it always to be safe.
321 __syncthreads();
322 if (lane == 0) { shared[wid] = x; }
323 __syncthreads();
324 bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
325 x = b ? shared[lane] : x0;
326 if (wid == 0) { x = warp_reduce(x); }
327 return x;
328}
329
330template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
332void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
333 Gpu::Handler const& handler)
334{
335 int warp = (int)threadIdx.x / warpSize;
336 if ((warp+1)*warpSize <= handler.numActiveThreads) {
337 x = warp_reduce(x); // full warp
338 if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
339 } else {
340 atomic_op(dest,x);
341 }
342}
343
344// Compute the block-wide reduction for thread0
345template <typename T>
347T blockReduceSum (T source) noexcept
348{
349 return Gpu::blockReduce<Gpu::Device::warp_size>
351}
352
353template <typename T>
355void deviceReduceSum_full (T * dest, T source) noexcept
356{
357 T aggregate = blockReduceSum(source);
358 if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
359}
360
361// Compute the block-wide reduction for thread0
362template <int BLOCKDIMX, typename T>
364T blockReduceSum (T source) noexcept
365{
366#if defined(AMREX_USE_CUDA)
367 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
368 __shared__ typename BlockReduce::TempStorage temp_storage;
369 // __syncthreads() prior to writing to shared memory is necessary
370 // if this reduction call is occurring multiple times in a kernel,
371 // and since we don't know how many times the user is calling it,
372 // we do it always to be safe.
373 __syncthreads();
374 return BlockReduce(temp_storage).Sum(source);
375#elif defined(AMREX_USE_HIP)
376 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
377 __shared__ typename BlockReduce::storage_type temp_storage;
378 __syncthreads();
379 BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
380 return source;
381#else
382 return blockReduceSum(source);
383#endif
384}
385
386template <int BLOCKDIMX, typename T>
388void deviceReduceSum_full (T * dest, T source) noexcept
389{
390 T aggregate = blockReduceSum<BLOCKDIMX>(source);
391 if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
392}
393
394// Compute the block-wide reduction for thread0
395template <typename T>
397T blockReduceMin (T source) noexcept
398{
399 return Gpu::blockReduce<Gpu::Device::warp_size>
401}
402
403template <typename T>
405void deviceReduceMin_full (T * dest, T source) noexcept
406{
407 T aggregate = blockReduceMin(source);
408 if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
409}
410
411// Compute the block-wide reduction for thread0
412template <int BLOCKDIMX, typename T>
414T blockReduceMin (T source) noexcept
415{
416#if defined(AMREX_USE_CUDA)
417 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
418 __shared__ typename BlockReduce::TempStorage temp_storage;
419 // __syncthreads() prior to writing to shared memory is necessary
420 // if this reduction call is occurring multiple times in a kernel,
421 // and since we don't know how many times the user is calling it,
422 // we do it always to be safe.
423 __syncthreads();
424
425#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
426 return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
427#else
428 return BlockReduce(temp_storage).Reduce(source, cub::Min());
429#endif
430#elif defined(AMREX_USE_HIP)
431 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
432 __shared__ typename BlockReduce::storage_type temp_storage;
433 __syncthreads();
434 BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
435 return source;
436#else
437 return blockReduceMin(source);
438#endif
439}
440
441template <int BLOCKDIMX, typename T>
443void deviceReduceMin_full (T * dest, T source) noexcept
444{
445 T aggregate = blockReduceMin<BLOCKDIMX>(source);
446 if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
447}
448
449// Compute the block-wide reduction for thread0
450template <typename T>
452T blockReduceMax (T source) noexcept
453{
454 return Gpu::blockReduce<Gpu::Device::warp_size>
456}
457
458template <typename T>
460void deviceReduceMax_full (T * dest, T source) noexcept
461{
462 T aggregate = blockReduceMax(source);
463 if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
464}
465
466// Compute the block-wide reduction for thread0
467template <int BLOCKDIMX, typename T>
469T blockReduceMax (T source) noexcept
470{
471#if defined(AMREX_USE_CUDA)
472 using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
473 __shared__ typename BlockReduce::TempStorage temp_storage;
474 // __syncthreads() prior to writing to shared memory is necessary
475 // if this reduction call is occurring multiple times in a kernel,
476 // and since we don't know how many times the user is calling it,
477 // we do it always to be safe.
478 __syncthreads();
479#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
480 return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
481#else
482 return BlockReduce(temp_storage).Reduce(source, cub::Max());
483#endif
484#elif defined(AMREX_USE_HIP)
485 using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
486 __shared__ typename BlockReduce::storage_type temp_storage;
487 __syncthreads();
488 BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
489 return source;
490#else
491 return blockReduceMax(source);
492#endif
493}
494
495template <int BLOCKDIMX, typename T>
497void deviceReduceMax_full (T * dest, T source) noexcept
498{
499 T aggregate = blockReduceMax<BLOCKDIMX>(source);
500 if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
501}
502
503// Compute the block-wide reduction for thread0
505int blockReduceLogicalAnd (int source) noexcept
506{
507 return Gpu::blockReduce<Gpu::Device::warp_size>
509}
510
512void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
513{
514 int aggregate = blockReduceLogicalAnd(source);
515 if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
516}
517
518// Compute the block-wide reduction for thread0
519template <int BLOCKDIMX>
521int blockReduceLogicalAnd (int source) noexcept
522{
523#if defined(AMREX_USE_CUDA)
524 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
525 __shared__ typename BlockReduce::TempStorage temp_storage;
526 // __syncthreads() prior to writing to shared memory is necessary
527 // if this reduction call is occurring multiple times in a kernel,
528 // and since we don't know how many times the user is calling it,
529 // we do it always to be safe.
530 __syncthreads();
531 return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
532#elif defined(AMREX_USE_HIP)
533 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
534 __shared__ typename BlockReduce::storage_type temp_storage;
535 __syncthreads();
536 BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
537 return source;
538#else
539 return blockReduceLogicalAnd(source);
540#endif
541}
542
543template <int BLOCKDIMX>
545void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
546{
547 int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
548 if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
549}
550
551// Compute the block-wide reduction for thread0
553int blockReduceLogicalOr (int source) noexcept
554{
555 return Gpu::blockReduce<Gpu::Device::warp_size>
557}
558
560void deviceReduceLogicalOr_full (int * dest, int source) noexcept
561{
562 int aggregate = blockReduceLogicalOr(source);
563 if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
564}
565
566// Compute the block-wide reduction for thread0
567template <int BLOCKDIMX>
569int blockReduceLogicalOr (int source) noexcept
570{
571#if defined(AMREX_USE_CUDA)
572 using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
573 __shared__ typename BlockReduce::TempStorage temp_storage;
574 // __syncthreads() prior to writing to shared memory is necessary
575 // if this reduction call is occurring multiple times in a kernel,
576 // and since we don't know how many times the user is calling it,
577 // we do it always to be safe.
578 __syncthreads();
579 return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
580#elif defined(AMREX_USE_HIP)
581 using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
582 __shared__ typename BlockReduce::storage_type temp_storage;
583 __syncthreads();
584 BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
585 return source;
586#else
587 return blockReduceLogicalOr(source);
588#endif
589}
590
591template <int BLOCKDIMX>
593void deviceReduceLogicalOr_full (int * dest, int source) noexcept
594{
595 int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
596 if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
597}
598
599template <typename T>
601void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
602{
603 if (handler.isFullBlock()) {
604 if (blockDim.x == 128) {
605 deviceReduceSum_full<128,T>(dest, source);
606 } else if (blockDim.x == 256) {
607 deviceReduceSum_full<256,T>(dest, source);
608 } else {
609 deviceReduceSum_full<T>(dest, source);
610 }
611 } else {
612 Gpu::blockReduce_partial<Gpu::Device::warp_size>
614 Gpu::AtomicAdd<T>(), handler);
615 }
616}
617
618template <typename T>
620void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
621{
622 if (handler.isFullBlock()) {
623 if (blockDim.x == 128) {
624 deviceReduceMin_full<128,T>(dest, source);
625 } else if (blockDim.x == 256) {
626 deviceReduceMin_full<256,T>(dest, source);
627 } else {
628 deviceReduceMin_full<T>(dest, source);
629 }
630 } else {
631 Gpu::blockReduce_partial<Gpu::Device::warp_size>
633 Gpu::AtomicMin<T>(), handler);
634 }
635}
636
637template <typename T>
639void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
640{
641 if (handler.isFullBlock()) {
642 if (blockDim.x == 128) {
643 deviceReduceMax_full<128,T>(dest, source);
644 } else if (blockDim.x == 256) {
645 deviceReduceMax_full<256,T>(dest, source);
646 } else {
647 deviceReduceMax_full<T>(dest, source);
648 }
649 } else {
650 Gpu::blockReduce_partial<Gpu::Device::warp_size>
652 Gpu::AtomicMax<T>(), handler);
653 }
654}
655
657void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
658{
659 if (handler.isFullBlock()) {
660 if (blockDim.x == 128) {
661 deviceReduceLogicalAnd_full<128>(dest, source);
662 } else if (blockDim.x == 256) {
663 deviceReduceLogicalAnd_full<256>(dest, source);
664 } else {
665 deviceReduceLogicalAnd_full(dest, source);
666 }
667 } else {
668 Gpu::blockReduce_partial<Gpu::Device::warp_size>
670 Gpu::AtomicLogicalAnd<int>(), handler);
671 }
672}
673
675void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
676{
677 if (handler.isFullBlock()) {
678 if (blockDim.x == 128) {
679 deviceReduceLogicalOr_full<128>(dest, source);
680 } else if (blockDim.x == 256) {
681 deviceReduceLogicalOr_full<256>(dest, source);
682 } else {
683 deviceReduceLogicalOr_full(dest, source);
684 }
685 } else {
686 Gpu::blockReduce_partial<Gpu::Device::warp_size>
688 Gpu::AtomicLogicalOr<int>(), handler);
689 }
690}
691
692#else
693
694template <typename T>
696void deviceReduceSum_full (T * dest, T source) noexcept
697{
698#ifdef AMREX_USE_OMP
699#pragma omp atomic
700#endif
701 *dest += source;
702}
703
704template <typename T>
706void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
707{
708 deviceReduceSum_full(dest, source);
709}
710
711template <typename T>
713void deviceReduceMin_full (T * dest, T source) noexcept
714{
715#ifdef AMREX_USE_OMP
716#pragma omp critical (gpureduce_reducemin)
717#endif
718 *dest = std::min(*dest, source);
719}
720
721template <typename T>
723void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
724{
725 deviceReduceMin_full(dest, source);
726}
727
728template <typename T>
730void deviceReduceMax_full (T * dest, T source) noexcept
731{
732#ifdef AMREX_USE_OMP
733#pragma omp critical (gpureduce_reducemax)
734#endif
735 *dest = std::max(*dest, source);
736}
737
738template <typename T>
740void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
741{
742 deviceReduceMax_full(dest, source);
743}
744
746void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
747{
748#ifdef AMREX_USE_OMP
749#pragma omp critical (gpureduce_reduceand)
750#endif
751 *dest = (*dest) && source;
752}
753
755void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
756{
757 deviceReduceLogicalAnd_full(dest, source);
758}
759
761void deviceReduceLogicalOr_full (int * dest, int source) noexcept
762{
763#ifdef AMREX_USE_OMP
764#pragma omp critical (gpureduce_reduceor)
765#endif
766 *dest = (*dest) || source;
767}
768
770void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
771{
772 deviceReduceLogicalOr_full(dest, source);
773}
774
775#endif
776}
777
778#endif
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:17
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
__host__ __device__ AMREX_FORCE_INLINE int LogicalAnd(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:461
__host__ __device__ AMREX_FORCE_INLINE int LogicalOr(int *const m, int const value) noexcept
Definition AMReX_GpuAtomic.H:436
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:283
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:419
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:356
Definition AMReX_BaseFwd.H:55
__device__ void deviceReduceSum(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:601
__device__ T blockReduce(T x, WARPREDUCE &&warp_reduce, T x0)
Definition AMReX_GpuReduce.H:311
__device__ void deviceReduceLogicalAnd_full(int *dest, int source) noexcept
Definition AMReX_GpuReduce.H:512
__device__ void blockReduce_partial(T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler)
Definition AMReX_GpuReduce.H:332
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:553
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:452
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:397
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:505
__device__ void deviceReduceMax(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:639
__device__ void deviceReduceMax_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:460
__device__ void deviceReduceLogicalAnd(int *dest, int source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:657
__device__ void deviceReduceLogicalOr(int *dest, int source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:675
__device__ void deviceReduceSum_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:355
__device__ void deviceReduceMin_full(T *dest, T source) noexcept
Definition AMReX_GpuReduce.H:405
__device__ void deviceReduceMin(T *dest, T source, Gpu::Handler const &h) noexcept
Definition AMReX_GpuReduce.H:620
__device__ void deviceReduceLogicalOr_full(int *dest, int source) noexcept
Definition AMReX_GpuReduce.H:560
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:347
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_GpuAtomic.H:661
Definition AMReX_GpuAtomic.H:685
Definition AMReX_GpuAtomic.H:693
Definition AMReX_GpuAtomic.H:677
Definition AMReX_GpuAtomic.H:669
Definition AMReX_GpuTypes.H:86
int numActiveThreads
Definition AMReX_GpuTypes.H:96
Definition AMReX_GpuReduce.H:284
__device__ T operator()(T x) const noexcept
Definition AMReX_GpuReduce.H:287
__device__ T operator()(T x) const noexcept
Definition AMReX_GpuReduce.H:298
Definition AMReX_Functional.H:50
Definition AMReX_Functional.H:59
Definition AMReX_Functional.H:41
Definition AMReX_Functional.H:32
Definition AMReX_Functional.H:14