AMReX_GpuReduce.H
#ifndef AMREX_GPU_REDUCE_H_
#define AMREX_GPU_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_Functional.H>
#include <AMReX_TypeTraits.H>

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif

//
// Public interface
//
namespace amrex::Gpu {
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept;

    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept;

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept;
}
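// Illustrative usage (a sketch, not part of this header): these functions are meant to be
// called from device code that receives a Gpu::Handler, e.g. the ParallelFor overload taking
// Gpu::KernelInfo().setReduction(true). The names `bx`, `fab`, and `dp` below are placeholders;
// `dp` must point to device memory holding the running result.
//
//   amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
//   [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
//   {
//       Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
//   });
//
// Most users reduce through amrex::ReduceOps / ParReduce instead, which call these routines
// internally.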

//
// Reduce functions based on __shfl_down_sync
//

namespace amrex::Gpu {

#ifdef AMREX_USE_SYCL

template <int warpSize, typename T, typename F>
struct warpReduce
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x, sycl::sub_group const& sg) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = sycl::shift_group_left(sg, x, offset);
            x = F()(x,y);
        }
        return x;
    }
};

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h)
{
    T* shared = (T*)h.local;
    int tid = h.item->get_local_id(0);
    sycl::sub_group const& sg = h.item->get_sub_group();
    int lane = sg.get_local_id()[0];
    int wid = sg.get_group_id()[0];
    int numwarps = sg.get_group_range()[0];
    x = warp_reduce(x, sg);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    h.item->barrier(sycl::access::fence_space::local_space);
    if (lane == 0) { shared[wid] = x; }
    h.item->barrier(sycl::access::fence_space::local_space);
    bool b = (tid == 0) || (tid < numwarps);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x, sg); }
    return x;
}

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    sycl::sub_group const& sg = handler.item->get_sub_group();
    int wid = sg.get_group_id()[0];
    if ((wid+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x, sg); // full warp
        if (sg.get_local_id()[0] == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest, x);
    }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceSum(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMin(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source, h);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept
{
    T aggregate = blockReduceMax(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalAnd(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source, Gpu::Handler const& h) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0, h);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept
{
    int aggregate = blockReduceLogicalOr(source, h);
    if (h.item->get_local_id(0) == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceSum_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMin_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), h);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceMax_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalAnd_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), h);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept
{
    if (h.isFullBlock()) {
        deviceReduceLogicalOr_full(dest, source, h);
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), h);
    }
}

#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

namespace detail {

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T shuffle_down (T x, int offset) noexcept
{
    return AMREX_HIP_OR_CUDA(__shfl_down(x, offset),
                             __shfl_down_sync(0xffffffff, x, offset));
}

// If other sizeof is needed, we can implement it later.
template <class T, std::enable_if_t<sizeof(T)%sizeof(unsigned int) == 0, int> = 0>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T multi_shuffle_down (T x, int offset) noexcept
{
    constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
    T y;
    auto py = reinterpret_cast<unsigned int*>(&y);
    auto px = reinterpret_cast<unsigned int*>(&x);
    for (int i = 0; i < nwords; ++i) {
        py[i] = shuffle_down(px[i],offset);
    }
    return y;
}
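// Note: multi_shuffle_down moves a trivially copyable T between lanes one 32-bit word at a
// time; for example, a 16-byte type (such as a pair of doubles) is transferred as four
// unsigned ints and reassembled in y. This is why sizeof(T) must be a multiple of
// sizeof(unsigned int).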

}

template <int warpSize, typename T, typename F>
struct warpReduce
{
    // Not all arithmetic types can be taken by shuffle_down, but it's good enough.
    template <class U=T, std::enable_if_t<std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }

    template <class U=T, std::enable_if_t<!std::is_arithmetic<U>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    T operator() (T x) const noexcept
    {
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            T y = detail::multi_shuffle_down(x, offset);
            x = F()(x,y);
        }
        return x;
    }
};
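// Worked example of the shuffle tree above, for warpSize = 4 and F = amrex::Plus with lane
// values {a, b, c, d}: after offset = 2, lane 0 holds a+c and lane 1 holds b+d; after
// offset = 1, lane 0 holds (a+c)+(b+d). Lanes other than 0 end up with partial values that
// the callers below never read.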

template <int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
{
    __shared__ T shared[warpSize];
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    x = warp_reduce(x);
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    if (lane == 0) { shared[wid] = x; }
    __syncthreads();
    bool b = (threadIdx.x == 0) || (threadIdx.x < blockDim.x / warpSize);
    x = b ? shared[lane] : x0;
    if (wid == 0) { x = warp_reduce(x); }
    return x;
}
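// Example of the two-stage scheme above with blockDim.x = 256 and warpSize = 32: each of the
// 8 warps reduces its 32 values, lane 0 of every warp stores its partial result in
// shared[0..7], and warp 0 then reduces those partials (its threads 8..31 contribute the
// fallback value x0). The block-wide result is therefore valid only on threadIdx.x == 0,
// which is why the *_full functions below guard their atomic update with that test.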

template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
                          Gpu::Handler const& handler)
{
    int warp = (int)threadIdx.x / warpSize;
    if ((warp+1)*warpSize <= handler.numActiveThreads) {
        x = warp_reduce(x); // full warp
        if (threadIdx.x % warpSize == 0) { atomic_op(dest, x); }
    } else {
        atomic_op(dest,x);
    }
}
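// blockReduce_partial is for launches where only handler.numActiveThreads threads of the
// block carry valid data (e.g. the tail of a 1D loop). Warps that are entirely active still
// use the shuffle reduction and issue a single atomic from lane 0; the one ragged warp at
// the end falls back to one atomic per active thread rather than masking the shuffle.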

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(), (T)0);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceSum (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Sum(source);
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::plus<T>());
    return source;
#else
    return blockReduceSum(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceSum<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::AddNoRet(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMin (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, cub::Min());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::minimum<T>());
    return source;
#else
    return blockReduceMin(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMin<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Min(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(), source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T blockReduceMax (T source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, cub::Max());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<T,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, rocprim::maximum<T>());
    return source;
#else
    return blockReduceMax(source);
#endif
}

template <int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
    T aggregate = blockReduceMax<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::Max(dest, aggregate); }
}

// \cond CODEGEN

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(), 1);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalAnd (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalAnd<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalAnd<int>());
    return source;
#else
    return blockReduceLogicalAnd(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalAnd<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalAnd(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
    return Gpu::blockReduce<Gpu::Device::warp_size>
        (source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(), 0);
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

// Compute the block-wide reduction for thread0
template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int blockReduceLogicalOr (int source) noexcept
{
#if defined(AMREX_USE_CUB)
    using BlockReduce = cub::BlockReduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // __syncthreads() prior to writing to shared memory is necessary
    // if this reduction call is occurring multiple times in a kernel,
    // and since we don't know how many times the user is calling it,
    // we do it always to be safe.
    __syncthreads();
    return BlockReduce(temp_storage).Reduce(source, amrex::LogicalOr<int>());
#elif defined(AMREX_USE_HIP)
    using BlockReduce = rocprim::block_reduce<int,BLOCKDIMX>;
    __shared__ typename BlockReduce::storage_type temp_storage;
    __syncthreads();
    BlockReduce().reduce(source, source, temp_storage, amrex::LogicalOr<int>());
    return source;
#else
    return blockReduceLogicalOr(source);
#endif
}

template <int BLOCKDIMX>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
    int aggregate = blockReduceLogicalOr<BLOCKDIMX>(source);
    if (threadIdx.x == 0) { Gpu::Atomic::LogicalOr(dest, aggregate); }
}

// \endcond

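// The dispatchers below special-case block sizes 128 and 256 because cub::BlockReduce and
// rocprim::block_reduce take the block size as a compile-time template parameter; any other
// blockDim.x falls back to the generic warp-shuffle blockReduce defined above.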
template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceSum_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceSum_full<256,T>(dest, source);
        } else {
            deviceReduceSum_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Plus<T> >(),
             Gpu::AtomicAdd<T>(), handler);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMin_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMin_full<256,T>(dest, source);
        } else {
            deviceReduceMin_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Minimum<T> >(),
             Gpu::AtomicMin<T>(), handler);
    }
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceMax_full<128,T>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceMax_full<256,T>(dest, source);
        } else {
            deviceReduceMax_full<T>(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,T,amrex::Maximum<T> >(),
             Gpu::AtomicMax<T>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalAnd_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalAnd_full<256>(dest, source);
        } else {
            deviceReduceLogicalAnd_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalAnd<int> >(),
             Gpu::AtomicLogicalAnd<int>(), handler);
    }
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept
{
    if (handler.isFullBlock()) {
        if (blockDim.x == 128) {
            deviceReduceLogicalOr_full<128>(dest, source);
        } else if (blockDim.x == 256) {
            deviceReduceLogicalOr_full<256>(dest, source);
        } else {
            deviceReduceLogicalOr_full(dest, source);
        }
    } else {
        Gpu::blockReduce_partial<Gpu::Device::warp_size>
            (dest, source, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::LogicalOr<int> >(),
             Gpu::AtomicLogicalOr<int>(), handler);
    }
}

#else

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceSum_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp atomic
#endif
    *dest += source;
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceSum_full(dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMin_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemin)
#endif
    *dest = std::min(*dest, source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMin_full(dest, source);
}

template <typename T>
AMREX_FORCE_INLINE
void deviceReduceMax_full (T * dest, T source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reducemax)
#endif
    *dest = std::max(*dest, source);
}

template <typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept
{
    deviceReduceMax_full(dest, source);
}

AMREX_FORCE_INLINE
void deviceReduceLogicalAnd_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceand)
#endif
    *dest = (*dest) && source;
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalAnd_full(dest, source);
}

AMREX_FORCE_INLINE
void deviceReduceLogicalOr_full (int * dest, int source) noexcept
{
#ifdef AMREX_USE_OMP
#pragma omp critical (gpureduce_reduceor)
#endif
    *dest = (*dest) || source;
}

AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept
{
    deviceReduceLogicalOr_full(dest, source);
}

#endif
}

#endif