AMReX_Reduce.H
#ifndef AMREX_REDUCE_H_
#define AMREX_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_Gpu.H>
#include <AMReX_Arena.H>
#include <AMReX_OpenMP.H>
#include <AMReX_MFIter.H>
#include <AMReX_ValLocPair.H>

#include <algorithm>
#include <functional>
#include <limits>

namespace amrex {

namespace Reduce::detail {

#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
        for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
    }
#else
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_parallel<I+1,T,P1,Ps...>(d, s);
    }
#endif
#endif

    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_local<I+1,T,P1,Ps...>(d, s);
    }

    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
        for_each_init<I+1,T,P1,Ps...>(t);
    }
}
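
// The for_each_* helpers above walk a GpuTuple and a parameter pack of reduce
// operations in lockstep at compile time: element I of the destination tuple
// is combined with element I of the source tuple using the I-th operation.
// For example (illustrative only), with operations ReduceOpSum, ReduceOpMax
// and a tuple type Tuple = GpuTuple<Real,Real>,
//     for_each_local<0, Tuple, ReduceOpSum, ReduceOpMax>(d, s);
// expands to roughly
//     ReduceOpSum().local_update(amrex::get<0>(d), amrex::get<0>(s));
//     ReduceOpMax().local_update(amrex::get<1>(d), amrex::get<1>(s));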

struct ReduceOpSum
{

#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceSum(s,h);
        if (h.threadIdx() == 0) { d += r; }
    }
#else
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceSum<MT>(s);
        if (threadIdx.x == 0) { d += r; }
    }
#endif
#endif

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d += s; }

    template <typename T>
    constexpr void init (T& t) const noexcept { t = 0; }
};

struct ReduceOpMin
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMin(s,h);
        if (h.threadIdx() == 0) { d = amrex::min(d,r); }
    }
#else
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMin<MT>(s);
        if (threadIdx.x == 0) { d = amrex::min(d,r); }
    }
#endif
#endif

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::max(); }
};

struct ReduceOpMax
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMax(s,h);
        if (h.threadIdx() == 0) { d = amrex::max(d,r); }
    }
#else
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMax<MT>(s);
        if (threadIdx.x == 0) { d = amrex::max(d,r); }
    }
#endif
#endif

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::lowest(); }
};

struct ReduceOpLogicalAnd
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral<T>::value>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalAnd(s,h);
        if (h.threadIdx() == 0) { d = d && r; }
    }
#else
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral<T>::value>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalAnd<MT>(s);
        if (threadIdx.x == 0) { d = d && r; }
    }
#endif
#endif

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d && s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = true; }
};

struct ReduceOpLogicalOr
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral<T>::value>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalOr(s,h);
        if (h.threadIdx() == 0) { d = d || r; }
    }
#else
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral<T>::value>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalOr<MT>(s);
        if (threadIdx.x == 0) { d = d || r; }
    }
#endif
#endif

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d || s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = false; }
};
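
// Each ReduceOp* type above supplies the three hooks that the rest of this
// file composes: init() stores the identity element of the operation (0 for
// sum, numeric_limits<T>::max() for min, lowest() for max, true/false for the
// logical ops), local_update() folds one more value into a thread-private
// partial result, and, in GPU builds, parallel_update() combines the partial
// results of a block via Gpu::blockReduce* and lets thread 0 write the block's
// contribution.  A sketch of a user-defined operation following the same
// pattern (illustrative only, not part of AMReX):
//
//     struct ReduceOpProd {
//         template <typename T>
//         AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
//         void local_update (T& d, T const& s) const noexcept { d *= s; }
//         template <typename T>
//         constexpr void init (T& t) const noexcept { t = 1; }
//         // a GPU build would also need a parallel_update() as above
//     };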

template <typename... Ps> class ReduceOps;

#ifdef AMREX_USE_GPU

template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
          m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
          m_device_tuple((Type*)(The_Arena()->alloc(AMREX_GPU_MAX_STREAMS
                                                    * m_max_blocks * sizeof(Type)))),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        static_assert(std::is_trivially_copyable<Type>(),
                      "ReduceData::Type must be trivially copyable");
        static_assert(std::is_trivially_destructible<Type>(),
                      "ReduceData::Type must be trivially destructible");

        new (m_host_tuple) Type();
        m_nblocks.fill(0);
    }

    ~ReduceData () {
        The_Pinned_Arena()->free(m_host_tuple);
        The_Arena()->free(m_device_tuple);
    }

    ReduceData (ReduceData<Ts...> const&) = delete;
    ReduceData (ReduceData<Ts...> &&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    Type value ()
    {
        return m_fn_value();
    }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    Type* devicePtr () { return m_device_tuple; }
    Type* devicePtr (gpuStream_t const& s) {
        return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
    }

    Type* hostPtr () { return m_host_tuple; }

    GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
    int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }

    int maxBlocks () const { return m_max_blocks; }

    int maxStreamIndex () const { return m_max_stream_index; }
    void updateMaxStreamIndex (gpuStream_t const& s) {
        m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
    }

private:
    int m_max_blocks;
    int m_max_stream_index = 0;
    Type* m_host_tuple = nullptr;
    Type* m_device_tuple = nullptr;
    GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
    std::function<Type()> m_fn_value;
};
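
// ReduceData owns the scratch space for a GPU reduction: one pinned host
// tuple that receives the final answer, and a device buffer laid out as
// AMREX_GPU_MAX_STREAMS contiguous segments of maxBlocks() tuples each, so
// that kernels launched on different streams write their per-block partial
// results into disjoint slots.  devicePtr(stream) selects the segment and
// nBlocks(stream) records how many slots of it are live.  The value() member
// simply forwards to ReduceOps::value(), which performs the final combine and
// synchronizes the stream before reading the host tuple.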

namespace Reduce::detail {
    template <typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f (F const& f, int i, int j, int k, IndexType)
        noexcept -> decltype(f(0,0,0))
    {
        return f(i,j,k);
    }

    template <typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f (F const& f, int i, int j, int k, IndexType t)
        noexcept -> decltype(f(Box()))
    {
        AMREX_D_PICK(amrex::ignore_unused(j,k), amrex::ignore_unused(k), (void)0);
        return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
                     IntVect(AMREX_D_DECL(i,j,k)),
                     t));
    }

    struct iterate_box {};
    struct iterate_box_comp {};

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box,I>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
    {
        auto const& pr = f(ibox,i,j,k);
        Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
    }

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box_comp,I>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
    {
        for (int n = 0; n < ncomp; ++n) {
            auto const& pr = f(ibox,i,j,k,n);
            Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
        }
    }
}

template <typename... Ps>
class ReduceOps
{
public:

    // This is public for CUDA
    template <typename I, typename MF, typename D, typename F>
    void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes > 0) {
            auto const& parforinfo = mf.getParForInfo(nghost,AMREX_GPU_MAX_THREADS);
            auto par_for_blocks = parforinfo.getBlocks();
            const int nblocks = par_for_blocks.first[nboxes];
            const int block_0_size = par_for_blocks.first[1];
            const int* dp_nblocks = par_for_blocks.second;
            const BoxIndexer* dp_boxes = parforinfo.getBoxes();

            auto const& stream = Gpu::gpuStream();
            auto pdst = reduce_data.devicePtr(stream);
            int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
            AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
            reduce_data.nBlocks(stream) = nblocks_ec;
            reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
            // device reduce needs local (i.e., shared) memory
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                Dim1 blockIdx {gh.blockIdx()};
                Dim1 threadIdx{gh.threadIdx()};
#else
            amrex::launch_global<AMREX_GPU_MAX_THREADS>
                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
                ([=] AMREX_GPU_DEVICE () noexcept
            {
#endif
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple& dst = pdst[blockIdx.x];
                if (threadIdx.x == 0) {
                    dst = r;
                }
                for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                    int ibox;
                    std::uint64_t icell;
                    if (dp_nblocks) {
                        ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock);
                        icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
                    } else {
                        ibox = iblock / block_0_size;
                        icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
                    }

                    BoxIndexer const& indexer = dp_boxes[ibox];
                    if (icell < indexer.numPts()) {
                        auto [i, j, k] = indexer(icell);
                        Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
                            (f, ibox, i, j, k, ncomp, r);
                    }
                }
#ifdef AMREX_USE_SYCL
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
            });
        }
    }

    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, reduce_data,
                           [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
                {
                    return f(li, i, j, k);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box{},
                    mf, nghost, 0, reduce_data, std::forward<F>(f));
        }
    }
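
    // The MF overloads of eval() pick between two launch strategies: when the
    // FabArray is not a fusing candidate, each local box gets its own
    // eval(Box,...) launch; otherwise a single fused kernel (eval_mf) walks
    // all local boxes, using getParForInfo() to map a global block index back
    // to (box, cell).  Either way the user lambda only returns a ReduceTuple
    // for one cell, e.g. (illustrative)
    //     [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) -> ReduceTuple
    //     { return { ma[box_no](i,j,k) }; }
    // where ma would be a MultiArray4 obtained from MultiFab::const_arrays().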

    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;

        const int nboxes = mf.local_size();

        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, ncomp, reduce_data,
                           [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
                {
                    return f(li, i, j, k, n);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box_comp{},
                    mf, nghost, ncomp, reduce_data, std::forward<F>(f));
        }
    }

    template <typename D, typename F>
    void eval (Box const& box, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        int ncells = box.numPts();
        const auto lo = amrex::lbound(box);
        const auto len = amrex::length(box);
        const auto lenxy = len.x*len.y;
        const auto lenx = len.x;
        IndexType ixtype = box.ixType();
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local (i.e., shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 blockDim {gh.blockDim()};
            Dim1 gridDim {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
                 icell < ncells; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) /   lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral<N>::value> >
    void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        int ncells = box.numPts();
        const auto lo = amrex::lbound(box);
        const auto len = amrex::length(box);
        const auto lenxy = len.x*len.y;
        const auto lenx = len.x;
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local (i.e., shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 blockDim {gh.blockDim()};
            Dim1 gridDim {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
                 icell < ncells; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) /   lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                for (N n = 0; n < ncomp; ++n) {
                    auto pr = f(i,j,k,n);
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral<N>::value> >
    void eval (N n, D & reduce_data, F const& f)
    {
        if (n <= 0) { return; }
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local (i.e., shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 blockDim {gh.blockDim()};
            Dim1 gridDim {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
                 i < n; i += stride) {
                auto pr = f(i);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = amrex::max(nblocks, nblocks_ec);
    }

    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto hp = reduce_data.hostPtr();

        if (m_result_is_ready) {
            return *hp;
        }

        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr();
        auto const& nblocks = reduce_data.nBlocks();
#if defined(AMREX_USE_SYCL)
        if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
            const int N = nblocks[0];
            if (N == 0) {
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
            } else {
                Gpu::PinnedVector<ReduceTuple> tmp(N);
                Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
                Gpu::streamSynchronize();
                for (int i = 1; i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
                }
                *hp = tmp[0];
            }
        } else
#endif
        {
            int maxblocks = reduce_data.maxBlocks();
#ifdef AMREX_USE_SYCL
            // device reduce needs local (i.e., shared) memory
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            // xxxxx SYCL todo: reduce bug workaround
            Gpu::DeviceVector<ReduceTuple> dtmp(1);
            auto presult = dtmp.data();
#else
            auto presult = hp;
#endif
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
                if (gh.threadIdx() == 0) { *presult = dst; }
            });
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
#endif
#else
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
            [=] AMREX_GPU_DEVICE () noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
                if (threadIdx.x == 0) { *hp = dst; }
            });
#endif
            Gpu::streamSynchronize();
        }

        m_result_is_ready = true;
        return *hp;
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
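
// Typical usage of ReduceOps/ReduceData (a sketch; it assumes a MultiFab mf,
// its MultiArray4 ma = mf.const_arrays(), and Real-valued components):
//
//     ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
//     ReduceData<Real, Real> reduce_data(reduce_op);
//     using ReduceTuple = typename decltype(reduce_data)::Type;
//     reduce_op.eval(mf, IntVect(0), reduce_data,
//         [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) -> ReduceTuple
//         {
//             Real x = ma[box_no](i,j,k);
//             return { x, amrex::Math::abs(x) };
//         });
//     ReduceTuple rt = reduce_data.value(reduce_op);
//     Real local_sum = amrex::get<0>(rt);  // per-rank results; use
//     Real local_max = amrex::get<1>(rt);  // ParallelDescriptor reductions for global ones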

namespace Reduce {

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
T Sum (N n, T const* v, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}
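
// Reduce::Sum/Min/Max/MinMax/AnyOf in this namespace are one-shot convenience
// wrappers around ReduceOps/ReduceData for flat index ranges: they build the
// matching ReduceOps, evaluate the lambda (or read v[i]) for i in [0,n),
// synchronize, and return the scalar result.  For example (illustrative, with
// x assumed already filled on the device):
//     amrex::Gpu::DeviceVector<Real> x(n);
//     Real total = amrex::Reduce::Sum(n, x.data());
// Note that v must be device-accessible in GPU builds and that each call
// blocks until the result is available on the host.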

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral<N>::value> >
T Sum (N n, F const& f, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral<N>::value> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral<N>::value> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
std::pair<T,T> MinMax (N n, T const* v)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        return {v[i],v[i]};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral<N>::value> >
std::pair<T,T> MinMax (N n, F const& f)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        T tmp = f(i);
        return {tmp,tmp};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}

template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
bool AnyOf (N n, T const* v, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    Gpu::DeviceScalar<int> ds(false);
    int* dp = ds.dataPtr();
    auto ec = Gpu::ExecutionConfig(n);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        gh.sharedBarrier();

        if (!(*has_any))
        {
            int r = false;
            for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }

            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) { *dp = 1; }
        }
    });
#endif
    return ds.dataValue();
}
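
// AnyOf uses a different strategy from the ReduceOps machinery above: a single
// int flag in device memory (ds) acts as the result.  Each block first reads
// the flag into shared memory; if an earlier block has already set it, the
// whole block skips its work.  Otherwise threads evaluate the predicate with
// an early break (i < n && !r) and a block-level sum decides whether thread 0
// sets the flag.  This gives a cheap early exit once any element satisfies the
// predicate, at the cost of a not-fully-deterministic amount of work.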

template <typename P>
bool AnyOf (Box const& box, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    Gpu::DeviceScalar<int> ds(false);
    int* dp = ds.dataPtr();
    int ncells = box.numPts();
    const auto lo = amrex::lbound(box);
    const auto len = amrex::length(box);
    const auto lenxy = len.x*len.y;
    const auto lenx = len.x;
    auto ec = Gpu::ExecutionConfig(ncells);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        gh.sharedBarrier();

        if (!(*has_any))
        {
            int r = false;
            for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
                 icell < ncells && !r; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) /   lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                r = pred(i,j,k) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
                        Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
                 icell < ncells && !r; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) /   lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                r = pred(i,j,k) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) { *dp = 1; }
        }
    });
#endif
    return ds.dataValue();
}

}

#else

template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        for (auto& t : m_tuple) {
            Reduce::detail::for_each_init<0, Type, Ps...>(t);
        }
    }

    ~ReduceData () = default;
    ReduceData (ReduceData<Ts...> const&) = delete;
    ReduceData (ReduceData<Ts...> &&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    Type value () { return m_fn_value(); }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    Vector<Type>& reference () { return m_tuple; }

    Type& reference (int tid)
    {
        if (m_tuple.size() == 1) {
            // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
            return m_tuple[0];
        } else {
            return m_tuple[tid];
        }
    }

private:
    Vector<Type> m_tuple;
    std::function<Type()> m_fn_value;
};
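
// In builds without GPU support, ReduceData simply keeps one tuple per OpenMP
// thread (or a single tuple when constructed inside a parallel region), each
// initialized to the operations' identity elements.  ReduceOps::eval() folds
// values into the calling thread's tuple with for_each_local, and value()
// combines the per-thread tuples serially, so the same user code works in
// both the GPU and the host-only configuration.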

template <typename... Ps>
class ReduceOps
{
private:

    template <typename D, typename F>
    AMREX_FORCE_INLINE
    static auto call_f (Box const& box, typename D::Type & r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
                                                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        const auto lo = amrex::lbound(box);
        const auto hi = amrex::ubound(box);
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
        }}}
    }

    template <typename D, typename F>
    AMREX_FORCE_INLINE
    static auto call_f (Box const& box, typename D::Type & r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
                                                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
    }

public:

    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
            Box const& b = mfi.growntilebox(nghost);
            const int li = mfi.LocalIndex();
            auto& rr = reduce_data.reference(OpenMP::get_thread_num());
            const auto lo = amrex::lbound(b);
            const auto hi = amrex::ubound(b);
            for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
            }}}
        }
    }

    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
            Box const& b = mfi.growntilebox(nghost);
            const int li = mfi.LocalIndex();
            auto& rr = reduce_data.reference(OpenMP::get_thread_num());
            const auto lo = amrex::lbound(b);
            const auto hi = amrex::ubound(b);
            for (int n = 0; n < ncomp; ++n) {
            for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
            }}}}
        }
    }

    template <typename D, typename F>
    void eval (Box const& box, D & reduce_data, F&& f)
    {
        auto& rr = reduce_data.reference(OpenMP::get_thread_num());
        call_f<D>(box, rr, std::forward<F>(f));
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto& rr = reduce_data.reference(OpenMP::get_thread_num());
        const auto lo = amrex::lbound(box);
        const auto hi = amrex::ubound(box);
        for (N n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
        }}}}
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto& rr = reduce_data.reference(OpenMP::get_thread_num());
        for (N i = 0; i < n; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
        }
    }

    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto& rrv = reduce_data.reference();
        if (! m_result_is_ready) {
            using ReduceTuple = typename D::Type;
            if (rrv.size() > 1) {
                for (int i = 1, N = rrv.size(); i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
                }
            }
            m_result_is_ready = true;
        }
        return rrv[0];
    }

    bool m_result_is_ready = false;

    void resetResultReadiness () { m_result_is_ready = false; }
};

namespace Reduce {

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(+:r)
#endif
    for (N i = 0; i < n; ++i) {
        r += f(i);
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::min(r,f(i));
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(max:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::max(r,f(i));
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    T r_min = std::numeric_limits<T>::max();
    T r_max = std::numeric_limits<T>::lowest();
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
#endif
    for (N i = 0; i < n; ++i) {
        T tmp = f(i);
        r_min = std::min(r_min,tmp);
        r_max = std::max(r_max,tmp);
    }
    return std::make_pair(r_min,r_max);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
}

template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P&& pred)
{
    return std::any_of(v, v+n, std::forward<P>(pred));
}

template <typename P>
bool AnyOf (Box const& box, P const& pred)
{
    const auto lo = amrex::lbound(box);
    const auto hi = amrex::ubound(box);
    for (int k = lo.z; k <= hi.z; ++k) {
    for (int j = lo.y; j <= hi.y; ++j) {
    for (int i = lo.x; i <= hi.x; ++i) {
        if (pred(i,j,k)) { return true; }
    }}}
    return false;
}

}

#endif

/**
 * \brief Return a GpuTuple containing the identity element for each operation
 * in ReduceOps.  For example, 0, lowest and max for ReduceOpSum, ReduceOpMax
 * and ReduceOpMin, respectively.
 */
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...> const&) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}

/**
 * \brief Return a GpuTuple containing the identity element for each ReduceOp
 * in TypeList.  For example, 0, lowest and max for ReduceOpSum, ReduceOpMax
 * and ReduceOpMin, respectively.
 */
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}
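
// Example (illustrative): IdentityTuple(GpuTuple<Real,Real>{},
// TypeList<ReduceOpMin,ReduceOpMax>{}) yields
// { std::numeric_limits<Real>::max(), std::numeric_limits<Real>::lowest() },
// which is a convenient starting value when partial results from several
// reductions are later merged by hand.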

}

#endif