AMReX_Reduce.H
1 #ifndef AMREX_REDUCE_H_
2 #define AMREX_REDUCE_H_
3 #include <AMReX_Config.H>
4 
5 #include <AMReX_Gpu.H>
6 #include <AMReX_Arena.H>
7 #include <AMReX_OpenMP.H>
8 #include <AMReX_MFIter.H>
9 #include <AMReX_ValLocPair.H>
10 
11 #include <algorithm>
12 #include <functional>
13 #include <limits>
14 
15 namespace amrex {
16 
17 namespace Reduce::detail {
18 
19 #ifdef AMREX_USE_GPU
20 #ifdef AMREX_USE_SYCL
21  template <std::size_t I, typename T, typename P>
23  void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
24  {
25  P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
26  }
27 
28  template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
30  void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
31  {
32  P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
33  for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
34  }
35 #else
36  template <std::size_t I, typename T, typename P>
38  void for_each_parallel (T& d, T const& s)
39  {
40  P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
41  }
42 
43  template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
45  void for_each_parallel (T& d, T const& s)
46  {
47  P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
48  for_each_parallel<I+1,T,P1,Ps...>(d, s);
49  }
50 #endif
51 #endif
52 
53  template <std::size_t I, typename T, typename P>
55  void for_each_local (T& d, T const& s)
56  {
57  P().local_update(amrex::get<I>(d), amrex::get<I>(s));
58  }
59 
60  template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
62  void for_each_local (T& d, T const& s)
63  {
64  P().local_update(amrex::get<I>(d), amrex::get<I>(s));
65  for_each_local<I+1,T,P1,Ps...>(d, s);
66  }
67 
68  template <std::size_t I, typename T, typename P>
70  constexpr void for_each_init (T& t)
71  {
72  P().init(amrex::get<I>(t));
73  }
74 
75  template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
77  constexpr void for_each_init (T& t)
78  {
79  P().init(amrex::get<I>(t));
80  for_each_init<I+1,T,P1,Ps...>(t);
81  }
82 }
83 
84 struct ReduceOpSum
85 {
86 
87 #ifdef AMREX_USE_GPU
88 #ifdef AMREX_USE_SYCL
89  template <typename T>
91  void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
92  T r = Gpu::blockReduceSum(s,h);
93  if (h.threadIdx() == 0) { d += r; }
94  }
95 #else
96  template <typename T, int MT=AMREX_GPU_MAX_THREADS>
98  void parallel_update (T& d, T const& s) const noexcept {
99  T r = Gpu::blockReduceSum<MT>(s);
100  if (threadIdx.x == 0) { d += r; }
101  }
102 #endif
103 #endif
104 
105  template <typename T>
107  void local_update (T& d, T const& s) const noexcept { d += s; }
108 
109  template <typename T>
110  constexpr void init (T& t) const noexcept { t = 0; }
111 };
112 
113 struct ReduceOpMin
114 {
115 #ifdef AMREX_USE_GPU
116 #ifdef AMREX_USE_SYCL
117  template <typename T>
119  void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
120  T r = Gpu::blockReduceMin(s,h);
121  if (h.threadIdx() == 0) { d = amrex::min(d,r); }
122  }
123 #else
124  template <typename T, int MT=AMREX_GPU_MAX_THREADS>
126  void parallel_update (T& d, T const& s) const noexcept {
127  T r = Gpu::blockReduceMin<MT>(s);
128  if (threadIdx.x == 0) { d = amrex::min(d,r); }
129  }
130 #endif
131 #endif
132 
133  template <typename T>
135  void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
136 
137  template <typename T>
138  constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
139  init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
140 
141  template <typename T>
142  constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
143  init (T& t) const noexcept { t = T::max(); }
144 };
145 
146 struct ReduceOpMax
147 {
148 #ifdef AMREX_USE_GPU
149 #ifdef AMREX_USE_SYCL
150  template <typename T>
152  void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
153  T r = Gpu::blockReduceMax(s,h);
154  if (h.threadIdx() == 0) { d = amrex::max(d,r); }
155  }
156 #else
157  template <typename T, int MT=AMREX_GPU_MAX_THREADS>
159  void parallel_update (T& d, T const& s) const noexcept {
160  T r = Gpu::blockReduceMax<MT>(s);
161  if (threadIdx.x == 0) { d = amrex::max(d,r); }
162  }
163 #endif
164 #endif
165 
166  template <typename T>
168  void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
169 
170  template <typename T>
171  constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
172  init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
173 
174  template <typename T>
175  constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
176  init (T& t) const noexcept { t = T::lowest(); }
177 };
178 
179 struct ReduceOpLogicalAnd
180 {
181 #ifdef AMREX_USE_GPU
182 #ifdef AMREX_USE_SYCL
183  template <typename T>
185  std::enable_if_t<std::is_integral<T>::value>
186  parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
187  T r = Gpu::blockReduceLogicalAnd(s,h);
188  if (h.threadIdx() == 0) { d = d && r; }
189  }
190 #else
191  template <typename T, int MT=AMREX_GPU_MAX_THREADS>
193  std::enable_if_t<std::is_integral<T>::value>
194  parallel_update (T& d, T s) const noexcept {
195  T r = Gpu::blockReduceLogicalAnd<MT>(s);
196  if (threadIdx.x == 0) { d = d && r; }
197  }
198 #endif
199 #endif
200 
201  template <typename T>
203  std::enable_if_t<std::is_integral_v<T>>
204  local_update (T& d, T s) const noexcept { d = d && s; }
205 
206  template <typename T>
207  constexpr std::enable_if_t<std::is_integral_v<T>>
208  init (T& t) const noexcept { t = true; }
209 };
210 
211 struct ReduceOpLogicalOr
212 {
213 #ifdef AMREX_USE_GPU
214 #ifdef AMREX_USE_SYCL
215  template <typename T>
217  std::enable_if_t<std::is_integral<T>::value>
218  parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
219  T r = Gpu::blockReduceLogicalOr(s,h);
220  if (h.threadIdx() == 0) { d = d || r; }
221  }
222 #else
223  template <typename T, int MT=AMREX_GPU_MAX_THREADS>
225  std::enable_if_t<std::is_integral<T>::value>
226  parallel_update (T& d, T s) const noexcept {
227  T r = Gpu::blockReduceLogicalOr<MT>(s);
228  if (threadIdx.x == 0) { d = d || r; }
229  }
230 #endif
231 #endif
232 
233  template <typename T>
235  std::enable_if_t<std::is_integral_v<T>>
236  local_update (T& d, T s) const noexcept { d = d || s; }
237 
238  template <typename T>
239  constexpr std::enable_if_t<std::is_integral_v<T>>
240  init (T& t) const noexcept { t = false; }
241 };
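
(Usage note, not part of the header.) Each ReduceOp* struct above is a small policy: init() sets the identity element, local_update() folds one value into a thread-local accumulator, and parallel_update() combines per-thread results across a GPU block. A minimal host-side sketch of how the detail helpers apply these policies slot by slot, for illustration only (user code normally goes through ReduceOps/ReduceData defined further below):

#include <AMReX_Reduce.H>

void policy_demo ()
{
    using namespace amrex;
    GpuTuple<Real,Real> acc;
    // Apply each op's init() to the matching tuple slot:
    // acc becomes {0, std::numeric_limits<Real>::max()}
    Reduce::detail::for_each_init<0, decltype(acc), ReduceOpSum, ReduceOpMin>(acc);
    GpuTuple<Real,Real> val{Real(2.0), Real(1.0)};
    // Apply each op's local_update(): acc becomes {2.0, 1.0}
    Reduce::detail::for_each_local<0, decltype(acc), ReduceOpSum, ReduceOpMin>(acc, val);
}
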
242 
243 template <typename... Ps> class ReduceOps;
244 
245 #ifdef AMREX_USE_GPU
246 
247 template <typename... Ts>
248 class ReduceData
249 {
250 public:
251  using Type = GpuTuple<Ts...>;
252 
253  template <typename... Ps>
254  explicit ReduceData (ReduceOps<Ps...>& reduce_op)
255  : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
256  m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
257  m_device_tuple((Type*)(The_Arena()->alloc(AMREX_GPU_MAX_STREAMS
258  * m_max_blocks * sizeof(Type)))),
259  m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
260  {
261  reduce_op.resetResultReadiness();
262  static_assert(std::is_trivially_copyable<Type>(),
263  "ReduceData::Type must be trivially copyable");
264  static_assert(std::is_trivially_destructible<Type>(),
265  "ReduceData::Type must be trivially destructible");
266 
267  new (m_host_tuple) Type();
268  m_nblocks.fill(0);
269  }
270 
271  ~ReduceData () {
272  The_Pinned_Arena()->free(m_host_tuple);
273  The_Arena()->free(m_device_tuple);
274  }
275 
276  ReduceData (ReduceData<Ts...> const&) = delete;
277  ReduceData (ReduceData<Ts...> &&) = delete;
278  void operator= (ReduceData<Ts...> const&) = delete;
279  void operator= (ReduceData<Ts...> &&) = delete;
280 
281  Type value ()
282  {
283  return m_fn_value();
284  }
285 
286  template <typename... Ps>
287  Type value (ReduceOps<Ps...>& reduce_op)
288  {
289  return reduce_op.value(*this);
290  }
291 
292  Type* devicePtr () { return m_device_tuple; }
293  Type* devicePtr (gpuStream_t const& s) {
294  return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
295  }
296 
297  Type* hostPtr () { return m_host_tuple; }
298 
299  GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
300  int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }
301 
302  int maxBlocks () const { return m_max_blocks; }
303 
304  int maxStreamIndex () const { return m_max_stream_index; }
305  void updateMaxStreamIndex (gpuStream_t const& s) {
306  m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
307  }
308 
309 private:
310  int m_max_blocks;
311  int m_max_stream_index = 0;
312  Type* m_host_tuple = nullptr;
313  Type* m_device_tuple = nullptr;
314  GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
315  std::function<Type()> m_fn_value;
316 };
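
(Usage note, not part of the header.) A ReduceData must be constructed from the ReduceOps object that will later consume it: the constructor resets the ops' result flag and allocates one pinned host tuple plus a per-stream block of device tuples. A minimal sketch, assuming amrex has been initialized:

amrex::ReduceOps<amrex::ReduceOpSum, amrex::ReduceOpMax> reduce_op;
amrex::ReduceData<amrex::Real, amrex::Real> reduce_data(reduce_op);
using ReduceTuple = typename decltype(reduce_data)::Type;  // GpuTuple<Real,Real>
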
317 
318 namespace Reduce::detail {
319  template <typename F>
321  auto call_f (F const& f, int i, int j, int k, IndexType)
322  noexcept -> decltype(f(0,0,0))
323  {
324  return f(i,j,k);
325  }
326 
327  template <typename F>
329  auto call_f (F const& f, int i, int j, int k, IndexType t)
330  noexcept -> decltype(f(Box()))
331  {
333  return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
334  IntVect(AMREX_D_DECL(i,j,k)),
335  t));
336  }
337 
338  struct iterate_box {};
339  struct iterate_box_comp {};
340 
341  template <typename I, typename F, typename T, typename... Ps,
342  std::enable_if_t<std::is_same<iterate_box,I>::value,int> = 0>
344  void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
345  {
346  auto const& pr = f(ibox,i,j,k);
347  Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
348  }
349 
350  template <typename I, typename F, typename T, typename... Ps,
351  std::enable_if_t<std::is_same<iterate_box_comp,I>::value,int> = 0>
353  void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
354  {
355  for (int n = 0; n < ncomp; ++n) {
356  auto const& pr = f(ibox,i,j,k,n);
357  Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
358  }
359  }
360 }
361 
362 template <typename... Ps>
363 class ReduceOps
364 {
365 public:
366 
367  // This is public for CUDA
368  template <typename I, typename MF, typename D, typename F>
369  void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
370  {
371  using ReduceTuple = typename D::Type;
372  const int nboxes = mf.local_size();
373  if (nboxes > 0) {
374  auto const& parforinfo = mf.getParForInfo(nghost,AMREX_GPU_MAX_THREADS);
375  auto par_for_blocks = parforinfo.getBlocks();
376  const int nblocks = par_for_blocks.first[nboxes];
377  const int block_0_size = par_for_blocks.first[1];
378  const int* dp_nblocks = par_for_blocks.second;
379  const BoxIndexer* dp_boxes = parforinfo.getBoxes();
380 
381  auto const& stream = Gpu::gpuStream();
382  auto pdst = reduce_data.devicePtr(stream);
383  int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
384  AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
385  reduce_data.nBlocks(stream) = nblocks_ec;
386  reduce_data.updateMaxStreamIndex(stream);
387 
388 #ifdef AMREX_USE_SYCL
389  // device reduce needs local(i.e., shared) memory
390  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
391  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
392  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
393  {
394  Dim1 blockIdx {gh.blockIdx()};
395  Dim1 threadIdx{gh.threadIdx()};
396 #else
397  amrex::launch_global<AMREX_GPU_MAX_THREADS>
398  <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
399  ([=] AMREX_GPU_DEVICE () noexcept
400  {
401 #endif
402  ReduceTuple r;
403  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
404  ReduceTuple& dst = pdst[blockIdx.x];
405  if (threadIdx.x == 0) {
406  dst = r;
407  }
408  for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
409  int ibox;
410  std::uint64_t icell;
411  if (dp_nblocks) {
412  ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock);
413  icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
414  } else {
415  ibox = iblock / block_0_size;
416  icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
417  }
418 
419  BoxIndexer const& indexer = dp_boxes[ibox];
420  if (icell < indexer.numPts()) {
421  auto [i, j, k] = indexer(icell);
422  Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
423  (f, ibox, i, j, k, ncomp, r);
424  }
425  }
426 #ifdef AMREX_USE_SYCL
427  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
428 #else
429  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
430 #endif
431  });
432  }
433  }
434 
435  template <typename MF, typename D, typename F>
436  std::enable_if_t<IsFabArray<MF>::value
437 #ifndef AMREX_USE_CUDA
438  && IsCallable<F, int, int, int, int>::value
439 #endif
440  >
441  eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
442  {
443  using ReduceTuple = typename D::Type;
444  const int nboxes = mf.local_size();
445  if (nboxes == 0) {
446  return;
447  } else if (!mf.isFusingCandidate()) {
448  for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
449  Box const& b = amrex::grow(mfi.validbox(), nghost);
450  const int li = mfi.LocalIndex();
451  this->eval(b, reduce_data,
452  [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
453  {
454  return f(li, i, j, k);
455  });
456  }
457  } else {
458  eval_mf(Reduce::detail::iterate_box{},
459  mf, nghost, 0, reduce_data, std::forward<F>(f));
460  }
461  }
462 
463  template <typename MF, typename D, typename F>
464  std::enable_if_t<IsFabArray<MF>::value
465 #ifndef AMREX_USE_CUDA
466  && IsCallable<F, int, int, int, int, int>::value
467 #endif
468  >
469  eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
470  {
471  using ReduceTuple = typename D::Type;
472 
473  const int nboxes = mf.local_size();
474 
475  if (nboxes == 0) {
476  return;
477  } else if (!mf.isFusingCandidate()) {
478  for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
479  Box const& b = amrex::grow(mfi.validbox(), nghost);
480  const int li = mfi.LocalIndex();
481  this->eval(b, ncomp, reduce_data,
482  [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
483  {
484  return f(li, i, j, k, n);
485  });
486  }
487  } else {
488  eval_mf(Reduce::detail::iterate_box_comp{},
489  mf, nghost, ncomp, reduce_data, std::forward<F>(f));
490  }
491  }
492 
493  template <typename D, typename F>
494  void eval (Box const& box, D & reduce_data, F const& f)
495  {
496  using ReduceTuple = typename D::Type;
497  auto const& stream = Gpu::gpuStream();
498  auto dp = reduce_data.devicePtr(stream);
499  int& nblocks = reduce_data.nBlocks(stream);
500  int ncells = box.numPts();
501  const auto lo = amrex::lbound(box);
502  const auto len = amrex::length(box);
503  const auto lenxy = len.x*len.y;
504  const auto lenx = len.x;
505  IndexType ixtype = box.ixType();
506  constexpr int nitems_per_thread = 4;
507  int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
508  / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
509  nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
510  reduce_data.updateMaxStreamIndex(stream);
511 #ifdef AMREX_USE_SYCL
512  // device reduce needs local(i.e., shared) memory
513  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
514  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
515  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
516  {
517  Dim1 blockIdx {gh.blockIdx()};
518  Dim1 threadIdx{gh.threadIdx()};
519  Dim1 gridDim {gh.gridDim()};
520 #else
521  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
522  [=] AMREX_GPU_DEVICE () noexcept
523  {
524 #endif
525  ReduceTuple r;
526  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
527  ReduceTuple& dst = *(dp+blockIdx.x);
528  if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
529  dst = r;
530  }
531  for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
532  icell < ncells; icell += stride) {
533  int k = icell / lenxy;
534  int j = (icell - k*lenxy) / lenx;
535  int i = (icell - k*lenxy) - j*lenx;
536  i += lo.x;
537  j += lo.y;
538  k += lo.z;
539  auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
540  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
541  }
542 #ifdef AMREX_USE_SYCL
543  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
544 #else
545  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
546 #endif
547  });
548  nblocks = std::max(nblocks, nblocks_ec);
549  }
550 
551  template <typename N, typename D, typename F,
552  typename M=std::enable_if_t<std::is_integral<N>::value> >
553  void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
554  {
555  using ReduceTuple = typename D::Type;
556  auto const& stream = Gpu::gpuStream();
557  auto dp = reduce_data.devicePtr(stream);
558  int& nblocks = reduce_data.nBlocks(stream);
559  int ncells = box.numPts();
560  const auto lo = amrex::lbound(box);
561  const auto len = amrex::length(box);
562  const auto lenxy = len.x*len.y;
563  const auto lenx = len.x;
564  constexpr int nitems_per_thread = 4;
565  int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
566  / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
567  nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
568  reduce_data.updateMaxStreamIndex(stream);
569 #ifdef AMREX_USE_SYCL
570  // device reduce needs local(i.e., shared) memory
571  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
572  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
573  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
574  {
575  Dim1 blockIdx {gh.blockIdx()};
576  Dim1 threadIdx{gh.threadIdx()};
577  Dim1 gridDim {gh.gridDim()};
578 #else
579  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
580  [=] AMREX_GPU_DEVICE () noexcept
581  {
582 #endif
583  ReduceTuple r;
584  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
585  ReduceTuple& dst = *(dp+blockIdx.x);
586  if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
587  dst = r;
588  }
589  for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
590  icell < ncells; icell += stride) {
591  int k = icell / lenxy;
592  int j = (icell - k*lenxy) / lenx;
593  int i = (icell - k*lenxy) - j*lenx;
594  i += lo.x;
595  j += lo.y;
596  k += lo.z;
597  for (N n = 0; n < ncomp; ++n) {
598  auto pr = f(i,j,k,n);
599  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
600  }
601  }
602 #ifdef AMREX_USE_SYCL
603  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
604 #else
605  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
606 #endif
607  });
608  nblocks = std::max(nblocks, nblocks_ec);
609  }
610 
611  template <typename N, typename D, typename F,
612  typename M=std::enable_if_t<std::is_integral<N>::value> >
613  void eval (N n, D & reduce_data, F const& f)
614  {
615  if (n <= 0) { return; }
616  using ReduceTuple = typename D::Type;
617  auto const& stream = Gpu::gpuStream();
618  auto dp = reduce_data.devicePtr(stream);
619  int& nblocks = reduce_data.nBlocks(stream);
620  constexpr int nitems_per_thread = 4;
621  int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
622  / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
623  nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
624  reduce_data.updateMaxStreamIndex(stream);
625 #ifdef AMREX_USE_SYCL
626  // device reduce needs local(i.e., shared) memory
627  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
628  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
629  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
630  {
631  Dim1 blockIdx {gh.blockIdx()};
632  Dim1 threadIdx{gh.threadIdx()};
633  Dim1 gridDim {gh.gridDim()};
634 #else
635  amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
636  [=] AMREX_GPU_DEVICE () noexcept
637  {
638 #endif
639  ReduceTuple r;
640  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
641  ReduceTuple& dst = *(dp+blockIdx.x);
642  if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
643  dst = r;
644  }
645  for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
646  i < n; i += stride) {
647  auto pr = f(i);
648  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
649  }
650 #ifdef AMREX_USE_SYCL
651  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
652 #else
653  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
654 #endif
655  });
656  nblocks = amrex::max(nblocks, nblocks_ec);
657  }
658 
659  template <typename D>
660  typename D::Type value (D & reduce_data)
661  {
662  auto hp = reduce_data.hostPtr();
663 
664  if (m_result_is_ready) {
665  return *hp;
666  }
667 
668  using ReduceTuple = typename D::Type;
669  auto const& stream = Gpu::gpuStream();
670  auto dp = reduce_data.devicePtr();
671  auto const& nblocks = reduce_data.nBlocks();
672 #if defined(AMREX_USE_SYCL)
673  if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
674  const int N = nblocks[0];
675  if (N == 0) {
676  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
677  } else {
678  Gpu::PinnedVector<ReduceTuple> tmp(N);
679  Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
680  Gpu::streamSynchronize();
681  for (int i = 1; i < N; ++i) {
682  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
683  }
684  *hp = tmp[0];
685  }
686  } else
687 #endif
688  {
689  int maxblocks = reduce_data.maxBlocks();
690 #ifdef AMREX_USE_SYCL
691  // device reduce needs local(i.e., shared) memory
692  constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
693 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
694  // xxxxx SYCL todo: reduce bug workaround
695  Gpu::DeviceVector<ReduceTuple> dtmp(1);
696  auto presult = dtmp.data();
697 #else
698  auto presult = hp;
699 #endif
700  amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
701  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
702  {
703  ReduceTuple r;
704  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
705  ReduceTuple dst = r;
706  for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
707  auto dp_stream = dp+istream*maxblocks;
708  for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
709  i < nblocks[istream]; i += stride) {
710  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
711  }
712  }
713  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
714  if (gh.threadIdx() == 0) { *presult = dst; }
715  });
716 #ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
717  Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
718 #endif
719 #else
720  amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
721  [=] AMREX_GPU_DEVICE () noexcept
722  {
723  ReduceTuple r;
724  Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
725  ReduceTuple dst = r;
726  for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
727  auto dp_stream = dp+istream*maxblocks;
728  for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
729  i < nblocks[istream]; i += stride) {
730  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
731  }
732  }
733  Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
734  if (threadIdx.x == 0) { *hp = dst; }
735  });
736 #endif
737  Gpu::streamSynchronize();
738  }
739 
740  m_result_is_ready = true;
741  return *hp;
742  }
743 
744 private:
745  bool m_result_is_ready = false;
746 
747 public:
748  void resetResultReadiness () { m_result_is_ready = false; }
749 };
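
(Usage sketch, not part of the header.) The typical pattern pairs a ReduceOps with a matching ReduceData, feeds boxes or tiles through eval(), and reads the per-process result with value(). A minimal sketch, assuming a MultiFab mf with at least one component:

#include <AMReX.H>
#include <AMReX_MultiFab.H>
#include <AMReX_Reduce.H>

void sum_and_max (amrex::MultiFab const& mf)
{
    using namespace amrex;
    ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
    ReduceData<Real, Real> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;

    for (MFIter mfi(mf, TilingIfNotGPU()); mfi.isValid(); ++mfi) {
        Box const& bx = mfi.tilebox();
        auto const& a = mf.const_array(mfi);
        reduce_op.eval(bx, reduce_data,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
            { return { a(i,j,k), a(i,j,k) }; });
    }

    ReduceTuple rt = reduce_data.value(reduce_op); // local to this MPI rank
    Real s = amrex::get<0>(rt);
    Real m = amrex::get<1>(rt);
    amrex::ignore_unused(s, m); // a parallel reduce would be needed for a global result
}
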
750 
751 namespace Reduce {
752 
753 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
754 T Sum (N n, T const* v, T init_val = 0)
755 {
756  ReduceOps<ReduceOpSum> reduce_op;
757  ReduceData<T> reduce_data(reduce_op);
758  using ReduceTuple = typename decltype(reduce_data)::Type;
759  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
760  ReduceTuple hv = reduce_data.value(reduce_op);
761  return amrex::get<0>(hv) + init_val;
762 }
763 
764 template <typename T, typename N, typename F,
765  typename M=std::enable_if_t<std::is_integral<N>::value> >
766 T Sum (N n, F const& f, T init_val = 0)
767 {
768  ReduceOps<ReduceOpSum> reduce_op;
769  ReduceData<T> reduce_data(reduce_op);
770  using ReduceTuple = typename decltype(reduce_data)::Type;
771  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
772  ReduceTuple hv = reduce_data.value(reduce_op);
773  return amrex::get<0>(hv) + init_val;
774 }
775 
776 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
777 T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
778 {
779  ReduceOps<ReduceOpMin> reduce_op;
780  ReduceData<T> reduce_data(reduce_op);
781  using ReduceTuple = typename decltype(reduce_data)::Type;
782  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
783  ReduceTuple hv = reduce_data.value(reduce_op);
784  return std::min(amrex::get<0>(hv),init_val);
785 }
786 
787 template <typename T, typename N, typename F,
788  typename M=std::enable_if_t<std::is_integral<N>::value> >
789 T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
790 {
791  ReduceOps<ReduceOpMin> reduce_op;
792  ReduceData<T> reduce_data(reduce_op);
793  using ReduceTuple = typename decltype(reduce_data)::Type;
794  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
795  ReduceTuple hv = reduce_data.value(reduce_op);
796  return std::min(amrex::get<0>(hv),init_val);
797 }
798 
799 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
800 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
801 {
802  ReduceOps<ReduceOpMax> reduce_op;
803  ReduceData<T> reduce_data(reduce_op);
804  using ReduceTuple = typename decltype(reduce_data)::Type;
805  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
806  ReduceTuple hv = reduce_data.value(reduce_op);
807  return std::max(amrex::get<0>(hv),init_val);
808 }
809 
810 template <typename T, typename N, typename F,
811  typename M=std::enable_if_t<std::is_integral<N>::value> >
812 T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
813 {
814  ReduceOps<ReduceOpMax> reduce_op;
815  ReduceData<T> reduce_data(reduce_op);
816  using ReduceTuple = typename decltype(reduce_data)::Type;
817  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
818  ReduceTuple hv = reduce_data.value(reduce_op);
819  return std::max(amrex::get<0>(hv),init_val);
820 }
821 
822 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
823 std::pair<T,T> MinMax (N n, T const* v)
824 {
825  ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
826  ReduceData<T,T> reduce_data(reduce_op);
827  using ReduceTuple = typename decltype(reduce_data)::Type;
828  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
829  return {v[i],v[i]};
830  });
831  auto hv = reduce_data.value(reduce_op);
832  return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
833 }
834 
835 template <typename T, typename N, typename F,
836  typename M=std::enable_if_t<std::is_integral<N>::value> >
837 std::pair<T,T> MinMax (N n, F const& f)
838 {
839  ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
840  ReduceData<T,T> reduce_data(reduce_op);
841  using ReduceTuple = typename decltype(reduce_data)::Type;
842  reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
843  T tmp = f(i);
844  return {tmp,tmp};
845  });
846  auto hv = reduce_data.value(reduce_op);
847  return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
848 }
849 
850 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
851 bool AnyOf (N n, T const* v, P const& pred)
852 {
853  Gpu::LaunchSafeGuard lsg(true);
854  Gpu::DeviceScalar<int> ds(false);
855  int* dp = ds.dataPtr();
856  auto ec = Gpu::ExecutionConfig(n);
857  ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
858 
859 #ifdef AMREX_USE_SYCL
860  const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
861  const std::size_t shared_mem_bytes = num_ints*sizeof(int);
862  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
863  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
864  int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
865  if (gh.threadIdx() == 0) { *has_any = *dp; }
866  gh.sharedBarrier();
867 
868  if (!(*has_any))
869  {
870  int r = false;
871  for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
872  i < n && !r; i += stride)
873  {
874  r = pred(v[i]) ? 1 : 0;
875  }
876 
877  r = Gpu::blockReduce<Gpu::Device::warp_size>
878  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
879  if (gh.threadIdx() == 0 && r) { *dp = 1; }
880  }
881  });
882 #else
883  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
884  [=] AMREX_GPU_DEVICE () noexcept {
885  __shared__ int has_any;
886  if (threadIdx.x == 0) { has_any = *dp; }
887  __syncthreads();
888 
889  if (!has_any)
890  {
891  int r = false;
892  for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
893  i < n && !r; i += stride)
894  {
895  r = pred(v[i]) ? 1 : 0;
896  }
897  r = Gpu::blockReduce<Gpu::Device::warp_size>
898  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
899  if (threadIdx.x == 0 && r) *dp = 1;
900  }
901  });
902 #endif
903  return ds.dataValue();
904 }
905 
906 template <typename P>
907 bool AnyOf (Box const& box, P const& pred)
908 {
909  Gpu::LaunchSafeGuard lsg(true);
910  Gpu::DeviceScalar<int> ds(false);
911  int* dp = ds.dataPtr();
912  int ncells = box.numPts();
913  const auto lo = amrex::lbound(box);
914  const auto len = amrex::length(box);
915  const auto lenxy = len.x*len.y;
916  const auto lenx = len.x;
917  auto ec = Gpu::ExecutionConfig(ncells);
918  ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
919 
920 #ifdef AMREX_USE_SYCL
921  const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
922  const std::size_t shared_mem_bytes = num_ints*sizeof(int);
923  amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
924  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
925  int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
926  if (gh.threadIdx() == 0) { *has_any = *dp; }
927  gh.sharedBarrier();
928 
929  if (!(*has_any))
930  {
931  int r = false;
932  for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
933  icell < ncells && !r; icell += stride) {
934  int k = icell / lenxy;
935  int j = (icell - k*lenxy) / lenx;
936  int i = (icell - k*lenxy) - j*lenx;
937  i += lo.x;
938  j += lo.y;
939  k += lo.z;
940  r = pred(i,j,k) ? 1 : 0;
941  }
942  r = Gpu::blockReduce<Gpu::Device::warp_size>
943  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
944  if (gh.threadIdx() == 0 && r) { *dp = 1; }
945  }
946  });
947 #else
948  AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
949  Gpu::gpuStream(),
950  [=] AMREX_GPU_DEVICE () noexcept {
951  __shared__ int has_any;
952  if (threadIdx.x == 0) { has_any = *dp; }
953  __syncthreads();
954 
955  if (!has_any)
956  {
957  int r = false;
958  for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
959  icell < ncells && !r; icell += stride) {
960  int k = icell / lenxy;
961  int j = (icell - k*lenxy) / lenx;
962  int i = (icell - k*lenxy) - j*lenx;
963  i += lo.x;
964  j += lo.y;
965  k += lo.z;
966  r = pred(i,j,k) ? 1 : 0;
967  }
968  r = Gpu::blockReduce<Gpu::Device::warp_size>
969  (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
970  if (threadIdx.x == 0 && r) *dp = 1;
971  }
972  });
973 #endif
974  return ds.dataValue();
975 }
976 
977 }
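
(Usage sketch, not part of the header.) The free functions above are convenience wrappers around single-operation reductions; the result type T must be given explicitly for the lambda overloads since it cannot be deduced. A minimal sketch, assuming d_v points to n values accessible from device code:

#include <AMReX.H>
#include <AMReX_Reduce.H>

void free_function_demo (int n, amrex::Real const* d_v)
{
    using namespace amrex;
    Real total = Reduce::Sum(n, d_v);                  // pointer overload, T deduced
    Real sumsq = Reduce::Sum<Real>(n,
        [=] AMREX_GPU_DEVICE (int i) -> Real { return d_v[i]*d_v[i]; });
    bool neg   = Reduce::AnyOf(n, d_v,
        [=] AMREX_GPU_DEVICE (Real x) { return x < Real(0.0); });
    amrex::ignore_unused(total, sumsq, neg);
}
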
978 
979 #else
980 
981 template <typename... Ts>
982 class ReduceData
983 {
984 public:
985  using Type = GpuTuple<Ts...>;
986 
987  template <typename... Ps>
988  explicit ReduceData (ReduceOps<Ps...>& reduce_op)
989  : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
990  m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
991  {
992  reduce_op.resetResultReadiness();
993  for (auto& t : m_tuple) {
994  Reduce::detail::for_each_init<0, Type, Ps...>(t);
995  }
996  }
997 
998  ~ReduceData () = default;
999  ReduceData (ReduceData<Ts...> const&) = delete;
1000  ReduceData (ReduceData<Ts...> &&) = delete;
1001  void operator= (ReduceData<Ts...> const&) = delete;
1002  void operator= (ReduceData<Ts...> &&) = delete;
1003 
1004  Type value () { return m_fn_value(); }
1005 
1006  template <typename... Ps>
1007  Type value (ReduceOps<Ps...>& reduce_op)
1008  {
1009  return reduce_op.value(*this);
1010  }
1011 
1012  Vector<Type>& reference () { return m_tuple; }
1013 
1014  Type& reference (int tid)
1015  {
1016  if (m_tuple.size() == 1) {
1017  // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1018  return m_tuple[0];
1019  } else {
1020  return m_tuple[tid];
1021  }
1022  }
1023 
1024 private:
1025  Vector<Type> m_tuple;
1026  std::function<Type()> m_fn_value;
1027 };
1028 
1029 template <typename... Ps>
1030 class ReduceOps
1031 {
1032 private:
1033 
1034  template <typename D, typename F>
1036  static auto call_f (Box const& box, typename D::Type & r, F const& f)
1037  noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
1038  typename D::Type>>
1039  {
1040  using ReduceTuple = typename D::Type;
1041  const auto lo = amrex::lbound(box);
1042  const auto hi = amrex::ubound(box);
1043  for (int k = lo.z; k <= hi.z; ++k) {
1044  for (int j = lo.y; j <= hi.y; ++j) {
1045  for (int i = lo.x; i <= hi.x; ++i) {
1046  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
1047  }}}
1048  }
1049 
1050  template <typename D, typename F>
1052  static auto call_f (Box const& box, typename D::Type & r, F const& f)
1053  noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
1054  typename D::Type>>
1055  {
1056  using ReduceTuple = typename D::Type;
1057  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1058  }
1059 
1060 public:
1061 
1062  template <typename MF, typename D, typename F>
1063  std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1064  eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1065  {
1066  using ReduceTuple = typename D::Type;
1067 #ifdef AMREX_USE_OMP
1068 #pragma omp parallel
1069 #endif
1070  for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1071  Box const& b = mfi.growntilebox(nghost);
1072  const int li = mfi.LocalIndex();
1073  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1074  const auto lo = amrex::lbound(b);
1075  const auto hi = amrex::ubound(b);
1076  for (int k = lo.z; k <= hi.z; ++k) {
1077  for (int j = lo.y; j <= hi.y; ++j) {
1078  for (int i = lo.x; i <= hi.x; ++i) {
1079  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1080  }}}
1081  }
1082  }
1083 
1084  template <typename MF, typename D, typename F>
1085  std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1086  eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1087  {
1088  using ReduceTuple = typename D::Type;
1089 #ifdef AMREX_USE_OMP
1090 #pragma omp parallel
1091 #endif
1092  for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1093  Box const& b = mfi.growntilebox(nghost);
1094  const int li = mfi.LocalIndex();
1095  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1096  const auto lo = amrex::lbound(b);
1097  const auto hi = amrex::ubound(b);
1098  for (int n = 0; n < ncomp; ++n) {
1099  for (int k = lo.z; k <= hi.z; ++k) {
1100  for (int j = lo.y; j <= hi.y; ++j) {
1101  for (int i = lo.x; i <= hi.x; ++i) {
1102  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1103  }}}}
1104  }
1105  }
1106 
1107  template <typename D, typename F>
1108  void eval (Box const& box, D & reduce_data, F&& f)
1109  {
1110  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1111  call_f<D>(box, rr, std::forward<F>(f));
1112  }
1113 
1114  template <typename N, typename D, typename F,
1115  typename M=std::enable_if_t<std::is_integral_v<N>> >
1116  void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
1117  {
1118  using ReduceTuple = typename D::Type;
1119  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1120  const auto lo = amrex::lbound(box);
1121  const auto hi = amrex::ubound(box);
1122  for (N n = 0; n < ncomp; ++n) {
1123  for (int k = lo.z; k <= hi.z; ++k) {
1124  for (int j = lo.y; j <= hi.y; ++j) {
1125  for (int i = lo.x; i <= hi.x; ++i) {
1126  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
1127  }}}}
1128  }
1129 
1130  template <typename N, typename D, typename F,
1131  typename M=std::enable_if_t<std::is_integral_v<N>> >
1132  void eval (N n, D & reduce_data, F const& f)
1133  {
1134  using ReduceTuple = typename D::Type;
1135  auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1136  for (N i = 0; i < n; ++i) {
1137  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1138  }
1139  }
1140 
1141  template <typename D>
1142  typename D::Type value (D & reduce_data)
1143  {
1144  auto& rrv = reduce_data.reference();
1145  if (! m_result_is_ready) {
1146  using ReduceTuple = typename D::Type;
1147  if (rrv.size() > 1) {
1148  for (int i = 1, N = rrv.size(); i < N; ++i) {
1149  Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1150  }
1151  }
1152  m_result_is_ready = true;
1153  }
1154  return rrv[0];
1155  }
1156 
1157  bool m_result_is_ready = false;
1158 
1159  void resetResultReadiness () { m_result_is_ready = false; }
1160 };
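
(Usage note, not part of the header.) In host builds the same user-level interface applies: ReduceData keeps one accumulator tuple per OpenMP thread (or a single one when constructed inside a parallel region) and value() folds them together. A minimal sketch; the AMREX_GPU_DEVICE qualifier is a no-op here and keeps the code valid for GPU builds as well:

#include <AMReX_Reduce.H>

amrex::Real sum_of_squares (int n)
{
    using namespace amrex;
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<Real> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data,
        [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple { return { Real(i)*Real(i) }; });
    return amrex::get<0>(reduce_data.value(reduce_op));
}
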
1161 
1162 namespace Reduce {
1163 
1164 template <typename T, typename N, typename F,
1165  typename M=std::enable_if_t<std::is_integral_v<N>> >
1166 T Sum (N n, F const& f, T init_val = 0)
1167 {
1168  T r = init_val;
1169 #ifdef AMREX_USE_OMP
1170 #pragma omp parallel for reduction(+:r)
1171 #endif
1172  for (N i = 0; i < n; ++i) {
1173  r += f(i);
1174  }
1175  return r;
1176 }
1177 
1178 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1179 T Sum (N n, T const* v, T init_val = 0)
1180 {
1181  return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1182 }
1183 
1184 template <typename T, typename N, typename F,
1185  typename M=std::enable_if_t<std::is_integral_v<N>> >
1186 T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
1187 {
1188  T r = init_val;
1189 #ifdef AMREX_USE_OMP
1190 #pragma omp parallel for reduction(min:r)
1191 #endif
1192  for (N i = 0; i < n; ++i) {
1193  r = std::min(r,f(i));
1194  }
1195  return r;
1196 }
1197 
1198 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1199 T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
1200 {
1201  return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1202 }
1203 
1204 template <typename T, typename N, typename F,
1205  typename M=std::enable_if_t<std::is_integral_v<N>> >
1206 T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
1207 {
1208  T r = init_val;
1209 #ifdef AMREX_USE_OMP
1210 #pragma omp parallel for reduction(max:r)
1211 #endif
1212  for (N i = 0; i < n; ++i) {
1213  r = std::max(r,f(i));
1214  }
1215  return r;
1216 }
1217 
1218 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1219 T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
1220 {
1221  return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1222 }
1223 
1224 template <typename T, typename N, typename F,
1225  typename M=std::enable_if_t<std::is_integral_v<N>> >
1226 std::pair<T,T> MinMax (N n, F const& f)
1227 {
1228  T r_min = std::numeric_limits<T>::max();
1229  T r_max = std::numeric_limits<T>::lowest();
1230 #ifdef AMREX_USE_OMP
1231 #pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1232 #endif
1233  for (N i = 0; i < n; ++i) {
1234  T tmp = f(i);
1235  r_min = std::min(r_min,tmp);
1236  r_max = std::max(r_max,tmp);
1237  }
1238  return std::make_pair(r_min,r_max);
1239 }
1240 
1241 template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1242 std::pair<T,T> MinMax (N n, T const* v)
1243 {
1244  return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1245 }
1246 
1247 template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1248 bool AnyOf (N n, T const* v, P&& pred)
1249 {
1250  return std::any_of(v, v+n, std::forward<P>(pred));
1251 }
1252 
1253 template <typename P>
1254 bool AnyOf (Box const& box, P const& pred)
1255 {
1256  const auto lo = amrex::lbound(box);
1257  const auto hi = amrex::ubound(box);
1258  for (int k = lo.z; k <= hi.z; ++k) {
1259  for (int j = lo.y; j <= hi.y; ++j) {
1260  for (int i = lo.x; i <= hi.x; ++i) {
1261  if (pred(i,j,k)) { return true; }
1262  }}}
1263  return false;
1264 }
1265 
1266 }
1267 
1268 #endif
1269 
1274 template <typename... Ts, typename... Ps>
1275 AMREX_GPU_HOST_DEVICE
1276 constexpr GpuTuple<Ts...>
1277 IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...> const&) noexcept
1278 {
1279  GpuTuple<Ts...> r{};
1280  Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1281  return r;
1282 }
1283 
1288 template <typename... Ts, typename... Ps>
1289 AMREX_GPU_HOST_DEVICE
1290 constexpr GpuTuple<Ts...>
1291 IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
1292 {
1293  GpuTuple<Ts...> r{};
1294  Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1295  return r;
1296 }
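
(Usage sketch, not part of the header.) IdentityTuple returns the identity element of each reduce operation, which is convenient for initializing partial results held outside of ReduceData. A minimal sketch:

#include <AMReX.H>
#include <AMReX_Reduce.H>
#include <AMReX_TypeList.H>

void identity_demo ()
{
    using namespace amrex;
    auto r = IdentityTuple(GpuTuple<Real,Real>{},
                           TypeList<ReduceOpSum,ReduceOpMin>{});
    // get<0>(r) == 0, get<1>(r) == std::numeric_limits<Real>::max()
    amrex::ignore_unused(r);
}
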
1297 
1298 }
1299 
1300 #endif