AMReX_Reduce.H
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_Arena.H>
7#include <AMReX_OpenMP.H>
8#include <AMReX_MFIter.H>
9#include <AMReX_ValLocPair.H>
10
11#include <algorithm>
12#include <functional>
13#include <limits>
14
15namespace amrex {
16
17namespace Reduce::detail {
18
19#ifdef AMREX_USE_GPU
20#ifdef AMREX_USE_SYCL
21 template <std::size_t I, typename T, typename P>
22 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
23 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
24 {
25 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
26 }
27
28 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
29 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
30 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
31 {
32 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
33 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
34 }
35#else
36 template <std::size_t I, typename T, typename P>
37 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
38 void for_each_parallel (T& d, T const& s)
39 {
40 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
41 }
42
43 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
44 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
45 void for_each_parallel (T& d, T const& s)
46 {
47 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
48 for_each_parallel<I+1,T,P1,Ps...>(d, s);
49 }
50#endif
51#endif
52
53 template <std::size_t I, typename T, typename P>
54 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
55 void for_each_local (T& d, T const& s)
56 {
57 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
58 }
59
60 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
61 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
62 void for_each_local (T& d, T const& s)
63 {
64 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
65 for_each_local<I+1,T,P1,Ps...>(d, s);
66 }
67
68 template <std::size_t I, typename T, typename P>
69 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
70 constexpr void for_each_init (T& t)
71 {
72 P().init(amrex::get<I>(t));
73 }
74
75 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
76 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
77 constexpr void for_each_init (T& t)
78 {
79 P().init(amrex::get<I>(t));
80 for_each_init<I+1,T,P1,Ps...>(t);
81 }
82}
83
84struct ReduceOpSum
85{
86
87#ifdef AMREX_USE_GPU
88#ifdef AMREX_USE_SYCL
89 template <typename T>
90 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
91 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
92 T r = Gpu::blockReduceSum(s,h);
93 if (h.threadIdx() == 0) { d += r; }
94 }
95#else
96 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
97 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
98 void parallel_update (T& d, T const& s) const noexcept {
99 T r = Gpu::blockReduceSum<MT>(s);
100 if (threadIdx.x == 0) { d += r; }
101 }
102#endif
103#endif
104
105 template <typename T>
106 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
107 void local_update (T& d, T const& s) const noexcept { d += s; }
108
109 template <typename T>
110 constexpr void init (T& t) const noexcept { t = 0; }
111};
112
113struct ReduceOpMin
114{
115#ifdef AMREX_USE_GPU
116#ifdef AMREX_USE_SYCL
117 template <typename T>
118 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
119 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
120 T r = Gpu::blockReduceMin(s,h);
121 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
122 }
123#else
124 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
125 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
126 void parallel_update (T& d, T const& s) const noexcept {
127 T r = Gpu::blockReduceMin<MT>(s);
128 if (threadIdx.x == 0) { d = amrex::min(d,r); }
129 }
130#endif
131#endif
132
133 template <typename T>
134 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
135 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
136
137 template <typename T>
138 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
139 init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
140
141 template <typename T>
142 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
143 init (T& t) const noexcept { t = T::max(); }
144};
145
146struct ReduceOpMax
147{
148#ifdef AMREX_USE_GPU
149#ifdef AMREX_USE_SYCL
150 template <typename T>
151 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
152 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
153 T r = Gpu::blockReduceMax(s,h);
154 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
155 }
156#else
157 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
158 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
159 void parallel_update (T& d, T const& s) const noexcept {
160 T r = Gpu::blockReduceMax<MT>(s);
161 if (threadIdx.x == 0) { d = amrex::max(d,r); }
162 }
163#endif
164#endif
165
166 template <typename T>
167 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
168 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
169
170 template <typename T>
171 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
172 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
173
174 template <typename T>
175 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
176 init (T& t) const noexcept { t = T::lowest(); }
177};
178
179struct ReduceOpLogicalAnd
180{
181#ifdef AMREX_USE_GPU
182#ifdef AMREX_USE_SYCL
183 template <typename T>
184 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
185 std::enable_if_t<std::is_integral<T>::value>
186 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
187 T r = Gpu::blockReduceLogicalAnd(s,h);
188 if (h.threadIdx() == 0) { d = d && r; }
189 }
190#else
191 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
192 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
193 std::enable_if_t<std::is_integral<T>::value>
194 parallel_update (T& d, T s) const noexcept {
195 T r = Gpu::blockReduceLogicalAnd<MT>(s);
196 if (threadIdx.x == 0) { d = d && r; }
197 }
198#endif
199#endif
200
201 template <typename T>
202 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
203 std::enable_if_t<std::is_integral_v<T>>
204 local_update (T& d, T s) const noexcept { d = d && s; }
205
206 template <typename T>
207 constexpr std::enable_if_t<std::is_integral_v<T>>
208 init (T& t) const noexcept { t = true; }
209};
210
211struct ReduceOpLogicalOr
212{
213#ifdef AMREX_USE_GPU
214#ifdef AMREX_USE_SYCL
215 template <typename T>
216 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
217 std::enable_if_t<std::is_integral<T>::value>
218 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
219 T r = Gpu::blockReduceLogicalOr(s,h);
220 if (h.threadIdx() == 0) { d = d || r; }
221 }
222#else
223 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
224 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
225 std::enable_if_t<std::is_integral<T>::value>
226 parallel_update (T& d, T s) const noexcept {
227 T r = Gpu::blockReduceLogicalOr<MT>(s);
228 if (threadIdx.x == 0) { d = d || r; }
229 }
230#endif
231#endif
232
233 template <typename T>
234 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
235 std::enable_if_t<std::is_integral_v<T>>
236 local_update (T& d, T s) const noexcept { d = d || s; }
237
238 template <typename T>
239 constexpr std::enable_if_t<std::is_integral_v<T>>
240 init (T& t) const noexcept { t = false; }
241};
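// Note: each ReduceOp above supplies three pieces: init() sets the identity
// element, local_update() folds one value into a thread-local partial result,
// and, in GPU builds, parallel_update() combines the partial results of a
// thread block. A hypothetical host-side product operation (not one of the ops
// provided by this header), usable with the CPU-only ReduceOps implementation
// further below (which calls only init() and local_update()), might look like
// this sketch:
//
//     struct ReduceOpProdExample // hypothetical example
//     {
//         template <typename T>
//         void local_update (T& d, T const& s) const noexcept { d *= s; }
//
//         template <typename T>
//         constexpr void init (T& t) const noexcept { t = 1; }
//     };
//
// A GPU-capable operation would also need a parallel_update() built on a
// block-wide reduction such as the Gpu::blockReduce* helpers used above.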
242
243template <typename... Ps> class ReduceOps;
244
245#ifdef AMREX_USE_GPU
246
247template <typename... Ts>
248class ReduceData
249{
250public:
251 using Type = GpuTuple<Ts...>;
252
253 template <typename... Ps>
254 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
255 : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
256 m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
257 m_device_tuple((Type*)(The_Arena()->alloc(AMREX_GPU_MAX_STREAMS
258 * m_max_blocks * sizeof(Type)))),
259 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
260 {
261 reduce_op.resetResultReadiness();
262 static_assert(std::is_trivially_copyable<Type>(),
263 "ReduceData::Type must be trivially copyable");
264 static_assert(std::is_trivially_destructible<Type>(),
265 "ReduceData::Type must be trivially destructible");
266
267 new (m_host_tuple) Type();
268 m_nblocks.fill(0);
269 }
270
271 ~ReduceData () {
272 The_Pinned_Arena()->free(m_host_tuple);
273 The_Arena()->free(m_device_tuple);
274 }
275
276 ReduceData (ReduceData<Ts...> const&) = delete;
277 ReduceData (ReduceData<Ts...> &&) = delete;
278 void operator= (ReduceData<Ts...> const&) = delete;
279 void operator= (ReduceData<Ts...> &&) = delete;
280
281 Type value ()
282 {
283 return m_fn_value();
284 }
285
286 template <typename... Ps>
287 Type value (ReduceOps<Ps...>& reduce_op)
288 {
289 return reduce_op.value(*this);
290 }
291
292 Type* devicePtr () { return m_device_tuple; }
293 Type* devicePtr (gpuStream_t const& s) {
294 return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
295 }
296
297 Type* hostPtr () { return m_host_tuple; }
298
299 GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
300 int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }
301
302 int maxBlocks () const { return m_max_blocks; }
303
304 int maxStreamIndex () const { return m_max_stream_index; }
305 void updateMaxStreamIndex (gpuStream_t const& s) {
306 m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
307 }
308
309private:
310 int m_max_blocks;
311 int m_max_stream_index = 0;
312 Type* m_host_tuple = nullptr;
313 Type* m_device_tuple = nullptr;
314 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
315 std::function<Type()> m_fn_value;
316};
317
318namespace Reduce::detail {
319 template <typename F>
320 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
321 auto call_f (F const& f, int i, int j, int k, IndexType)
322 noexcept -> decltype(f(0,0,0))
323 {
324 return f(i,j,k);
325 }
326
327 template <typename F>
328 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
329 auto call_f (F const& f, int i, int j, int k, IndexType t)
330 noexcept -> decltype(f(Box()))
331 {
333 return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
334 IntVect(AMREX_D_DECL(i,j,k)),
335 t));
336 }
337
338 struct iterate_box {};
339 struct iterate_box_comp {};
340
341 template <typename I, typename F, typename T, typename... Ps,
342 std::enable_if_t<std::is_same<iterate_box,I>::value,int> = 0>
343 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
344 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
345 {
346 auto const& pr = f(ibox,i,j,k);
347 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
348 }
349
350 template <typename I, typename F, typename T, typename... Ps,
351 std::enable_if_t<std::is_same<iterate_box_comp,I>::value,int> = 0>
352 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
353 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
354 {
355 for (int n = 0; n < ncomp; ++n) {
356 auto const& pr = f(ibox,i,j,k,n);
357 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
358 }
359 }
360}
361
362template <typename... Ps>
363class ReduceOps
364{
365public:
366
367 // This is public for CUDA
368 template <typename I, typename MF, typename D, typename F>
369 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
370 {
371 using ReduceTuple = typename D::Type;
372 const int nboxes = mf.local_size();
373 if (nboxes > 0) {
374 auto const& parforinfo = mf.getParForInfo(nghost,AMREX_GPU_MAX_THREADS);
375 auto par_for_blocks = parforinfo.getBlocks();
376 const int nblocks = par_for_blocks.first[nboxes];
377 const int block_0_size = par_for_blocks.first[1];
378 const int* dp_nblocks = par_for_blocks.second;
379 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
380
381 auto const& stream = Gpu::gpuStream();
382 auto pdst = reduce_data.devicePtr(stream);
383 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
384 AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
385 reduce_data.nBlocks(stream) = nblocks_ec;
386 reduce_data.updateMaxStreamIndex(stream);
387
388#ifdef AMREX_USE_SYCL
389 // device reduce needs local(i.e., shared) memory
390 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
391 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
392 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
393 {
394 Dim1 blockIdx {gh.blockIdx()};
395 Dim1 threadIdx{gh.threadIdx()};
396#else
397 amrex::launch_global<AMREX_GPU_MAX_THREADS>
398 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
399 ([=] AMREX_GPU_DEVICE () noexcept
400 {
401#endif
402 ReduceTuple r;
403 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
404 ReduceTuple& dst = pdst[blockIdx.x];
405 if (threadIdx.x == 0) {
406 dst = r;
407 }
408 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
409 int ibox;
410 std::uint64_t icell;
411 if (dp_nblocks) {
412 ibox = amrex::bisect(dp_nblocks, 0, nboxes, iblock);
413 icell = std::uint64_t(iblock-dp_nblocks[ibox])*AMREX_GPU_MAX_THREADS + threadIdx.x;
414 } else {
415 ibox = iblock / block_0_size;
416 icell = std::uint64_t(iblock-ibox*block_0_size)*AMREX_GPU_MAX_THREADS + threadIdx.x;
417 }
418
419 BoxIndexer const& indexer = dp_boxes[ibox];
420 if (icell < indexer.numPts()) {
421 auto [i, j, k] = indexer(icell);
422 Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
423 (f, ibox, i, j, k, ncomp, r);
424 }
425 }
426#ifdef AMREX_USE_SYCL
427 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
428#else
429 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
430#endif
431 });
432 }
433 }
434
435 template <typename MF, typename D, typename F>
436 std::enable_if_t<IsFabArray<MF>::value
437#ifndef AMREX_USE_CUDA
438 && IsCallable<F, int, int, int, int>::value
439#endif
440 >
441 eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
442 {
443 using ReduceTuple = typename D::Type;
444 const int nboxes = mf.local_size();
445 if (nboxes == 0) {
446 return;
447 } else if (!mf.isFusingCandidate()) {
448 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
449 Box const& b = amrex::grow(mfi.validbox(), nghost);
450 const int li = mfi.LocalIndex();
451 this->eval(b, reduce_data,
452 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
453 {
454 return f(li, i, j, k);
455 });
456 }
457 } else {
458 eval_mf(Reduce::detail::iterate_box{},
459 mf, nghost, 0, reduce_data, std::forward<F>(f));
460 }
461 }
462
463 template <typename MF, typename D, typename F>
464 std::enable_if_t<IsFabArray<MF>::value
465#ifndef AMREX_USE_CUDA
466 && IsCallable<F, int, int, int, int, int>::value
467#endif
468 >
469 eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
470 {
471 using ReduceTuple = typename D::Type;
472
473 const int nboxes = mf.local_size();
474
475 if (nboxes == 0) {
476 return;
477 } else if (!mf.isFusingCandidate()) {
478 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
479 Box const& b = amrex::grow(mfi.validbox(), nghost);
480 const int li = mfi.LocalIndex();
481 this->eval(b, ncomp, reduce_data,
482 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
483 {
484 return f(li, i, j, k, n);
485 });
486 }
487 } else {
488 eval_mf(Reduce::detail::iterate_box_comp{},
489 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
490 }
491 }
492
493 template <typename D, typename F>
494 void eval (Box const& box, D & reduce_data, F const& f)
495 {
496 using ReduceTuple = typename D::Type;
497 auto const& stream = Gpu::gpuStream();
498 auto dp = reduce_data.devicePtr(stream);
499 int& nblocks = reduce_data.nBlocks(stream);
500 int ncells = box.numPts();
501 const auto lo = amrex::lbound(box);
502 const auto len = amrex::length(box);
503 const auto lenxy = len.x*len.y;
504 const auto lenx = len.x;
505 IndexType ixtype = box.ixType();
506 constexpr int nitems_per_thread = 4;
507 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
508 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
509 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
510 reduce_data.updateMaxStreamIndex(stream);
511#ifdef AMREX_USE_SYCL
512 // device reduce needs local(i.e., shared) memory
513 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
514 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
515 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
516 {
517 Dim1 blockIdx {gh.blockIdx()};
518 Dim1 threadIdx{gh.threadIdx()};
519 Dim1 gridDim {gh.gridDim()};
520#else
521 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
522 [=] AMREX_GPU_DEVICE () noexcept
523 {
524#endif
525 ReduceTuple r;
526 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
527 ReduceTuple& dst = *(dp+blockIdx.x);
528 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
529 dst = r;
530 }
531 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
532 icell < ncells; icell += stride) {
533 int k = icell / lenxy;
534 int j = (icell - k*lenxy) / lenx;
535 int i = (icell - k*lenxy) - j*lenx;
536 i += lo.x;
537 j += lo.y;
538 k += lo.z;
539 auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
540 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
541 }
542#ifdef AMREX_USE_SYCL
543 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
544#else
545 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
546#endif
547 });
548 nblocks = std::max(nblocks, nblocks_ec);
549 }
550
551 template <typename N, typename D, typename F,
552 typename M=std::enable_if_t<std::is_integral<N>::value> >
553 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
554 {
555 using ReduceTuple = typename D::Type;
556 auto const& stream = Gpu::gpuStream();
557 auto dp = reduce_data.devicePtr(stream);
558 int& nblocks = reduce_data.nBlocks(stream);
559 int ncells = box.numPts();
560 const auto lo = amrex::lbound(box);
561 const auto len = amrex::length(box);
562 const auto lenxy = len.x*len.y;
563 const auto lenx = len.x;
564 constexpr int nitems_per_thread = 4;
565 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
566 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
567 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
568 reduce_data.updateMaxStreamIndex(stream);
569#ifdef AMREX_USE_SYCL
570 // device reduce needs local(i.e., shared) memory
571 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
572 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
573 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
574 {
575 Dim1 blockIdx {gh.blockIdx()};
576 Dim1 threadIdx{gh.threadIdx()};
577 Dim1 gridDim {gh.gridDim()};
578#else
579 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
580 [=] AMREX_GPU_DEVICE () noexcept
581 {
582#endif
583 ReduceTuple r;
584 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
585 ReduceTuple& dst = *(dp+blockIdx.x);
586 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
587 dst = r;
588 }
589 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
590 icell < ncells; icell += stride) {
591 int k = icell / lenxy;
592 int j = (icell - k*lenxy) / lenx;
593 int i = (icell - k*lenxy) - j*lenx;
594 i += lo.x;
595 j += lo.y;
596 k += lo.z;
597 for (N n = 0; n < ncomp; ++n) {
598 auto pr = f(i,j,k,n);
599 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
600 }
601 }
602#ifdef AMREX_USE_SYCL
603 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
604#else
605 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
606#endif
607 });
608 nblocks = std::max(nblocks, nblocks_ec);
609 }
610
611 template <typename N, typename D, typename F,
612 typename M=std::enable_if_t<std::is_integral<N>::value> >
613 void eval (N n, D & reduce_data, F const& f)
614 {
615 if (n <= 0) { return; }
616 using ReduceTuple = typename D::Type;
617 auto const& stream = Gpu::gpuStream();
618 auto dp = reduce_data.devicePtr(stream);
619 int& nblocks = reduce_data.nBlocks(stream);
620 constexpr int nitems_per_thread = 4;
621 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
622 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
623 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
624 reduce_data.updateMaxStreamIndex(stream);
625#ifdef AMREX_USE_SYCL
626 // device reduce needs local(i.e., shared) memory
627 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
628 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
629 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
630 {
631 Dim1 blockIdx {gh.blockIdx()};
632 Dim1 threadIdx{gh.threadIdx()};
633 Dim1 gridDim {gh.gridDim()};
634#else
635 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
636 [=] AMREX_GPU_DEVICE () noexcept
637 {
638#endif
639 ReduceTuple r;
640 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
641 ReduceTuple& dst = *(dp+blockIdx.x);
642 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
643 dst = r;
644 }
645 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
646 i < n; i += stride) {
647 auto pr = f(i);
648 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
649 }
650#ifdef AMREX_USE_SYCL
651 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
652#else
653 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
654#endif
655 });
656 nblocks = amrex::max(nblocks, nblocks_ec);
657 }
658
659 template <typename D>
660 typename D::Type value (D & reduce_data)
661 {
662 auto hp = reduce_data.hostPtr();
663
664 if (m_result_is_ready) {
665 return *hp;
666 }
667
668 using ReduceTuple = typename D::Type;
669 auto const& stream = Gpu::gpuStream();
670 auto dp = reduce_data.devicePtr();
671 auto const& nblocks = reduce_data.nBlocks();
672#if defined(AMREX_USE_SYCL)
673 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
674 const int N = nblocks[0];
675 if (N == 0) {
676 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
677 } else {
678 Gpu::PinnedVector<ReduceTuple> tmp(N);
679 Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
680 Gpu::streamSynchronize();
681 for (int i = 1; i < N; ++i) {
682 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
683 }
684 *hp = tmp[0];
685 }
686 } else
687#endif
688 {
689 int maxblocks = reduce_data.maxBlocks();
690#ifdef AMREX_USE_SYCL
691 // device reduce needs local(i.e., shared) memory
692 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
693#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
694 // xxxxx SYCL todo: reduce bug workaround
695 Gpu::DeviceVector<ReduceTuple> dtmp(1);
696 auto presult = dtmp.data();
697#else
698 auto presult = hp;
699#endif
700 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
701 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
702 {
703 ReduceTuple r;
704 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
705 ReduceTuple dst = r;
706 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
707 auto dp_stream = dp+istream*maxblocks;
708 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
709 i < nblocks[istream]; i += stride) {
710 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
711 }
712 }
713 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
714 if (gh.threadIdx() == 0) { *presult = dst; }
715 });
716#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
717 Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
718#endif
719#else
720 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
721 [=] AMREX_GPU_DEVICE () noexcept
722 {
723 ReduceTuple r;
724 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
725 ReduceTuple dst = r;
726 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
727 auto dp_stream = dp+istream*maxblocks;
728 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
729 i < nblocks[istream]; i += stride) {
730 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
731 }
732 }
733 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
734 if (threadIdx.x == 0) { *hp = dst; }
735 });
736#endif
737 Gpu::streamSynchronize();
738 }
739
740 m_result_is_ready = true;
741 return *hp;
742 }
743
744private:
745 bool m_result_is_ready = false;
746
747public:
748 void resetResultReadiness () { m_result_is_ready = false; }
749};
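// Typical usage sketch: ReduceOps and ReduceData are used together. Construct a
// ReduceOps with the desired operations, a matching ReduceData, call eval() with
// a device lambda that returns one tuple entry per operation, and read the
// combined result with value(). A minimal sketch over a single Box, assuming bx
// is a Box and fab an Array4<Real const> covering it:
//
//     ReduceOps<ReduceOpSum, ReduceOpMin, ReduceOpMax> reduce_op;
//     ReduceData<Real, Real, Real> reduce_data(reduce_op);
//     using ReduceTuple = typename decltype(reduce_data)::Type;
//
//     reduce_op.eval(bx, reduce_data,
//     [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
//     {
//         Real x = fab(i,j,k);
//         return {x, x, x};   // one entry per ReduceOp, in the same order
//     });
//
//     ReduceTuple hv = reduce_data.value(reduce_op); // synchronizes the stream on GPU builds
//     Real sum  = amrex::get<0>(hv);
//     Real vmin = amrex::get<1>(hv);
//     Real vmax = amrex::get<2>(hv);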
750
751namespace Reduce {
752
753template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
754T Sum (N n, T const* v, T init_val = 0)
755{
756 ReduceOps<ReduceOpSum> reduce_op;
757 ReduceData<T> reduce_data(reduce_op);
758 using ReduceTuple = typename decltype(reduce_data)::Type;
759 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
760 ReduceTuple hv = reduce_data.value(reduce_op);
761 return amrex::get<0>(hv) + init_val;
762}
763
764template <typename T, typename N, typename F,
765 typename M=std::enable_if_t<std::is_integral<N>::value> >
766T Sum (N n, F const& f, T init_val = 0)
767{
768 ReduceOps<ReduceOpSum> reduce_op;
769 ReduceData<T> reduce_data(reduce_op);
770 using ReduceTuple = typename decltype(reduce_data)::Type;
771 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
772 ReduceTuple hv = reduce_data.value(reduce_op);
773 return amrex::get<0>(hv) + init_val;
774}
775
776template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
777T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
778{
779 ReduceOps<ReduceOpMin> reduce_op;
780 ReduceData<T> reduce_data(reduce_op);
781 using ReduceTuple = typename decltype(reduce_data)::Type;
782 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
783 ReduceTuple hv = reduce_data.value(reduce_op);
784 return std::min(amrex::get<0>(hv),init_val);
785}
786
787template <typename T, typename N, typename F,
788 typename M=std::enable_if_t<std::is_integral<N>::value> >
789T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
790{
791 ReduceOps<ReduceOpMin> reduce_op;
792 ReduceData<T> reduce_data(reduce_op);
793 using ReduceTuple = typename decltype(reduce_data)::Type;
794 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
795 ReduceTuple hv = reduce_data.value(reduce_op);
796 return std::min(amrex::get<0>(hv),init_val);
797}
798
799template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
800T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
801{
802 ReduceOps<ReduceOpMax> reduce_op;
803 ReduceData<T> reduce_data(reduce_op);
804 using ReduceTuple = typename decltype(reduce_data)::Type;
805 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
806 ReduceTuple hv = reduce_data.value(reduce_op);
807 return std::max(amrex::get<0>(hv),init_val);
808}
809
810template <typename T, typename N, typename F,
811 typename M=std::enable_if_t<std::is_integral<N>::value> >
812T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
813{
814 ReduceOps<ReduceOpMax> reduce_op;
815 ReduceData<T> reduce_data(reduce_op);
816 using ReduceTuple = typename decltype(reduce_data)::Type;
817 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
818 ReduceTuple hv = reduce_data.value(reduce_op);
819 return std::max(amrex::get<0>(hv),init_val);
820}
821
822template <typename T, typename N, typename M=std::enable_if_t<std::is_integral<N>::value> >
823std::pair<T,T> MinMax (N n, T const* v)
824{
825 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
826 ReduceData<T,T> reduce_data(reduce_op);
827 using ReduceTuple = typename decltype(reduce_data)::Type;
828 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
829 return {v[i],v[i]};
830 });
831 auto hv = reduce_data.value(reduce_op);
832 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
833}
834
835template <typename T, typename N, typename F,
836 typename M=std::enable_if_t<std::is_integral<N>::value> >
837std::pair<T,T> MinMax (N n, F const& f)
838{
839 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
840 ReduceData<T,T> reduce_data(reduce_op);
841 using ReduceTuple = typename decltype(reduce_data)::Type;
842 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
843 T tmp = f(i);
844 return {tmp,tmp};
845 });
846 auto hv = reduce_data.value(reduce_op);
847 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
848}
849
850template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral<N>::value> >
851bool AnyOf (N n, T const* v, P const& pred)
852{
853 Gpu::LaunchSafeGuard lsg(true);
854 Gpu::DeviceScalar<int> ds(0);
855 int* dp = ds.dataPtr();
856 auto ec = Gpu::ExecutionConfig(n);
857 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
858
859#ifdef AMREX_USE_SYCL
860 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
861 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
862 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
863 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
864 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
865 if (gh.threadIdx() == 0) { *has_any = *dp; }
866 gh.sharedBarrier();
867
868 if (!(*has_any))
869 {
870 int r = false;
871 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
872 i < n && !r; i += stride)
873 {
874 r = pred(v[i]) ? 1 : 0;
875 }
876
877 r = Gpu::blockReduce<Gpu::Device::warp_size>
878 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
879 if (gh.threadIdx() == 0 && r) { *dp = 1; }
880 }
881 });
882#else
883 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
884 [=] AMREX_GPU_DEVICE () noexcept {
885 __shared__ int has_any;
886 if (threadIdx.x == 0) { has_any = *dp; }
887 __syncthreads();
888
889 if (!has_any)
890 {
891 int r = false;
892 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
893 i < n && !r; i += stride)
894 {
895 r = pred(v[i]) ? 1 : 0;
896 }
897 r = Gpu::blockReduce<Gpu::Device::warp_size>
898 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
899 if (threadIdx.x == 0 && r) *dp = 1;
900 }
901 });
902#endif
903 return ds.dataValue();
904}
905
906template <typename P>
907bool AnyOf (Box const& box, P const& pred)
908{
909 Gpu::LaunchSafeGuard lsg(true);
910 Gpu::DeviceScalar<int> ds(0);
911 int* dp = ds.dataPtr();
912 int ncells = box.numPts();
913 const auto lo = amrex::lbound(box);
914 const auto len = amrex::length(box);
915 const auto lenxy = len.x*len.y;
916 const auto lenx = len.x;
917 auto ec = Gpu::ExecutionConfig(ncells);
918 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
919
920#ifdef AMREX_USE_SYCL
921 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
922 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
923 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
924 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
925 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
926 if (gh.threadIdx() == 0) { *has_any = *dp; }
927 gh.sharedBarrier();
928
929 if (!(*has_any))
930 {
931 int r = false;
932 for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
933 icell < ncells && !r; icell += stride) {
934 int k = icell / lenxy;
935 int j = (icell - k*lenxy) / lenx;
936 int i = (icell - k*lenxy) - j*lenx;
937 i += lo.x;
938 j += lo.y;
939 k += lo.z;
940 r = pred(i,j,k) ? 1 : 0;
941 }
942 r = Gpu::blockReduce<Gpu::Device::warp_size>
943 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
944 if (gh.threadIdx() == 0 && r) { *dp = 1; }
945 }
946 });
947#else
948 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
949 Gpu::gpuStream(),
950 [=] AMREX_GPU_DEVICE () noexcept {
951 __shared__ int has_any;
952 if (threadIdx.x == 0) { has_any = *dp; }
953 __syncthreads();
954
955 if (!has_any)
956 {
957 int r = false;
958 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
959 icell < ncells && !r; icell += stride) {
960 int k = icell / lenxy;
961 int j = (icell - k*lenxy) / lenx;
962 int i = (icell - k*lenxy) - j*lenx;
963 i += lo.x;
964 j += lo.y;
965 k += lo.z;
966 r = pred(i,j,k) ? 1 : 0;
967 }
968 r = Gpu::blockReduce<Gpu::Device::warp_size>
969 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
970 if (threadIdx.x == 0 && r) *dp = 1;
971 }
972 });
973#endif
974 return ds.dataValue();
975}
976
977}
978
979#else
980
981template <typename... Ts>
982class ReduceData
983{
984public:
985 using Type = GpuTuple<Ts...>;
986
987 template <typename... Ps>
988 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
989 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
990 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
991 {
992 reduce_op.resetResultReadiness();
993 for (auto& t : m_tuple) {
994 Reduce::detail::for_each_init<0, Type, Ps...>(t);
995 }
996 }
997
998 ~ReduceData () = default;
999 ReduceData (ReduceData<Ts...> const&) = delete;
1000 ReduceData (ReduceData<Ts...> &&) = delete;
1001 void operator= (ReduceData<Ts...> const&) = delete;
1002 void operator= (ReduceData<Ts...> &&) = delete;
1003
1004 Type value () { return m_fn_value(); }
1005
1006 template <typename... Ps>
1007 Type value (ReduceOps<Ps...>& reduce_op)
1008 {
1009 return reduce_op.value(*this);
1010 }
1011
1012 Vector<Type>& reference () { return m_tuple; }
1013
1014 Type& reference (int tid)
1015 {
1016 if (m_tuple.size() == 1) {
1017 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1018 return m_tuple[0];
1019 } else {
1020 return m_tuple[tid];
1021 }
1022 }
1023
1024private:
1025 Vector<Type> m_tuple;
1026 std::function<Type()> m_fn_value;
1027};
1028
1029template <typename... Ps>
1030class ReduceOps
1031{
1032private:
1033
1034 template <typename D, typename F>
1035 AMREX_FORCE_INLINE
1036 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1037 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
1038 typename D::Type>>
1039 {
1040 using ReduceTuple = typename D::Type;
1041 const auto lo = amrex::lbound(box);
1042 const auto hi = amrex::ubound(box);
1043 for (int k = lo.z; k <= hi.z; ++k) {
1044 for (int j = lo.y; j <= hi.y; ++j) {
1045 for (int i = lo.x; i <= hi.x; ++i) {
1046 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
1047 }}}
1048 }
1049
1050 template <typename D, typename F>
1051 AMREX_FORCE_INLINE
1052 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1053 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
1054 typename D::Type>>
1055 {
1056 using ReduceTuple = typename D::Type;
1057 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1058 }
1059
1060public:
1061
1062 template <typename MF, typename D, typename F>
1063 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1064 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1065 {
1066 using ReduceTuple = typename D::Type;
1067#ifdef AMREX_USE_OMP
1068#pragma omp parallel
1069#endif
1070 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1071 Box const& b = mfi.growntilebox(nghost);
1072 const int li = mfi.LocalIndex();
1073 auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1074 const auto lo = amrex::lbound(b);
1075 const auto hi = amrex::ubound(b);
1076 for (int k = lo.z; k <= hi.z; ++k) {
1077 for (int j = lo.y; j <= hi.y; ++j) {
1078 for (int i = lo.x; i <= hi.x; ++i) {
1079 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1080 }}}
1081 }
1082 }
1083
1084 template <typename MF, typename D, typename F>
1085 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1086 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1087 {
1088 using ReduceTuple = typename D::Type;
1089#ifdef AMREX_USE_OMP
1090#pragma omp parallel
1091#endif
1092 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1093 Box const& b = mfi.growntilebox(nghost);
1094 const int li = mfi.LocalIndex();
1095 auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1096 const auto lo = amrex::lbound(b);
1097 const auto hi = amrex::ubound(b);
1098 for (int n = 0; n < ncomp; ++n) {
1099 for (int k = lo.z; k <= hi.z; ++k) {
1100 for (int j = lo.y; j <= hi.y; ++j) {
1101 for (int i = lo.x; i <= hi.x; ++i) {
1102 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1103 }}}}
1104 }
1105 }
1106
1107 template <typename D, typename F>
1108 void eval (Box const& box, D & reduce_data, F&& f)
1109 {
1110 auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1111 call_f<D>(box, rr, std::forward<F>(f));
1112 }
1113
1114 template <typename N, typename D, typename F,
1115 typename M=std::enable_if_t<std::is_integral_v<N>> >
1116 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
1117 {
1118 using ReduceTuple = typename D::Type;
1119 auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1120 const auto lo = amrex::lbound(box);
1121 const auto hi = amrex::ubound(box);
1122 for (N n = 0; n < ncomp; ++n) {
1123 for (int k = lo.z; k <= hi.z; ++k) {
1124 for (int j = lo.y; j <= hi.y; ++j) {
1125 for (int i = lo.x; i <= hi.x; ++i) {
1126 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
1127 }}}}
1128 }
1129
1130 template <typename N, typename D, typename F,
1131 typename M=std::enable_if_t<std::is_integral_v<N>> >
1132 void eval (N n, D & reduce_data, F const& f)
1133 {
1134 using ReduceTuple = typename D::Type;
1135 auto& rr = reduce_data.reference(OpenMP::get_thread_num());
1136 for (N i = 0; i < n; ++i) {
1137 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1138 }
1139 }
1140
1141 template <typename D>
1142 typename D::Type value (D & reduce_data)
1143 {
1144 auto& rrv = reduce_data.reference();
1145 if (! m_result_is_ready) {
1146 using ReduceTuple = typename D::Type;
1147 if (rrv.size() > 1) {
1148 for (int i = 1, N = rrv.size(); i < N; ++i) {
1149 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1150 }
1151 }
1152 m_result_is_ready = true;
1153 }
1154 return rrv[0];
1155 }
1156
1157 bool m_result_is_ready = false;
1158
1159 void resetResultReadiness () { m_result_is_ready = false; }
1160};
1161
1162namespace Reduce {
1163
1164template <typename T, typename N, typename F,
1165 typename M=std::enable_if_t<std::is_integral_v<N>> >
1166T Sum (N n, F const& f, T init_val = 0)
1167{
1168 T r = init_val;
1169#ifdef AMREX_USE_OMP
1170#pragma omp parallel for reduction(+:r)
1171#endif
1172 for (N i = 0; i < n; ++i) {
1173 r += f(i);
1174 }
1175 return r;
1176}
1177
1178template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1179T Sum (N n, T const* v, T init_val = 0)
1180{
1181 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1182}
1183
1184template <typename T, typename N, typename F,
1185 typename M=std::enable_if_t<std::is_integral_v<N>> >
1186T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
1187{
1188 T r = init_val;
1189#ifdef AMREX_USE_OMP
1190#pragma omp parallel for reduction(min:r)
1191#endif
1192 for (N i = 0; i < n; ++i) {
1193 r = std::min(r,f(i));
1194 }
1195 return r;
1196}
1197
1198template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1199T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
1200{
1201 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1202}
1203
1204template <typename T, typename N, typename F,
1205 typename M=std::enable_if_t<std::is_integral_v<N>> >
1206T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
1207{
1208 T r = init_val;
1209#ifdef AMREX_USE_OMP
1210#pragma omp parallel for reduction(max:r)
1211#endif
1212 for (N i = 0; i < n; ++i) {
1213 r = std::max(r,f(i));
1214 }
1215 return r;
1216}
1217
1218template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1219T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
1220{
1221 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1222}
1223
1224template <typename T, typename N, typename F,
1225 typename M=std::enable_if_t<std::is_integral_v<N>> >
1226std::pair<T,T> MinMax (N n, F const& f)
1227{
1228 T r_min = std::numeric_limits<T>::max();
1229 T r_max = std::numeric_limits<T>::lowest();
1230#ifdef AMREX_USE_OMP
1231#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1232#endif
1233 for (N i = 0; i < n; ++i) {
1234 T tmp = f(i);
1235 r_min = std::min(r_min,tmp);
1236 r_max = std::max(r_max,tmp);
1237 }
1238 return std::make_pair(r_min,r_max);
1239}
1240
1241template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1242std::pair<T,T> MinMax (N n, T const* v)
1243{
1244 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1245}
1246
1247template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1248bool AnyOf (N n, T const* v, P&& pred)
1249{
1250 return std::any_of(v, v+n, std::forward<P>(pred));
1251}
1252
1253template <typename P>
1254bool AnyOf (Box const& box, P const& pred)
1255{
1256 const auto lo = amrex::lbound(box);
1257 const auto hi = amrex::ubound(box);
1258 for (int k = lo.z; k <= hi.z; ++k) {
1259 for (int j = lo.y; j <= hi.y; ++j) {
1260 for (int i = lo.x; i <= hi.x; ++i) {
1261 if (pred(i,j,k)) { return true; }
1262 }}}
1263 return false;
1264}
1265
1266}
1267
1268#endif
1269
1270//! Return a GpuTuple containing the identity element for each operation in ReduceOps.
1274template <typename... Ts, typename... Ps>
1275AMREX_GPU_HOST_DEVICE
1276constexpr GpuTuple<Ts...>
1277IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
1278{
1279 GpuTuple<Ts...> r{};
1280 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1281 return r;
1282}
1283
1284//! Return a GpuTuple containing the identity element for each ReduceOp in TypeList<Ps...>.
1288template <typename... Ts, typename... Ps>
1289AMREX_GPU_HOST_DEVICE
1290constexpr GpuTuple<Ts...>
1291IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
1292{
1293 GpuTuple<Ts...> r{};
1294 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1295 return r;
1296}
1297
1298}
1299
1300#endif
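// Usage sketch for the Reduce free functions, assuming dp points to n values of
// type amrex::Real readable by the active backend (device memory in GPU builds,
// host memory otherwise):
//
//     amrex::Real total = amrex::Reduce::Sum(n, dp);    // init_val defaults to 0
//     amrex::Real vmin  = amrex::Reduce::Min(n, dp);    // init_val defaults to numeric_limits<Real>::max()
//     auto both         = amrex::Reduce::MinMax(n, dp); // std::pair<Real,Real>
//     bool any_negative = amrex::Reduce::AnyOf(n, dp,
//         [=] AMREX_GPU_HOST_DEVICE (amrex::Real x) { return x < amrex::Real(0.0); });
//
// The functor overloads (e.g. Sum(n, f, init_val)) call f(i) for each index i and
// reduce the returned values the same way.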