Block-Structured AMR Software Framework
 
AMReX_Reduce.H
Go to the documentation of this file.
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_Arena.H>
7#include <AMReX_OpenMP.H>
8#include <AMReX_MFIter.H>
9#include <AMReX_ValLocPair.H>
10
11#include <algorithm>
12#include <functional>
13#include <limits>
14
15namespace amrex {
16
17namespace Reduce::detail {
18
19#ifdef AMREX_USE_GPU
20#ifdef AMREX_USE_SYCL
21 template <std::size_t I, typename T, typename P>
22 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
23 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
24 {
25 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
26 }
27
28 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
29 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
30 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
31 {
32 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
33 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
34 }
35#else
36 template <std::size_t I, typename T, typename P>
37 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
38 void for_each_parallel (T& d, T const& s)
39 {
40 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
41 }
42
43 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
44 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
45 void for_each_parallel (T& d, T const& s)
46 {
47 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
48 for_each_parallel<I+1,T,P1,Ps...>(d, s);
49 }
50#endif
51#endif
52
53 template <std::size_t I, typename T, typename P>
54 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
55 void for_each_local (T& d, T const& s)
56 {
57 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
58 }
59
60 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
61 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
62 void for_each_local (T& d, T const& s)
63 {
64 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
65 for_each_local<I+1,T,P1,Ps...>(d, s);
66 }
67
68 template <std::size_t I, typename T, typename P>
69 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
70 constexpr void for_each_init (T& t)
71 {
72 P().init(amrex::get<I>(t));
73 }
74
75 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
76 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
77 constexpr void for_each_init (T& t)
78 {
79 P().init(amrex::get<I>(t));
80 for_each_init<I+1,T,P1,Ps...>(t);
81 }
82}
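// [Editor's illustration, not part of the original header] These helpers fold a
// parameter pack of reduce operations over the elements of a GpuTuple. For example,
// with D = GpuTuple<Real,Real>,
//   Reduce::detail::for_each_local<0, D, ReduceOpSum, ReduceOpMin>(d, s);
// expands to
//   ReduceOpSum().local_update(amrex::get<0>(d), amrex::get<0>(s));
//   ReduceOpMin().local_update(amrex::get<1>(d), amrex::get<1>(s));
// i.e. d becomes {d0 + s0, min(d1, s1)}.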
83
84struct ReduceOpSum
85{
86
87#ifdef AMREX_USE_GPU
88#ifdef AMREX_USE_SYCL
89 template <typename T>
90 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
91 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
92 T r = Gpu::blockReduceSum(s,h);
93 if (h.threadIdx() == 0) { d += r; }
94 }
95#else
96 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
97 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
98 void parallel_update (T& d, T const& s) const noexcept {
99 T r = Gpu::blockReduceSum<MT>(s);
100 if (threadIdx.x == 0) { d += r; }
101 }
102#endif
103#endif
104
105 template <typename T>
106 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
107 void local_update (T& d, T const& s) const noexcept { d += s; }
108
109 template <typename T>
110 constexpr void init (T& t) const noexcept { t = 0; }
111};
112
113struct ReduceOpMin
114{
115#ifdef AMREX_USE_GPU
116#ifdef AMREX_USE_SYCL
117 template <typename T>
118 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
119 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
120 T r = Gpu::blockReduceMin(s,h);
121 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
122 }
123#else
124 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
125 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
126 void parallel_update (T& d, T const& s) const noexcept {
127 T r = Gpu::blockReduceMin<MT>(s);
128 if (threadIdx.x == 0) { d = amrex::min(d,r); }
129 }
130#endif
131#endif
132
133 template <typename T>
134 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
135 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
136
137 template <typename T>
138 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
139 init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
140
141 template <typename T>
142 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
143 init (T& t) const noexcept { t = T::max(); }
144};
145
146struct ReduceOpMax
147{
148#ifdef AMREX_USE_GPU
149#ifdef AMREX_USE_SYCL
150 template <typename T>
151 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
152 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
153 T r = Gpu::blockReduceMax(s,h);
154 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
155 }
156#else
157 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
158 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
159 void parallel_update (T& d, T const& s) const noexcept {
160 T r = Gpu::blockReduceMax<MT>(s);
161 if (threadIdx.x == 0) { d = amrex::max(d,r); }
162 }
163#endif
164#endif
165
166 template <typename T>
167 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
168 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
169
170 template <typename T>
171 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
172 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
173
174 template <typename T>
175 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
176 init (T& t) const noexcept { t = T::lowest(); }
177};
178
179struct ReduceOpLogicalAnd
180{
181#ifdef AMREX_USE_GPU
182#ifdef AMREX_USE_SYCL
183 template <typename T>
184 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
185 std::enable_if_t<std::is_integral_v<T>>
186 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
187 T r = Gpu::blockReduceLogicalAnd(s,h);
188 if (h.threadIdx() == 0) { d = d && r; }
189 }
190#else
191 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
192 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
193 std::enable_if_t<std::is_integral_v<T>>
194 parallel_update (T& d, T s) const noexcept {
195 T r = Gpu::blockReduceLogicalAnd<MT>(s);
196 if (threadIdx.x == 0) { d = d && r; }
197 }
198#endif
199#endif
200
201 template <typename T>
202 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
203 std::enable_if_t<std::is_integral_v<T>>
204 local_update (T& d, T s) const noexcept { d = d && s; }
205
206 template <typename T>
207 constexpr std::enable_if_t<std::is_integral_v<T>>
208 init (T& t) const noexcept { t = true; }
209};
210
211struct ReduceOpLogicalOr
212{
213#ifdef AMREX_USE_GPU
214#ifdef AMREX_USE_SYCL
215 template <typename T>
216 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
217 std::enable_if_t<std::is_integral_v<T>>
218 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
219 T r = Gpu::blockReduceLogicalOr(s,h);
220 if (h.threadIdx() == 0) { d = d || r; }
221 }
222#else
223 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
224 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
225 std::enable_if_t<std::is_integral_v<T>>
226 parallel_update (T& d, T s) const noexcept {
227 T r = Gpu::blockReduceLogicalOr<MT>(s);
228 if (threadIdx.x == 0) { d = d || r; }
229 }
230#endif
231#endif
232
233 template <typename T>
234 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
235 std::enable_if_t<std::is_integral_v<T>>
236 local_update (T& d, T s) const noexcept { d = d || s; }
237
238 template <typename T>
239 constexpr std::enable_if_t<std::is_integral_v<T>>
240 init (T& t) const noexcept { t = false; }
241};
242
243template <typename... Ps> class ReduceOps;
244
245#ifdef AMREX_USE_GPU
246
247template <typename... Ts>
248class ReduceData
249{
250public:
251 using Type = GpuTuple<Ts...>;
252
253 template <typename... Ps>
254 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
255 : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
256 m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
257 m_device_tuple((Type*)(The_Arena()->alloc(AMREX_GPU_MAX_STREAMS
258 * m_max_blocks * sizeof(Type)))),
259 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
260 {
261 reduce_op.resetResultReadiness();
262 static_assert(std::is_trivially_copyable<Type>(),
263 "ReduceData::Type must be trivially copyable");
264 static_assert(std::is_trivially_destructible<Type>(),
265 "ReduceData::Type must be trivially destructible");
266
267 new (m_host_tuple) Type();
268 m_nblocks.fill(0);
269 }
270
271 ~ReduceData () {
272 The_Pinned_Arena()->free(m_host_tuple);
273 The_Arena()->free(m_device_tuple);
274 }
275
276 ReduceData (ReduceData<Ts...> const&) = delete;
277 ReduceData (ReduceData<Ts...> &&) = delete;
278 void operator= (ReduceData<Ts...> const&) = delete;
279 void operator= (ReduceData<Ts...> &&) = delete;
280
281 Type value ()
282 {
283 return m_fn_value();
284 }
285
286 template <typename... Ps>
287 Type value (ReduceOps<Ps...>& reduce_op)
288 {
289 return reduce_op.value(*this);
290 }
291
292 Type* devicePtr () { return m_device_tuple; }
293 Type* devicePtr (gpuStream_t const& s) {
294 return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
295 }
296
297 Type* hostPtr () { return m_host_tuple; }
298
299 GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
300 int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }
301
302 int maxBlocks () const { return m_max_blocks; }
303
304 int maxStreamIndex () const { return m_max_stream_index; }
305 void updateMaxStreamIndex (gpuStream_t const& s) {
306 m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
307 }
308
309private:
310 int m_max_blocks;
311 int m_max_stream_index = 0;
312 Type* m_host_tuple = nullptr;
313 Type* m_device_tuple = nullptr;
314 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
315 std::function<Type()> m_fn_value;
316};
317
318namespace Reduce::detail {
319 template <typename F>
320 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
321 auto call_f (F const& f, int i, int j, int k, IndexType)
322 noexcept -> decltype(f(0,0,0))
323 {
324 return f(i,j,k);
325 }
326
327 template <typename F>
328 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
329 auto call_f (F const& f, int i, int j, int k, IndexType t)
330 noexcept -> decltype(f(Box()))
331 {
333 return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
334 IntVect(AMREX_D_DECL(i,j,k)),
335 t));
336 }
337
338 struct iterate_box {};
339 struct iterate_box_comp {};
340
341 template <typename I, typename F, typename T, typename... Ps,
342 std::enable_if_t<std::is_same<iterate_box,I>::value,int> = 0>
343 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
344 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
345 {
346 auto const& pr = f(ibox,i,j,k);
347 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
348 }
349
350 template <typename I, typename F, typename T, typename... Ps,
351 std::enable_if_t<std::is_same<iterate_box_comp,I>::value,int> = 0>
352 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
353 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
354 {
355 for (int n = 0; n < ncomp; ++n) {
356 auto const& pr = f(ibox,i,j,k,n);
357 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
358 }
359 }
360}
361
362template <typename... Ps>
363class ReduceOps
364{
365public:
366
367 // This is public for CUDA
368 template <typename I, typename MF, typename D, typename F>
369 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
370 {
371 using ReduceTuple = typename D::Type;
372 const int nboxes = mf.local_size();
373 if (nboxes > 0) {
374 auto const& parforinfo = mf.getParForInfo(nghost);
375 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
376 AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
377 const int nblocks = nblocks_per_box * nboxes;
378 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
379
380 auto const& stream = Gpu::gpuStream();
381 auto pdst = reduce_data.devicePtr(stream);
382 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
383 AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
384 reduce_data.nBlocks(stream) = nblocks_ec;
385 reduce_data.updateMaxStreamIndex(stream);
386
387#ifdef AMREX_USE_SYCL
388 // device reduce needs local(i.e., shared) memory
389 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
390 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
391 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
392 {
393 Dim1 blockIdx {gh.blockIdx()};
394 Dim1 threadIdx{gh.threadIdx()};
395#else
396 amrex::launch_global<AMREX_GPU_MAX_THREADS>
397 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
398 ([=] AMREX_GPU_DEVICE () noexcept
399 {
400#endif
401 ReduceTuple r;
402 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
403 ReduceTuple& dst = pdst[blockIdx.x];
404 if (threadIdx.x == 0) {
405 dst = r;
406 }
407 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
408 int ibox = iblock / nblocks_per_box;
409 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
410
411 BoxIndexer const& indexer = dp_boxes[ibox];
412 if (icell < indexer.numPts()) {
413 auto [i, j, k] = indexer(icell);
414 Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
415 (f, ibox, i, j, k, ncomp, r);
416 }
417 }
418#ifdef AMREX_USE_SYCL
419 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
420#else
421 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
422#endif
423 });
424 }
425 }
426
427 template <typename MF, typename D, typename F>
428 std::enable_if_t<IsFabArray<MF>::value
429#ifndef AMREX_USE_CUDA
430 && IsCallable<F, int, int, int, int>::value
431#endif
432 >
433 eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
434 {
435 using ReduceTuple = typename D::Type;
436 const int nboxes = mf.local_size();
437 if (nboxes == 0) {
438 return;
439 } else if (!mf.isFusingCandidate()) {
440 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
441 Box const& b = amrex::grow(mfi.validbox(), nghost);
442 const int li = mfi.LocalIndex();
443 this->eval(b, reduce_data,
444 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
445 {
446 return f(li, i, j, k);
447 });
448 }
449 } else {
450 eval_mf(Reduce::detail::iterate_box{},
451 mf, nghost, 0, reduce_data, std::forward<F>(f));
452 }
453 }
454
455 template <typename MF, typename D, typename F>
456 std::enable_if_t<IsFabArray<MF>::value
457#ifndef AMREX_USE_CUDA
458 && IsCallable<F, int, int, int, int, int>::value
459#endif
460 >
461 eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
462 {
463 using ReduceTuple = typename D::Type;
464
465 const int nboxes = mf.local_size();
466
467 if (nboxes == 0) {
468 return;
469 } else if (!mf.isFusingCandidate()) {
470 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
471 Box const& b = amrex::grow(mfi.validbox(), nghost);
472 const int li = mfi.LocalIndex();
473 this->eval(b, ncomp, reduce_data,
474 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
475 {
476 return f(li, i, j, k, n);
477 });
478 }
479 } else {
480 eval_mf(Reduce::detail::iterate_box_comp{},
481 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
482 }
483 }
484
485 template <typename D, typename F>
486 void eval (Box const& box, D & reduce_data, F const& f)
487 {
488 using ReduceTuple = typename D::Type;
489 auto const& stream = Gpu::gpuStream();
490 auto dp = reduce_data.devicePtr(stream);
491 int& nblocks = reduce_data.nBlocks(stream);
492 int ncells = box.numPts();
493 const auto lo = amrex::lbound(box);
494 const auto len = amrex::length(box);
495 const auto lenxy = len.x*len.y;
496 const auto lenx = len.x;
497 IndexType ixtype = box.ixType();
498 constexpr int nitems_per_thread = 4;
499 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
500 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
501 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
502 reduce_data.updateMaxStreamIndex(stream);
503#ifdef AMREX_USE_SYCL
504 // device reduce needs local(i.e., shared) memory
505 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
506 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
507 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
508 {
509 Dim1 blockIdx {gh.blockIdx()};
510 Dim1 threadIdx{gh.threadIdx()};
511 Dim1 gridDim {gh.gridDim()};
512#else
513 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
514 [=] AMREX_GPU_DEVICE () noexcept
515 {
516#endif
517 ReduceTuple r;
518 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
519 ReduceTuple& dst = *(dp+blockIdx.x);
520 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
521 dst = r;
522 }
523 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
524 icell < ncells; icell += stride) {
525 int k = icell / lenxy;
526 int j = (icell - k*lenxy) / lenx;
527 int i = (icell - k*lenxy) - j*lenx;
528 i += lo.x;
529 j += lo.y;
530 k += lo.z;
531 auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
532 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
533 }
534#ifdef AMREX_USE_SYCL
535 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
536#else
537 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
538#endif
539 });
540 nblocks = std::max(nblocks, nblocks_ec);
541 }
542
543 template <typename N, typename D, typename F,
544 typename M=std::enable_if_t<std::is_integral_v<N>> >
545 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
546 {
547 using ReduceTuple = typename D::Type;
548 auto const& stream = Gpu::gpuStream();
549 auto dp = reduce_data.devicePtr(stream);
550 int& nblocks = reduce_data.nBlocks(stream);
551 int ncells = box.numPts();
552 const auto lo = amrex::lbound(box);
553 const auto len = amrex::length(box);
554 const auto lenxy = len.x*len.y;
555 const auto lenx = len.x;
556 constexpr int nitems_per_thread = 4;
557 int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
558 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
559 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
560 reduce_data.updateMaxStreamIndex(stream);
561#ifdef AMREX_USE_SYCL
562 // device reduce needs local(i.e., shared) memory
563 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
564 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
565 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
566 {
567 Dim1 blockIdx {gh.blockIdx()};
568 Dim1 threadIdx{gh.threadIdx()};
569 Dim1 gridDim {gh.gridDim()};
570#else
571 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
572 [=] AMREX_GPU_DEVICE () noexcept
573 {
574#endif
575 ReduceTuple r;
576 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
577 ReduceTuple& dst = *(dp+blockIdx.x);
578 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
579 dst = r;
580 }
581 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
582 icell < ncells; icell += stride) {
583 int k = icell / lenxy;
584 int j = (icell - k*lenxy) / lenx;
585 int i = (icell - k*lenxy) - j*lenx;
586 i += lo.x;
587 j += lo.y;
588 k += lo.z;
589 for (N n = 0; n < ncomp; ++n) {
590 auto pr = f(i,j,k,n);
591 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
592 }
593 }
594#ifdef AMREX_USE_SYCL
595 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
596#else
597 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
598#endif
599 });
600 nblocks = std::max(nblocks, nblocks_ec);
601 }
602
603 template <typename N, typename D, typename F,
604 typename M=std::enable_if_t<std::is_integral_v<N>> >
605 void eval (N n, D & reduce_data, F const& f)
606 {
607 if (n <= 0) { return; }
608 using ReduceTuple = typename D::Type;
609 auto const& stream = Gpu::gpuStream();
610 auto dp = reduce_data.devicePtr(stream);
611 int& nblocks = reduce_data.nBlocks(stream);
612 constexpr int nitems_per_thread = 4;
613 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
614 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
615 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
616 reduce_data.updateMaxStreamIndex(stream);
617#ifdef AMREX_USE_SYCL
618 // device reduce needs local(i.e., shared) memory
619 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
620 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
621 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
622 {
623 Dim1 blockIdx {gh.blockIdx()};
624 Dim1 threadIdx{gh.threadIdx()};
625 Dim1 gridDim {gh.gridDim()};
626#else
627 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
628 [=] AMREX_GPU_DEVICE () noexcept
629 {
630#endif
631 ReduceTuple r;
632 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
633 ReduceTuple& dst = *(dp+blockIdx.x);
634 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
635 dst = r;
636 }
637 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
638 i < n; i += stride) {
639 auto pr = f(i);
640 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
641 }
642#ifdef AMREX_USE_SYCL
643 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
644#else
645 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
646#endif
647 });
648 nblocks = amrex::max(nblocks, nblocks_ec);
649 }
650
651 template <typename D>
652 typename D::Type value (D & reduce_data)
653 {
654 auto hp = reduce_data.hostPtr();
655
656 if (m_result_is_ready) {
657 return *hp;
658 }
659
660 using ReduceTuple = typename D::Type;
661 auto const& stream = Gpu::gpuStream();
662 auto dp = reduce_data.devicePtr();
663 auto const& nblocks = reduce_data.nBlocks();
664#if defined(AMREX_USE_SYCL)
665 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
666 const int N = nblocks[0];
667 if (N == 0) {
668 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
669 } else {
670 Gpu::PinnedVector<ReduceTuple> tmp(N);
671 Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
672 Gpu::streamSynchronize();
673 for (int i = 1; i < N; ++i) {
674 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
675 }
676 *hp = tmp[0];
677 }
678 } else
679#endif
680 {
681 int maxblocks = reduce_data.maxBlocks();
682#ifdef AMREX_USE_SYCL
683 // device reduce needs local(i.e., shared) memory
684 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
685#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
686 // xxxxx SYCL todo: reduce bug workaround
687 Gpu::DeviceVector<ReduceTuple> dtmp(1);
688 auto presult = dtmp.data();
689#else
690 auto presult = hp;
691#endif
692 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
693 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
694 {
695 ReduceTuple r;
696 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
697 ReduceTuple dst = r;
698 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
699 auto dp_stream = dp+istream*maxblocks;
700 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
701 i < nblocks[istream]; i += stride) {
702 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
703 }
704 }
705 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
706 if (gh.threadIdx() == 0) { *presult = dst; }
707 });
708#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
709 Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
710#endif
711#else
712 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
713 [=] AMREX_GPU_DEVICE () noexcept
714 {
715 ReduceTuple r;
716 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
717 ReduceTuple dst = r;
718 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
719 auto dp_stream = dp+istream*maxblocks;
720 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
721 i < nblocks[istream]; i += stride) {
722 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
723 }
724 }
725 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
726 if (threadIdx.x == 0) { *hp = dst; }
727 });
728#endif
729 Gpu::streamSynchronize();
730 }
731
732 m_result_is_ready = true;
733 return *hp;
734 }
735
736private:
737 bool m_result_is_ready = false;
738
739public:
740 void resetResultReadiness () { m_result_is_ready = false; }
741};
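// [Editor's usage sketch, not part of the original header] Typical two-pass use of
// ReduceOps/ReduceData on the GPU, assuming a Box `bx` and an Array4<Real const>
// `fab` are in scope (both names are illustrative):
//
//   ReduceOps<ReduceOpSum, ReduceOpMax> reduce_op;
//   ReduceData<Real, Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   reduce_op.eval(bx, reduce_data,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
//       { return {fab(i,j,k), fab(i,j,k)}; });
//   ReduceTuple hv = reduce_data.value(reduce_op); // synchronizes and combines per-block results
//   Real total = amrex::get<0>(hv);
//   Real vmax  = amrex::get<1>(hv);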
742
743namespace Reduce {
744
745template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
746T Sum (N n, T const* v, T init_val = 0)
747{
748 ReduceOps<ReduceOpSum> reduce_op;
749 ReduceData<T> reduce_data(reduce_op);
750 using ReduceTuple = typename decltype(reduce_data)::Type;
751 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
752 ReduceTuple hv = reduce_data.value(reduce_op);
753 return amrex::get<0>(hv) + init_val;
754}
755
756template <typename T, typename N, typename F,
757 typename M=std::enable_if_t<std::is_integral_v<N>> >
758T Sum (N n, F const& f, T init_val = 0)
759{
760 ReduceOps<ReduceOpSum> reduce_op;
761 ReduceData<T> reduce_data(reduce_op);
762 using ReduceTuple = typename decltype(reduce_data)::Type;
763 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
764 ReduceTuple hv = reduce_data.value(reduce_op);
765 return amrex::get<0>(hv) + init_val;
766}
767
768template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
769T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
770{
771 ReduceOps<ReduceOpMin> reduce_op;
772 ReduceData<T> reduce_data(reduce_op);
773 using ReduceTuple = typename decltype(reduce_data)::Type;
774 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
775 ReduceTuple hv = reduce_data.value(reduce_op);
776 return std::min(amrex::get<0>(hv),init_val);
777}
778
779template <typename T, typename N, typename F,
780 typename M=std::enable_if_t<std::is_integral_v<N>> >
781T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
782{
783 ReduceOps<ReduceOpMin> reduce_op;
784 ReduceData<T> reduce_data(reduce_op);
785 using ReduceTuple = typename decltype(reduce_data)::Type;
786 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
787 ReduceTuple hv = reduce_data.value(reduce_op);
788 return std::min(amrex::get<0>(hv),init_val);
789}
790
791template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
792T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
793{
794 ReduceOps<ReduceOpMax> reduce_op;
795 ReduceData<T> reduce_data(reduce_op);
796 using ReduceTuple = typename decltype(reduce_data)::Type;
797 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
798 ReduceTuple hv = reduce_data.value(reduce_op);
799 return std::max(amrex::get<0>(hv),init_val);
800}
801
802template <typename T, typename N, typename F,
803 typename M=std::enable_if_t<std::is_integral_v<N>> >
804T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
805{
806 ReduceOps<ReduceOpMax> reduce_op;
807 ReduceData<T> reduce_data(reduce_op);
808 using ReduceTuple = typename decltype(reduce_data)::Type;
809 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
810 ReduceTuple hv = reduce_data.value(reduce_op);
811 return std::max(amrex::get<0>(hv),init_val);
812}
813
814template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
815std::pair<T,T> MinMax (N n, T const* v)
816{
817 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
818 ReduceData<T,T> reduce_data(reduce_op);
819 using ReduceTuple = typename decltype(reduce_data)::Type;
820 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
821 return {v[i],v[i]};
822 });
823 auto hv = reduce_data.value(reduce_op);
824 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
825}
826
827template <typename T, typename N, typename F,
828 typename M=std::enable_if_t<std::is_integral_v<N>> >
829std::pair<T,T> MinMax (N n, F const& f)
830{
831 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
832 ReduceData<T,T> reduce_data(reduce_op);
833 using ReduceTuple = typename decltype(reduce_data)::Type;
834 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
835 T tmp = f(i);
836 return {tmp,tmp};
837 });
838 auto hv = reduce_data.value(reduce_op);
839 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
840}
841
842template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
843bool AnyOf (N n, T const* v, P const& pred)
844{
845 Gpu::LaunchSafeGuard lsg(true);
846 Gpu::DeviceScalar<int> ds(false);
847 int* dp = ds.dataPtr();
848 auto ec = Gpu::ExecutionConfig(n);
849 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
850
851#ifdef AMREX_USE_SYCL
852 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
853 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
854 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
855 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
856 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
857 if (gh.threadIdx() == 0) { *has_any = *dp; }
858 gh.sharedBarrier();
859
860 if (!(*has_any))
861 {
862 int r = false;
863 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
864 i < n && !r; i += stride)
865 {
866 r = pred(v[i]) ? 1 : 0;
867 }
868
869 r = Gpu::blockReduce<Gpu::Device::warp_size>
870 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
871 if (gh.threadIdx() == 0 && r) { *dp = 1; }
872 }
873 });
874#else
875 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
876 [=] AMREX_GPU_DEVICE () noexcept {
877 __shared__ int has_any;
878 if (threadIdx.x == 0) { has_any = *dp; }
879 __syncthreads();
880
881 if (!has_any)
882 {
883 int r = false;
884 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
885 i < n && !r; i += stride)
886 {
887 r = pred(v[i]) ? 1 : 0;
888 }
889 r = Gpu::blockReduce<Gpu::Device::warp_size>
890 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
891 if (threadIdx.x == 0 && r) *dp = 1;
892 }
893 });
894#endif
895 return ds.dataValue();
896}
897
898template <typename P>
899bool AnyOf (Box const& box, P const& pred)
900{
901 Gpu::LaunchSafeGuard lsg(true);
902 Gpu::DeviceScalar<int> ds(false);
903 int* dp = ds.dataPtr();
904 int ncells = box.numPts();
905 const auto lo = amrex::lbound(box);
906 const auto len = amrex::length(box);
907 const auto lenxy = len.x*len.y;
908 const auto lenx = len.x;
909 auto ec = Gpu::ExecutionConfig(ncells);
910 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
911
912#ifdef AMREX_USE_SYCL
913 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
914 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
915 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
916 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
917 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
918 if (gh.threadIdx() == 0) { *has_any = *dp; }
919 gh.sharedBarrier();
920
921 if (!(*has_any))
922 {
923 int r = false;
924 for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
925 icell < ncells && !r; icell += stride) {
926 int k = icell / lenxy;
927 int j = (icell - k*lenxy) / lenx;
928 int i = (icell - k*lenxy) - j*lenx;
929 i += lo.x;
930 j += lo.y;
931 k += lo.z;
932 r = pred(i,j,k) ? 1 : 0;
933 }
934 r = Gpu::blockReduce<Gpu::Device::warp_size>
935 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
936 if (gh.threadIdx() == 0 && r) { *dp = 1; }
937 }
938 });
939#else
940 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
941 Gpu::gpuStream(),
942 [=] AMREX_GPU_DEVICE () noexcept {
943 __shared__ int has_any;
944 if (threadIdx.x == 0) { has_any = *dp; }
945 __syncthreads();
946
947 if (!has_any)
948 {
949 int r = false;
950 for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
951 icell < ncells && !r; icell += stride) {
952 int k = icell / lenxy;
953 int j = (icell - k*lenxy) / lenx;
954 int i = (icell - k*lenxy) - j*lenx;
955 i += lo.x;
956 j += lo.y;
957 k += lo.z;
958 r = pred(i,j,k) ? 1 : 0;
959 }
960 r = Gpu::blockReduce<Gpu::Device::warp_size>
961 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
962 if (threadIdx.x == 0 && r) *dp = 1;
963 }
964 });
965#endif
966 return ds.dataValue();
967}
968
969}
970
971#else
972
973template <typename... Ts>
974class ReduceData
975{
976public:
977 using Type = GpuTuple<Ts...>;
978
979 template <typename... Ps>
980 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
981 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
982 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
983 {
984 reduce_op.resetResultReadiness();
985 for (auto& t : m_tuple) {
986 Reduce::detail::for_each_init<0, Type, Ps...>(t);
987 }
988 }
989
990 ~ReduceData () = default;
991 ReduceData (ReduceData<Ts...> const&) = delete;
992 ReduceData (ReduceData<Ts...> &&) = delete;
993 void operator= (ReduceData<Ts...> const&) = delete;
994 void operator= (ReduceData<Ts...> &&) = delete;
995
996 Type value () { return m_fn_value(); }
997
998 template <typename... Ps>
999 Type value (ReduceOps<Ps...>& reduce_op)
1000 {
1001 return reduce_op.value(*this);
1002 }
1003
1004 Vector<Type>& reference () { return m_tuple; }
1005
1006 Type& reference (int tid)
1007 {
1008 if (m_tuple.size() == 1) {
1009 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1010 return m_tuple[0];
1011 } else {
1012 return m_tuple[tid];
1013 }
1014 }
1015
1016private:
1017 Vector<Type> m_tuple;
1018 std::function<Type()> m_fn_value;
1019};
1020
1021template <typename... Ps>
1022class ReduceOps
1023{
1024private:
1025
1026 template <typename D, typename F>
1027 AMREX_FORCE_INLINE
1028 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1029 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
1030 typename D::Type>>
1031 {
1032 using ReduceTuple = typename D::Type;
1033 const auto lo = amrex::lbound(box);
1034 const auto hi = amrex::ubound(box);
1035 for (int k = lo.z; k <= hi.z; ++k) {
1036 for (int j = lo.y; j <= hi.y; ++j) {
1037 for (int i = lo.x; i <= hi.x; ++i) {
1038 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
1039 }}}
1040 }
1041
1042 template <typename D, typename F>
1043 AMREX_FORCE_INLINE
1044 static auto call_f (Box const& box, typename D::Type & r, F const& f)
1045 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
1046 typename D::Type>>
1047 {
1048 using ReduceTuple = typename D::Type;
1049 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1050 }
1051
1052public:
1053
1054 template <typename MF, typename D, typename F>
1055 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1056 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1057 {
1058 using ReduceTuple = typename D::Type;
1059#ifdef AMREX_USE_OMP
1060#pragma omp parallel
1061#endif
1062 {
1063 ReduceTuple rr;
1064 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1065 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1066 Box const& b = mfi.growntilebox(nghost);
1067 const int li = mfi.LocalIndex();
1068 const auto lo = amrex::lbound(b);
1069 const auto hi = amrex::ubound(b);
1070 for (int k = lo.z; k <= hi.z; ++k) {
1071 for (int j = lo.y; j <= hi.y; ++j) {
1072 for (int i = lo.x; i <= hi.x; ++i) {
1073 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1074 }}}
1075 }
1076 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1077 reduce_data.reference(OpenMP::get_thread_num()), rr);
1078 }
1079 }
1080
1081 template <typename MF, typename D, typename F>
1082 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1083 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1084 {
1085 using ReduceTuple = typename D::Type;
1086#ifdef AMREX_USE_OMP
1087#pragma omp parallel
1088#endif
1089 {
1090 ReduceTuple rr;
1091 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1092 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1093 Box const& b = mfi.growntilebox(nghost);
1094 const int li = mfi.LocalIndex();
1095 const auto lo = amrex::lbound(b);
1096 const auto hi = amrex::ubound(b);
1097 for (int n = 0; n < ncomp; ++n) {
1098 for (int k = lo.z; k <= hi.z; ++k) {
1099 for (int j = lo.y; j <= hi.y; ++j) {
1100 for (int i = lo.x; i <= hi.x; ++i) {
1101 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1102 }}}}
1103 }
1104 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1105 reduce_data.reference(OpenMP::get_thread_num()), rr);
1106 }
1107 }
1108
1109 template <typename D, typename F>
1110 void eval (Box const& box, D & reduce_data, F&& f)
1111 {
1112 using ReduceTuple = typename D::Type;
1113 ReduceTuple rr;
1114 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1115 call_f<D>(box, rr, std::forward<F>(f));
1116 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1117 reduce_data.reference(OpenMP::get_thread_num()), rr);
1118 }
1119
1120 template <typename N, typename D, typename F,
1121 typename M=std::enable_if_t<std::is_integral_v<N>> >
1122 void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
1123 {
1124 using ReduceTuple = typename D::Type;
1125 ReduceTuple rr;
1126 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1127 const auto lo = amrex::lbound(box);
1128 const auto hi = amrex::ubound(box);
1129 for (N n = 0; n < ncomp; ++n) {
1130 for (int k = lo.z; k <= hi.z; ++k) {
1131 for (int j = lo.y; j <= hi.y; ++j) {
1132 for (int i = lo.x; i <= hi.x; ++i) {
1133 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
1134 }}}}
1135 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1136 reduce_data.reference(OpenMP::get_thread_num()), rr);
1137 }
1138
1139 template <typename N, typename D, typename F,
1140 typename M=std::enable_if_t<std::is_integral_v<N>> >
1141 void eval (N n, D & reduce_data, F const& f)
1142 {
1143 using ReduceTuple = typename D::Type;
1144 ReduceTuple rr;
1145 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1146 for (N i = 0; i < n; ++i) {
1147 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1148 }
1149 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1150 reduce_data.reference(OpenMP::get_thread_num()), rr);
1151 }
1152
1153 template <typename D>
1154 typename D::Type value (D & reduce_data)
1155 {
1156 auto& rrv = reduce_data.reference();
1157 if (! m_result_is_ready) {
1158 using ReduceTuple = typename D::Type;
1159 if (rrv.size() > 1) {
1160 for (int i = 1, N = rrv.size(); i < N; ++i) {
1161 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1162 }
1163 }
1164 m_result_is_ready = true;
1165 }
1166 return rrv[0];
1167 }
1168
1169 bool m_result_is_ready = false;
1170
1171 void resetResultReadiness () { m_result_is_ready = false; }
1172};
1173
1174namespace Reduce {
1175
1176template <typename T, typename N, typename F,
1177 typename M=std::enable_if_t<std::is_integral_v<N>> >
1178T Sum (N n, F const& f, T init_val = 0)
1179{
1180 T r = init_val;
1181#ifdef AMREX_USE_OMP
1182#pragma omp parallel for reduction(+:r)
1183#endif
1184 for (N i = 0; i < n; ++i) {
1185 r += f(i);
1186 }
1187 return r;
1188}
1189
1190template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1191T Sum (N n, T const* v, T init_val = 0)
1192{
1193 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1194}
1195
1196template <typename T, typename N, typename F,
1197 typename M=std::enable_if_t<std::is_integral_v<N>> >
1198T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
1199{
1200 T r = init_val;
1201#ifdef AMREX_USE_OMP
1202#pragma omp parallel for reduction(min:r)
1203#endif
1204 for (N i = 0; i < n; ++i) {
1205 r = std::min(r,f(i));
1206 }
1207 return r;
1208}
1209
1210template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1211T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
1212{
1213 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1214}
1215
1216template <typename T, typename N, typename F,
1217 typename M=std::enable_if_t<std::is_integral_v<N>> >
1218T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
1219{
1220 T r = init_val;
1221#ifdef AMREX_USE_OMP
1222#pragma omp parallel for reduction(max:r)
1223#endif
1224 for (N i = 0; i < n; ++i) {
1225 r = std::max(r,f(i));
1226 }
1227 return r;
1228}
1229
1230template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1231T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
1232{
1233 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1234}
1235
1236template <typename T, typename N, typename F,
1237 typename M=std::enable_if_t<std::is_integral_v<N>> >
1238std::pair<T,T> MinMax (N n, F const& f)
1239{
1240 T r_min = std::numeric_limits<T>::max();
1241 T r_max = std::numeric_limits<T>::lowest();
1242#ifdef AMREX_USE_OMP
1243#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1244#endif
1245 for (N i = 0; i < n; ++i) {
1246 T tmp = f(i);
1247 r_min = std::min(r_min,tmp);
1248 r_max = std::max(r_max,tmp);
1249 }
1250 return std::make_pair(r_min,r_max);
1251}
1252
1253template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1254std::pair<T,T> MinMax (N n, T const* v)
1255{
1256 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1257}
1258
1259template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1260bool AnyOf (N n, T const* v, P&& pred)
1261{
1262 return std::any_of(v, v+n, std::forward<P>(pred));
1263}
1264
1265template <typename P>
1266bool AnyOf (Box const& box, P const& pred)
1267{
1268 const auto lo = amrex::lbound(box);
1269 const auto hi = amrex::ubound(box);
1270 for (int k = lo.z; k <= hi.z; ++k) {
1271 for (int j = lo.y; j <= hi.y; ++j) {
1272 for (int i = lo.x; i <= hi.x; ++i) {
1273 if (pred(i,j,k)) { return true; }
1274 }}}
1275 return false;
1276}
1277
1278}
1279
1280#endif
1281
1282/**
1283 * Return a GpuTuple containing the identity element for each operation in
1284 * ReduceOps.
1285 */
1286template <typename... Ts, typename... Ps>
1287AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
1288constexpr GpuTuple<Ts...>
1289IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
1290{
1291 GpuTuple<Ts...> r{};
1292 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1293 return r;
1294}
1295
1300template <typename... Ts, typename... Ps>
1301AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
1302constexpr GpuTuple<Ts...>
1303IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
1304{
1305 GpuTuple<Ts...> r{};
1306 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1307 return r;
1308}
1309
1310}
1311
1312#endif
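The free functions in namespace amrex::Reduce (Sum, Min, Max, MinMax, AnyOf) wrap the ReduceOps/ReduceData machinery above for simple 1D ranges and work with either the GPU or the OpenMP implementation. A minimal sketch, assuming `p` points to `n` device-accessible amrex::Real values (the names are illustrative, not part of this header):

    amrex::Real total = amrex::Reduce::Sum(n, p);
    auto mm = amrex::Reduce::MinMax<amrex::Real>(n, p);   // std::pair<Real,Real> of {min, max}
    bool any_negative = amrex::Reduce::AnyOf(n, p,
        [=] AMREX_GPU_DEVICE (amrex::Real x) { return x < amrex::Real(0.0); });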
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_MAX_STREAMS
Definition AMReX_GpuDevice.H:21
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:35
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1090
#define AMREX_D_PICK(a, b, c)
Definition AMReX_SPACE.H:173
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
__host__ __device__ Long numPts() const noexcept
Returns the number of points contained in the BoxND.
Definition AMReX_Box.H:349
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Returns the indexing type.
Definition AMReX_Box.H:130
Definition AMReX_Tuple.H:93
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:690
Definition AMReX_MFIter.H:57
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:141
Definition AMReX_PODVector.H:297
T * data() noexcept
Definition AMReX_PODVector.H:655
Definition AMReX_Reduce.H:249
~ReduceData()
Definition AMReX_Reduce.H:271
int maxStreamIndex() const
Definition AMReX_Reduce.H:304
Type value()
Definition AMReX_Reduce.H:281
void updateMaxStreamIndex(gpuStream_t const &s)
Definition AMReX_Reduce.H:305
Type * m_host_tuple
Definition AMReX_Reduce.H:312
int & nBlocks(gpuStream_t const &s)
Definition AMReX_Reduce.H:300
ReduceData(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:254
int m_max_blocks
Definition AMReX_Reduce.H:310
GpuArray< int, 8 > m_nblocks
Definition AMReX_Reduce.H:314
Type * devicePtr(gpuStream_t const &s)
Definition AMReX_Reduce.H:293
Type value(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:287
Type * devicePtr()
Definition AMReX_Reduce.H:292
Type * m_device_tuple
Definition AMReX_Reduce.H:313
GpuArray< int, 8 > & nBlocks()
Definition AMReX_Reduce.H:299
ReduceData(ReduceData< Ts... > const &)=delete
std::function< Type()> m_fn_value
Definition AMReX_Reduce.H:315
Type * hostPtr()
Definition AMReX_Reduce.H:297
int maxBlocks() const
Definition AMReX_Reduce.H:302
ReduceData(ReduceData< Ts... > &&)=delete
Definition AMReX_Reduce.H:364
D::Type value(D &reduce_data)
Definition AMReX_Reduce.H:652
void eval(Box const &box, N ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:545
void eval_mf(I, MF const &mf, IntVect const &nghost, int ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:369
void eval(Box const &box, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:486
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:433
void resetResultReadiness()
Definition AMReX_Reduce.H:740
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, int ncomp, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:461
void eval(N n, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:605
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:417
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:354
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:453
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:398
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:348
void Sum(T &v, MPI_Comm comm)
Definition AMReX_ParallelReduce.H:204
__device__ void for_each_parallel(T &d, T const &s)
Definition AMReX_Reduce.H:38
__host__ __device__ constexpr void for_each_init(T &t)
Definition AMReX_Reduce.H:70
__host__ __device__ void for_each_local(T &d, T const &s)
Definition AMReX_Reduce.H:55
__device__ void mf_call_f(F const &f, int ibox, int i, int j, int k, int, T &r) noexcept
Definition AMReX_Reduce.H:344
std::pair< T, T > MinMax(N n, T const *v)
Definition AMReX_Reduce.H:815
bool AnyOf(N n, T const *v, P const &pred)
Definition AMReX_Reduce.H:843
void Reduce(ReduceOp, T *, int, int, MPI_Comm)
Definition AMReX_ParallelReduce.H:92
AMREX_GPU_DEVICE auto call_f(F const &f, int b, int i, int j, int k, int) noexcept -> decltype(f(0, 0, 0, 0))
Definition AMReX_MFParallelForG.H:44
Definition AMReX_Amr.cpp:49
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:319
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:138
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:326
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
__host__ __device__ constexpr GpuTuple< Ts... > IdentityTuple(GpuTuple< Ts... >, ReduceOps< Ps... >) noexcept
Return a GpuTuple containing the identity element for each operation in ReduceOps....
Definition AMReX_Reduce.H:1289
BoxND< 3 > Box
Definition AMReX_BaseFwd.H:27
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:21
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:35
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:745
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1229
Arena * The_Arena()
Definition AMReX_Arena.cpp:705
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:312
Definition AMReX_Box.H:2045
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2086
Definition AMReX_Array.H:34
Definition AMReX_GpuMemory.H:56
T dataValue() const
Definition AMReX_GpuMemory.H:92
T * dataPtr()
Definition AMReX_GpuMemory.H:90
Definition AMReX_GpuLaunch.H:132
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuControl.H:131
Definition AMReX_GpuReduce.H:285
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:209
Definition AMReX_Functional.H:14
Definition AMReX_Reduce.H:180
constexpr std::enable_if_t< std::is_integral_v< T > > init(T &t) const noexcept
Definition AMReX_Reduce.H:208
__device__ std::enable_if_t< std::is_integral_v< T > > parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:194
__host__ __device__ std::enable_if_t< std::is_integral_v< T > > local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:204
Definition AMReX_Reduce.H:212
constexpr std::enable_if_t< std::is_integral_v< T > > init(T &t) const noexcept
Definition AMReX_Reduce.H:240
__host__ __device__ std::enable_if_t< std::is_integral_v< T > > local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:236
__device__ std::enable_if_t< std::is_integral_v< T > > parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:226
Definition AMReX_Reduce.H:147
constexpr std::enable_if_t< std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:172
constexpr std::enable_if_t<!std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:176
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:168
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:159
Definition AMReX_Reduce.H:114
constexpr std::enable_if_t<!std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:143
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:126
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:135
constexpr std::enable_if_t< std::numeric_limits< T >::is_specialized > init(T &t) const noexcept
Definition AMReX_Reduce.H:139
Definition AMReX_Reduce.H:85
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:98
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:107
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:110
Definition AMReX_Reduce.H:339
Definition AMReX_Reduce.H:338
Struct for holding types.
Definition AMReX_TypeList.H:12