Block-Structured AMR Software Framework
AMReX_Reduce.H
1#ifndef AMREX_REDUCE_H_
2#define AMREX_REDUCE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_Arena.H>
7#include <AMReX_OpenMP.H>
8#include <AMReX_MFIter.H>
9#include <AMReX_ValLocPair.H>
10
11#include <algorithm>
12#include <functional>
13#include <limits>
14
15namespace amrex {
16
18namespace Reduce::detail {
19
20#ifdef AMREX_USE_GPU
21#ifdef AMREX_USE_SYCL
22 template <std::size_t I, typename T, typename P>
24 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
25 {
26 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
27 }
28
29 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
31 void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
32 {
33 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
34 for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
35 }
36#else
37 template <std::size_t I, typename T, typename P>
39 void for_each_parallel (T& d, T const& s)
40 {
41 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
42 }
43
44 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
46 void for_each_parallel (T& d, T const& s)
47 {
48 P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
49 for_each_parallel<I+1,T,P1,Ps...>(d, s);
50 }
51#endif
52#endif
53
54 template <std::size_t I, typename T, typename P>
56 void for_each_local (T& d, T const& s)
57 {
58 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
59 }
60
61 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
63 void for_each_local (T& d, T const& s)
64 {
65 P().local_update(amrex::get<I>(d), amrex::get<I>(s));
66 for_each_local<I+1,T,P1,Ps...>(d, s);
67 }
68
69 template <std::size_t I, typename T, typename P>
71 constexpr void for_each_init (T& t)
72 {
73 P().init(amrex::get<I>(t));
74 }
75
76 template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
78 constexpr void for_each_init (T& t)
79 {
80 P().init(amrex::get<I>(t));
81 for_each_init<I+1,T,P1,Ps...>(t);
82 }
83}
85
87struct ReduceOpSum
88{
89
90#ifdef AMREX_USE_GPU
91#ifdef AMREX_USE_SYCL
92 template <typename T>
94 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
95 T r = Gpu::blockReduceSum(s,h);
96 if (h.threadIdx() == 0) { d += r; }
97 }
98#else
99 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
101 void parallel_update (T& d, T const& s) const noexcept {
102 T r = Gpu::blockReduceSum<MT>(s);
103 if (threadIdx.x == 0) { d += r; }
104 }
105#endif
106#endif
107
108 template <typename T>
110 void local_update (T& d, T const& s) const noexcept { d += s; }
111
112 template <typename T>
113 constexpr void init (T& t) const noexcept { t = 0; }
114};
115
117struct ReduceOpMin
118{
119#ifdef AMREX_USE_GPU
120#ifdef AMREX_USE_SYCL
121 template <typename T>
123 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
124 T r = Gpu::blockReduceMin(s,h);
125 if (h.threadIdx() == 0) { d = amrex::min(d,r); }
126 }
127#else
128 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
130 void parallel_update (T& d, T const& s) const noexcept {
131 T r = Gpu::blockReduceMin<MT>(s);
132 if (threadIdx.x == 0) { d = amrex::min(d,r); }
133 }
134#endif
135#endif
136
137 template <typename T>
139 void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); }
140
141 template <typename T>
142 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
143 init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }
144
145 template <typename T>
146 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
147 init (T& t) const noexcept { t = T::max(); }
148};
149
151struct ReduceOpMax
152{
153#ifdef AMREX_USE_GPU
154#ifdef AMREX_USE_SYCL
155 template <typename T>
157 void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
158 T r = Gpu::blockReduceMax(s,h);
159 if (h.threadIdx() == 0) { d = amrex::max(d,r); }
160 }
161#else
162 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
164 void parallel_update (T& d, T const& s) const noexcept {
165 T r = Gpu::blockReduceMax<MT>(s);
166 if (threadIdx.x == 0) { d = amrex::max(d,r); }
167 }
168#endif
169#endif
170
171 template <typename T>
173 void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); }
174
175 template <typename T>
176 constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
177 init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }
178
179 template <typename T>
180 constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
181 init (T& t) const noexcept { t = T::lowest(); }
182};
183
185struct ReduceOpLogicalAnd
186{
187#ifdef AMREX_USE_GPU
188#ifdef AMREX_USE_SYCL
189 template <typename T>
191 std::enable_if_t<std::is_integral_v<T>>
192 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
193 T r = Gpu::blockReduceLogicalAnd(s,h);
194 if (h.threadIdx() == 0) { d = d && r; }
195 }
196#else
197 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
199 std::enable_if_t<std::is_integral_v<T>>
200 parallel_update (T& d, T s) const noexcept {
201 T r = Gpu::blockReduceLogicalAnd<MT>(s);
202 if (threadIdx.x == 0) { d = d && r; }
203 }
204#endif
205#endif
206
207 template <typename T>
209 std::enable_if_t<std::is_integral_v<T>>
210 local_update (T& d, T s) const noexcept { d = d && s; }
211
212 template <typename T>
213 constexpr std::enable_if_t<std::is_integral_v<T>>
214 init (T& t) const noexcept { t = true; }
215};
216
218struct ReduceOpLogicalOr
219{
220#ifdef AMREX_USE_GPU
221#ifdef AMREX_USE_SYCL
222 template <typename T>
224 std::enable_if_t<std::is_integral_v<T>>
225 parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
226 T r = Gpu::blockReduceLogicalOr(s,h);
227 if (h.threadIdx() == 0) { d = d || r; }
228 }
229#else
230 template <typename T, int MT=AMREX_GPU_MAX_THREADS>
232 std::enable_if_t<std::is_integral_v<T>>
233 parallel_update (T& d, T s) const noexcept {
234 T r = Gpu::blockReduceLogicalOr<MT>(s);
235 if (threadIdx.x == 0) { d = d || r; }
236 }
237#endif
238#endif
239
240 template <typename T>
242 std::enable_if_t<std::is_integral_v<T>>
243 local_update (T& d, T s) const noexcept { d = d || s; }
244
245 template <typename T>
246 constexpr std::enable_if_t<std::is_integral_v<T>>
247 init (T& t) const noexcept { t = false; }
248};
249
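// Illustration: every reduce op above follows the same protocol. init() writes
// the identity element, local_update() folds one value into a thread-local
// accumulator, and parallel_update() combines per-thread results across a GPU
// block. A minimal host-side sketch using ReduceOpSum:
//
//   double acc;
//   ReduceOpSum().init(acc);              // acc = 0 (identity of +)
//   ReduceOpSum().local_update(acc, 3.0); // acc += 3.0
//   ReduceOpSum().local_update(acc, 4.5); // acc += 4.5, so acc == 7.5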
250template <typename... Ps> class ReduceOps;
251
252#ifdef AMREX_USE_GPU
253
255template <typename... Ts>
256class ReduceData
257{
258public:
259 using Type = GpuTuple<Ts...>;
260
261 template <typename... Ps>
262 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
263 : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),
264 m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),
265 m_device_tuple((Type*)(The_Arena()->alloc((AMREX_GPU_MAX_STREAMS)
266 * m_max_blocks * sizeof(Type)))),
267 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
268 {
269 reduce_op.resetResultReadiness();
270 static_assert(std::is_trivially_copyable<Type>(),
271 "ReduceData::Type must be trivially copyable");
272 static_assert(std::is_trivially_destructible<Type>(),
273 "ReduceData::Type must be trivially destructible");
274
275 new (m_host_tuple) Type();
276 m_nblocks.fill(0);
277 }
278
279 ~ReduceData () {
280 The_Pinned_Arena()->free(m_host_tuple);
281 The_Arena()->free(m_device_tuple);
282 }
283
284 ReduceData (ReduceData<Ts...> const&) = delete;
285 ReduceData (ReduceData<Ts...> &&) = delete;
286 void operator= (ReduceData<Ts...> const&) = delete;
287 void operator= (ReduceData<Ts...> &&) = delete;
288
289 Type value ()
290 {
291 return m_fn_value();
292 }
293
294 template <typename... Ps>
295 Type value (ReduceOps<Ps...>& reduce_op)
296 {
297 return reduce_op.value(*this);
298 }
299
300 Type* devicePtr () { return m_device_tuple; }
301 Type* devicePtr (gpuStream_t const& s) {
302 return m_device_tuple+(Gpu::Device::streamIndex(s))*m_max_blocks;
303 }
304
305 Type* hostPtr () { return m_host_tuple; }
306
307 GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }
308 int& nBlocks (gpuStream_t const& s) { return m_nblocks[Gpu::Device::streamIndex(s)]; }
309
310 int maxBlocks () const { return m_max_blocks; }
311
312 int maxStreamIndex () const { return m_max_stream_index; }
313 void updateMaxStreamIndex (gpuStream_t const& s) {
314 m_max_stream_index = std::max(m_max_stream_index,Gpu::Device::streamIndex(s));
315 }
316
317private:
318 int m_max_blocks;
319 int m_max_stream_index = 0;
320 Type* m_host_tuple = nullptr;
321 Type* m_device_tuple = nullptr;
322 GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;
323 std::function<Type()> m_fn_value;
324};
325
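// Illustration: the device buffer above holds AMREX_GPU_MAX_STREAMS * maxBlocks()
// partial tuples, one contiguous slice per stream; devicePtr(s) returns the slice
// for stream s, and nBlocks(s) records how many of its slots a kernel filled.
// A sketch of the slice arithmetic (640 is just an example block count):
//
//   // devicePtr(s) == m_device_tuple + Gpu::Device::streamIndex(s) * m_max_blocks
//   // with m_max_blocks == 640:  stream 0 -> slots [0,640), stream 1 -> [640,1280), ...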
327namespace Reduce::detail {
328
329 // call_f_intvect_box
330
331 template <typename F, int dim>
333 auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->
334 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
335 {
336 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
337 }
338
339 template <typename F, int dim>
341 auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
342 decltype(f(BoxND<dim>(iv, iv, t)))
343 {
344 return f(BoxND<dim>(iv, iv, t));
345 }
346
347 // call_f_intvect_n
348
349 template <typename F, typename T, int dim>
351 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
352 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
353 {
354 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
355 }
356
357 // mf_call_f
358
360 struct iterate_box {};
361 struct iterate_box_comp {};
363
364 template <typename I, typename F, typename T, typename... Ps,
365 std::enable_if_t<std::is_same<iterate_box,I>::value,int> = 0>
367 void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
368 {
369 auto const& pr = f(ibox,i,j,k);
370 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
371 }
372
373 template <typename I, typename F, typename T, typename... Ps,
374 std::enable_if_t<std::is_same<iterate_box_comp,I>::value,int> = 0>
376 void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
377 {
378 for (int n = 0; n < ncomp; ++n) {
379 auto const& pr = f(ibox,i,j,k,n);
380 Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
381 }
382 }
383}
385
387template <typename... Ps>
388class ReduceOps
389{
390public:
391
392 // This is public for CUDA
393 template <typename I, typename MF, typename D, typename F>
394 void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
395 {
396 using ReduceTuple = typename D::Type;
397 const int nboxes = mf.local_size();
398 if (nboxes > 0) {
399 auto const& parforinfo = mf.getParForInfo(nghost);
400 auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
401 AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
402 const int nblocks = nblocks_per_box * nboxes;
403 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
404
405 auto const& stream = Gpu::gpuStream();
406 auto pdst = reduce_data.devicePtr(stream);
407 int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
408 AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
409 reduce_data.nBlocks(stream) = nblocks_ec;
410 reduce_data.updateMaxStreamIndex(stream);
411
412#ifdef AMREX_USE_SYCL
413 // device reduce needs local(i.e., shared) memory
414 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
415 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
416 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
417 {
418 Dim1 blockIdx {gh.blockIdx()};
419 Dim1 threadIdx{gh.threadIdx()};
420#else
421 amrex::launch_global<AMREX_GPU_MAX_THREADS>
422 <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
423 ([=] AMREX_GPU_DEVICE () noexcept
424 {
425#endif
426 ReduceTuple r;
427 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
428 ReduceTuple& dst = pdst[blockIdx.x];
429 if (threadIdx.x == 0) {
430 dst = r;
431 }
432 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
433 int ibox = iblock / nblocks_per_box;
434 auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
435
436 BoxIndexer const& indexer = dp_boxes[ibox];
437 if (icell < indexer.numPts()) {
438 auto [i, j, k] = indexer(icell);
439 Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
440 (f, ibox, i, j, k, ncomp, r);
441 }
442 }
443#ifdef AMREX_USE_SYCL
444 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
445#else
446 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
447#endif
448 });
449 }
450 }
451
452 template <typename MF, typename D, typename F>
453 std::enable_if_t<IsFabArray<MF>::value
454#ifndef AMREX_USE_CUDA
455 && IsCallable<F, int, int, int, int>::value
456#endif
457 >
458 eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
459 {
460 using ReduceTuple = typename D::Type;
461 const int nboxes = mf.local_size();
462 if (nboxes == 0) {
463 return;
464 } else if (!mf.isFusingCandidate()) {
465 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
466 Box const& b = amrex::grow(mfi.validbox(), nghost);
467 const int li = mfi.LocalIndex();
468 this->eval(b, reduce_data,
469 [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
470 {
471 return f(li, i, j, k);
472 });
473 }
474 } else {
475 eval_mf(Reduce::detail::iterate_box{},
476 mf, nghost, 0, reduce_data, std::forward<F>(f));
477 }
478 }
479
480 template <typename MF, typename D, typename F>
481 std::enable_if_t<IsFabArray<MF>::value
482#ifndef AMREX_USE_CUDA
483 && IsCallable<F, int, int, int, int, int>::value
484#endif
485 >
486 eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
487 {
488 using ReduceTuple = typename D::Type;
489
490 const int nboxes = mf.local_size();
491
492 if (nboxes == 0) {
493 return;
494 } else if (!mf.isFusingCandidate()) {
495 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
496 Box const& b = amrex::grow(mfi.validbox(), nghost);
497 const int li = mfi.LocalIndex();
498 this->eval(b, ncomp, reduce_data,
499 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
500 {
501 return f(li, i, j, k, n);
502 });
503 }
504 } else {
505 eval_mf(Reduce::detail::iterate_box_comp{},
506 mf, nghost, ncomp, reduce_data, std::forward<F>(f));
507 }
508 }
509
510 template <typename D, typename F, int dim>
511 void eval (BoxND<dim> const& box, D & reduce_data, F const& f)
512 {
513 using ReduceTuple = typename D::Type;
514 auto const& stream = Gpu::gpuStream();
515 auto dp = reduce_data.devicePtr(stream);
516 int& nblocks = reduce_data.nBlocks(stream);
517 const BoxIndexerND<dim> indexer(box);
518 IndexTypeND<dim> ixtype = box.ixType();
519 constexpr int nitems_per_thread = 4;
520 Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
521 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
522 nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
523 reduce_data.updateMaxStreamIndex(stream);
524#ifdef AMREX_USE_SYCL
525 // device reduce needs local(i.e., shared) memory
526 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
527 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
528 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
529 {
530 Dim1 blockIdx {gh.blockIdx()};
531 Dim1 threadIdx{gh.threadIdx()};
532 Dim1 gridDim {gh.gridDim()};
533#else
534 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
535 [=] AMREX_GPU_DEVICE () noexcept
536 {
537#endif
538 ReduceTuple r;
539 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
540 ReduceTuple& dst = *(dp+blockIdx.x);
541 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
542 dst = r;
543 }
544 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
545 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
546 icell < indexer.numPts();
547 icell += stride)
548 {
549 auto iv = indexer.intVect(icell);
550 auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
551 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
552 }
553#ifdef AMREX_USE_SYCL
554 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
555#else
556 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
557#endif
558 });
559 nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
560 }
561
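// Illustration of the launch-size heuristic used by these eval() overloads:
// each thread is given about nitems_per_thread (= 4) cells, and the block count
// is capped at ReduceData::maxBlocks(). For example, assuming a 128^3 box and
// AMREX_GPU_MAX_THREADS == 256:
//
//   // numPts     = 128*128*128               = 2,097,152 cells
//   // nblocks_ec = ceil(2097152 / (4*256))   = 2048 blocks (before the cap)
//   // each block then strides through the box, reducing into one tuple slot.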
562 template <typename N, typename D, typename F, int dim,
563 typename M=std::enable_if_t<std::is_integral_v<N>> >
564 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
565 {
566 using ReduceTuple = typename D::Type;
567 auto const& stream = Gpu::gpuStream();
568 auto dp = reduce_data.devicePtr(stream);
569 int& nblocks = reduce_data.nBlocks(stream);
570 const BoxIndexerND<dim> indexer(box);
571 constexpr int nitems_per_thread = 4;
572 Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
573 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
574 nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
575 reduce_data.updateMaxStreamIndex(stream);
576#ifdef AMREX_USE_SYCL
577 // device reduce needs local(i.e., shared) memory
578 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
579 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
580 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
581 {
582 Dim1 blockIdx {gh.blockIdx()};
583 Dim1 threadIdx{gh.threadIdx()};
584 Dim1 gridDim {gh.gridDim()};
585#else
586 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
587 [=] AMREX_GPU_DEVICE () noexcept
588 {
589#endif
590 ReduceTuple r;
591 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
592 ReduceTuple& dst = *(dp+blockIdx.x);
593 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
594 dst = r;
595 }
596 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
597 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
598 icell < indexer.numPts();
599 icell += stride)
600 {
601 auto iv = indexer.intVect(icell);
602 for (N n = 0; n < ncomp; ++n) {
603 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
604 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
605 }
606 }
607#ifdef AMREX_USE_SYCL
608 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
609#else
610 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
611#endif
612 });
613 nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
614 }
615
616 template <typename N, typename D, typename F,
617 typename M=std::enable_if_t<std::is_integral_v<N>> >
618 void eval (N n, D & reduce_data, F const& f)
619 {
620 if (n <= 0) { return; }
621 using ReduceTuple = typename D::Type;
622 auto const& stream = Gpu::gpuStream();
623 auto dp = reduce_data.devicePtr(stream);
624 int& nblocks = reduce_data.nBlocks(stream);
625 constexpr int nitems_per_thread = 4;
626 int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
627 / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
628 nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
629 reduce_data.updateMaxStreamIndex(stream);
630#ifdef AMREX_USE_SYCL
631 // device reduce needs local(i.e., shared) memory
632 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
633 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
634 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
635 {
636 Dim1 blockIdx {gh.blockIdx()};
637 Dim1 threadIdx{gh.threadIdx()};
638 Dim1 gridDim {gh.gridDim()};
639#else
640 amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
641 [=] AMREX_GPU_DEVICE () noexcept
642 {
643#endif
644 ReduceTuple r;
645 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
646 ReduceTuple& dst = *(dp+blockIdx.x);
647 if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
648 dst = r;
649 }
650 for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
651 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
652 i < n;
653 i += stride)
654 {
655 auto pr = f(i);
656 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
657 }
658#ifdef AMREX_USE_SYCL
659 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
660#else
661 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
662#endif
663 });
664 nblocks = amrex::max(nblocks, nblocks_ec);
665 }
666
667 template <typename D>
668 typename D::Type value (D & reduce_data)
669 {
670 auto hp = reduce_data.hostPtr();
671
672 if (m_result_is_ready) {
673 return *hp;
674 }
675
676 using ReduceTuple = typename D::Type;
677 auto const& stream = Gpu::gpuStream();
678 auto dp = reduce_data.devicePtr();
679 auto const& nblocks = reduce_data.nBlocks();
680#if defined(AMREX_USE_SYCL)
681 if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
682 const int N = nblocks[0];
683 if (N == 0) {
684 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
685 } else {
686 Gpu::PinnedVector<ReduceTuple> tmp(N);
687 Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
688 Gpu::streamSynchronize();
689 for (int i = 1; i < N; ++i) {
690 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
691 }
692 *hp = tmp[0];
693 }
694 } else
695#endif
696 {
697 int maxblocks = reduce_data.maxBlocks();
698#ifdef AMREX_USE_SYCL
699 // device reduce needs local(i.e., shared) memory
700 constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
701#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
702 // xxxxx SYCL todo: reduce bug workaround
703 Gpu::DeviceVector<ReduceTuple> dtmp(1);
704 auto presult = dtmp.data();
705#else
706 auto presult = hp;
707#endif
708 amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
709 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
710 {
711 ReduceTuple r;
712 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
713 ReduceTuple dst = r;
714 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
715 auto dp_stream = dp+istream*maxblocks;
716 for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
717 i < nblocks[istream]; i += stride) {
718 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
719 }
720 }
721 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
722 if (gh.threadIdx() == 0) { *presult = dst; }
723 });
724#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
725 Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
726#endif
727#else
728 amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
729 [=] AMREX_GPU_DEVICE () noexcept
730 {
731 ReduceTuple r;
732 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
733 ReduceTuple dst = r;
734 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
735 auto dp_stream = dp+istream*maxblocks;
736 for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
737 i < nblocks[istream]; i += stride) {
738 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
739 }
740 }
741 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
742 if (threadIdx.x == 0) { *hp = dst; }
743 });
744#endif
745 Gpu::streamSynchronize();
746 }
747
748 m_result_is_ready = true;
749 return *hp;
750 }
751
752private:
753 bool m_result_is_ready = false;
754
755public:
756 void resetResultReadiness () { m_result_is_ready = false; }
757};
758
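// Usage sketch: a fused sum/min/max reduction over a Box. Here `a` stands for a
// hypothetical Array4<Real> and `box` for the region to reduce over; the lambda
// returns one value per reduce operation, in the order the operations appear in
// ReduceOps.
//
//   ReduceOps<ReduceOpSum, ReduceOpMin, ReduceOpMax> reduce_op;
//   ReduceData<Real, Real, Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   reduce_op.eval(box, reduce_data,
//       [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
//       {
//           return { a(i,j,k), a(i,j,k), a(i,j,k) };
//       });
//   ReduceTuple hv = reduce_data.value(reduce_op);
//   Real the_sum = amrex::get<0>(hv);
//   Real the_min = amrex::get<1>(hv);
//   Real the_max = amrex::get<2>(hv);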
759namespace Reduce {
760
762template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
763T Sum (N n, T const* v, T init_val = 0)
764{
765 ReduceOps<ReduceOpSum> reduce_op;
766 ReduceData<T> reduce_data(reduce_op);
767 using ReduceTuple = typename decltype(reduce_data)::Type;
768 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
769 ReduceTuple hv = reduce_data.value(reduce_op);
770 return amrex::get<0>(hv) + init_val;
771}
772
774template <typename T, typename N, typename F,
775 typename M=std::enable_if_t<std::is_integral_v<N>> >
776T Sum (N n, F const& f, T init_val = 0)
777{
778 ReduceOps<ReduceOpSum> reduce_op;
779 ReduceData<T> reduce_data(reduce_op);
780 using ReduceTuple = typename decltype(reduce_data)::Type;
781 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
782 ReduceTuple hv = reduce_data.value(reduce_op);
783 return amrex::get<0>(hv) + init_val;
784}
785
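// Usage sketch for these Reduce::Sum overloads (the Min/Max/MinMax wrappers below
// follow the same pattern): reduce a device array or an index-based functor without
// constructing ReduceOps/ReduceData by hand. `d_ptr` is a hypothetical pointer to
// n Reals in device-accessible memory.
//
//   Real total  = Reduce::Sum(n, d_ptr);                                 // sum of d_ptr[0..n)
//   Real sq_sum = Reduce::Sum<Real>(n,
//       [=] AMREX_GPU_DEVICE (int i) { return d_ptr[i]*d_ptr[i]; });     // sum of squares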
787template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
788T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
789{
790 ReduceOps<ReduceOpMin> reduce_op;
791 ReduceData<T> reduce_data(reduce_op);
792 using ReduceTuple = typename decltype(reduce_data)::Type;
793 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
794 ReduceTuple hv = reduce_data.value(reduce_op);
795 return std::min(amrex::get<0>(hv),init_val);
796}
797
799template <typename T, typename N, typename F,
800 typename M=std::enable_if_t<std::is_integral_v<N>> >
801T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
802{
803 ReduceOps<ReduceOpMin> reduce_op;
804 ReduceData<T> reduce_data(reduce_op);
805 using ReduceTuple = typename decltype(reduce_data)::Type;
806 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
807 ReduceTuple hv = reduce_data.value(reduce_op);
808 return std::min(amrex::get<0>(hv),init_val);
809}
810
812template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
813T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
814{
815 ReduceOps<ReduceOpMax> reduce_op;
816 ReduceData<T> reduce_data(reduce_op);
817 using ReduceTuple = typename decltype(reduce_data)::Type;
818 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {v[i]}; });
819 ReduceTuple hv = reduce_data.value(reduce_op);
820 return std::max(amrex::get<0>(hv),init_val);
821}
822
824template <typename T, typename N, typename F,
825 typename M=std::enable_if_t<std::is_integral_v<N>> >
826T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
827{
828 ReduceOps<ReduceOpMax> reduce_op;
829 ReduceData<T> reduce_data(reduce_op);
830 using ReduceTuple = typename decltype(reduce_data)::Type;
831 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return {f(i)}; });
832 ReduceTuple hv = reduce_data.value(reduce_op);
833 return std::max(amrex::get<0>(hv),init_val);
834}
835
837template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
838std::pair<T,T> MinMax (N n, T const* v)
839{
840 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
841 ReduceData<T,T> reduce_data(reduce_op);
842 using ReduceTuple = typename decltype(reduce_data)::Type;
843 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
844 return {v[i],v[i]};
845 });
846 auto hv = reduce_data.value(reduce_op);
847 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
848}
849
851template <typename T, typename N, typename F,
852 typename M=std::enable_if_t<std::is_integral_v<N>> >
853std::pair<T,T> MinMax (N n, F const& f)
854{
855 ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
856 ReduceData<T,T> reduce_data(reduce_op);
857 using ReduceTuple = typename decltype(reduce_data)::Type;
858 reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
859 T tmp = f(i);
860 return {tmp,tmp};
861 });
862 auto hv = reduce_data.value(reduce_op);
863 return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
864}
865
867template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
868bool AnyOf (N n, T const* v, P const& pred)
869{
870 Gpu::LaunchSafeGuard lsg(true);
871 Gpu::DeviceScalar<int> ds(false);
872 int* dp = ds.dataPtr();
873 auto ec = Gpu::ExecutionConfig(n);
874 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
875
876#ifdef AMREX_USE_SYCL
877 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
878 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
879 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
880 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
881 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
882 if (gh.threadIdx() == 0) { *has_any = *dp; }
883 gh.sharedBarrier();
884
885 if (!(*has_any))
886 {
887 int r = false;
888 for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
889 i < n && !r; i += stride)
890 {
891 r = pred(v[i]) ? 1 : 0;
892 }
893
894 r = Gpu::blockReduce<Gpu::Device::warp_size>
895 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
896 if (gh.threadIdx() == 0 && r) { *dp = 1; }
897 }
898 });
899#else
900 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
901 [=] AMREX_GPU_DEVICE () noexcept {
902 __shared__ int has_any;
903 if (threadIdx.x == 0) { has_any = *dp; }
904 __syncthreads();
905
906 if (!has_any)
907 {
908 int r = false;
909 for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
910 i < n && !r; i += stride)
911 {
912 r = pred(v[i]) ? 1 : 0;
913 }
914 r = Gpu::blockReduce<Gpu::Device::warp_size>
915 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
916 if (threadIdx.x == 0 && r) *dp = 1;
917 }
918 });
919#endif
920 return ds.dataValue();
921}
922
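// Usage sketch for AnyOf: early-exit test of a predicate over device data.
// `d_ptr` is a hypothetical device pointer to n ints; the predicate runs on
// the device, so it needs AMREX_GPU_DEVICE.
//
//   bool has_negative = Reduce::AnyOf(n, d_ptr,
//       [=] AMREX_GPU_DEVICE (int x) { return x < 0; });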
924template <typename P, int dim>
925bool AnyOf (BoxND<dim> const& box, P const& pred)
926{
927 Gpu::LaunchSafeGuard lsg(true);
929 Gpu::DeviceScalar<int> ds(false);
930 int* dp = ds.dataPtr();
930 const BoxIndexerND<dim> indexer(box);
931 auto ec = Gpu::ExecutionConfig(box.numPts());
932 ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
933
934#ifdef AMREX_USE_SYCL
935 const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
936 const std::size_t shared_mem_bytes = num_ints*sizeof(int);
937 amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
938 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
939 int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
940 if (gh.threadIdx() == 0) { *has_any = *dp; }
941 gh.sharedBarrier();
942
943 if (!(*has_any))
944 {
945 int r = false;
946 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
947 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
948 icell < indexer.numPts() && !r;
949 icell += stride)
950 {
951 auto iv = indexer.intVect(icell);
952 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
953 }
954 r = Gpu::blockReduce<Gpu::Device::warp_size>
955 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
956 if (gh.threadIdx() == 0 && r) { *dp = 1; }
957 }
958 });
959#else
960 AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
961 Gpu::gpuStream(),
962 [=] AMREX_GPU_DEVICE () noexcept {
963 __shared__ int has_any;
964 if (threadIdx.x == 0) { has_any = *dp; }
965 __syncthreads();
966
967 if (!has_any)
968 {
969 int r = false;
970 for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
971 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
972 icell < indexer.numPts() && !r;
973 icell += stride)
974 {
975 auto iv = indexer.intVect(icell);
976 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
977 }
978 r = Gpu::blockReduce<Gpu::Device::warp_size>
979 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
980 if (threadIdx.x == 0 && r) *dp = 1;
981 }
982 });
983#endif
984 return ds.dataValue();
985}
986
987}
988
989#else
990
991template <typename... Ts>
992class ReduceData
993{
994public:
995 using Type = GpuTuple<Ts...>;
996
997 template <typename... Ps>
998 explicit ReduceData (ReduceOps<Ps...>& reduce_op)
999 : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
1000 m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
1001 {
1002 reduce_op.resetResultReadiness();
1003 for (auto& t : m_tuple) {
1004 Reduce::detail::for_each_init<0, Type, Ps...>(t);
1005 }
1006 }
1007
1008 ~ReduceData () = default;
1009 ReduceData (ReduceData<Ts...> const&) = delete;
1010 ReduceData (ReduceData<Ts...> &&) = delete;
1011 void operator= (ReduceData<Ts...> const&) = delete;
1012 void operator= (ReduceData<Ts...> &&) = delete;
1013
1014 Type value () { return m_fn_value(); }
1015
1016 template <typename... Ps>
1017 Type value (ReduceOps<Ps...>& reduce_op)
1018 {
1019 return reduce_op.value(*this);
1020 }
1021
1022 Vector<Type>& reference () { return m_tuple; }
1023
1024 Type& reference (int tid)
1025 {
1026 if (m_tuple.size() == 1) {
1027 // No OpenMP or already inside OpenMP parallel when reduce_data is constructed
1028 return m_tuple[0];
1029 } else {
1030 return m_tuple[tid];
1031 }
1032 }
1033
1034private:
1035 Vector<Type> m_tuple;
1036 std::function<Type()> m_fn_value;
1037};
1038
1039namespace Reduce::detail {
1040
1041 // call_f_intvect
1042
1043 template <typename F, int dim>
1045 auto call_f_intvect (F const& f, IntVectND<dim> iv) noexcept ->
1046 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
1047 {
1048 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
1049 }
1050
1051 // call_f_intvect_n
1052
1053 template <typename F, typename T, int dim>
1055 auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->
1056 decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))
1057 {
1058 return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);
1059 }
1060}
1061
1062template <typename... Ps>
1063class ReduceOps
1064{
1065private:
1066
1067 // call_f_box
1068
1069 template <typename D, typename F, int dim>
1071 static auto call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f)
1072 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(
1073 Reduce::detail::call_f_intvect(f, IntVectND<dim>(0))
1074 )>, typename D::Type>>
1075 {
1076 using ReduceTuple = typename D::Type;
1077 For(box,
1078 [&] (IntVectND<dim> iv) {
1079 auto pr = Reduce::detail::call_f_intvect(f, iv);
1080 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
1081 });
1082 }
1083
1084 template <typename D, typename F, int dim>
1086 static auto call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f)
1087 noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(box))>,
1088 typename D::Type>>
1089 {
1090 using ReduceTuple = typename D::Type;
1091 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
1092 }
1093
1094public:
1095
1096 template <typename MF, typename D, typename F>
1097 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
1098 eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
1099 {
1100 using ReduceTuple = typename D::Type;
1101#ifdef AMREX_USE_OMP
1102#pragma omp parallel
1103#endif
1104 {
1105 ReduceTuple rr;
1106 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1107 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1108 Box const& b = mfi.growntilebox(nghost);
1109 const int li = mfi.LocalIndex();
1110 const auto lo = amrex::lbound(b);
1111 const auto hi = amrex::ubound(b);
1112 for (int k = lo.z; k <= hi.z; ++k) {
1113 for (int j = lo.y; j <= hi.y; ++j) {
1114 for (int i = lo.x; i <= hi.x; ++i) {
1115 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
1116 }}}
1117 }
1118 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1119 reduce_data.reference(OpenMP::get_thread_num()), rr);
1120 }
1121 }
1122
1123 template <typename MF, typename D, typename F>
1124 std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
1125 eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
1126 {
1127 using ReduceTuple = typename D::Type;
1128#ifdef AMREX_USE_OMP
1129#pragma omp parallel
1130#endif
1131 {
1132 ReduceTuple rr;
1133 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1134 for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
1135 Box const& b = mfi.growntilebox(nghost);
1136 const int li = mfi.LocalIndex();
1137 const auto lo = amrex::lbound(b);
1138 const auto hi = amrex::ubound(b);
1139 for (int n = 0; n < ncomp; ++n) {
1140 for (int k = lo.z; k <= hi.z; ++k) {
1141 for (int j = lo.y; j <= hi.y; ++j) {
1142 for (int i = lo.x; i <= hi.x; ++i) {
1143 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
1144 }}}}
1145 }
1146 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1147 reduce_data.reference(OpenMP::get_thread_num()), rr);
1148 }
1149 }
1150
1151 template <typename D, typename F, int dim>
1152 void eval (BoxND<dim> const& box, D & reduce_data, F&& f)
1153 {
1154 using ReduceTuple = typename D::Type;
1155 ReduceTuple rr;
1156 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1157 call_f_box<D>(box, rr, std::forward<F>(f));
1158 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1159 reduce_data.reference(OpenMP::get_thread_num()), rr);
1160 }
1161
1162 template <typename N, typename D, typename F, int dim,
1163 typename M=std::enable_if_t<std::is_integral_v<N>> >
1164 void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
1165 {
1166 using ReduceTuple = typename D::Type;
1167 ReduceTuple rr;
1168 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1169 For(box, ncomp,
1170 [&] (IntVectND<dim> iv, int n) {
1171 auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
1172 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);
1173 });
1174 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1175 reduce_data.reference(OpenMP::get_thread_num()), rr);
1176 }
1177
1178 template <typename N, typename D, typename F,
1179 typename M=std::enable_if_t<std::is_integral_v<N>> >
1180 void eval (N n, D & reduce_data, F const& f)
1181 {
1182 using ReduceTuple = typename D::Type;
1183 ReduceTuple rr;
1184 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
1185 for (N i = 0; i < n; ++i) {
1186 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
1187 }
1188 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
1189 reduce_data.reference(OpenMP::get_thread_num()), rr);
1190 }
1191
1192 template <typename D>
1193 typename D::Type value (D & reduce_data)
1194 {
1195 auto& rrv = reduce_data.reference();
1196 if (! m_result_is_ready) {
1197 using ReduceTuple = typename D::Type;
1198 if (rrv.size() > 1) {
1199 for (int i = 1, N = rrv.size(); i < N; ++i) {
1200 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
1201 }
1202 }
1203 m_result_is_ready = true;
1204 }
1205 return rrv[0];
1206 }
1207
1208 bool m_result_is_ready = false;
1209
1210 void resetResultReadiness () { m_result_is_ready = false; }
1211};
1212
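// Usage sketch: on the host path the same ReduceOps/ReduceData code compiles to
// serial or OpenMP-threaded loops; each thread accumulates into its own tuple
// (reference(tid)) and value() folds them together. For example, summing a
// hypothetical std::vector<Real> `v`:
//
//   ReduceOps<ReduceOpSum> reduce_op;
//   ReduceData<Real> reduce_data(reduce_op);
//   using ReduceTuple = typename decltype(reduce_data)::Type;
//   reduce_op.eval(Long(v.size()), reduce_data,
//       [&] (Long i) -> ReduceTuple { return { v[i] }; });
//   Real total = amrex::get<0>(reduce_data.value(reduce_op));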
1213namespace Reduce {
1214
1215template <typename T, typename N, typename F,
1216 typename M=std::enable_if_t<std::is_integral_v<N>> >
1217T Sum (N n, F const& f, T init_val = 0)
1218{
1219 T r = init_val;
1220#ifdef AMREX_USE_OMP
1221#pragma omp parallel for reduction(+:r)
1222#endif
1223 for (N i = 0; i < n; ++i) {
1224 r += f(i);
1225 }
1226 return r;
1227}
1228
1229template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1230T Sum (N n, T const* v, T init_val = 0)
1231{
1232 return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
1233}
1234
1235template <typename T, typename N, typename F,
1236 typename M=std::enable_if_t<std::is_integral_v<N>> >
1237T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
1238{
1239 T r = init_val;
1240#ifdef AMREX_USE_OMP
1241#pragma omp parallel for reduction(min:r)
1242#endif
1243 for (N i = 0; i < n; ++i) {
1244 r = std::min(r,f(i));
1245 }
1246 return r;
1247}
1248
1249template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1250T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
1251{
1252 return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
1253}
1254
1255template <typename T, typename N, typename F,
1256 typename M=std::enable_if_t<std::is_integral_v<N>> >
1257T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
1258{
1259 T r = init_val;
1260#ifdef AMREX_USE_OMP
1261#pragma omp parallel for reduction(max:r)
1262#endif
1263 for (N i = 0; i < n; ++i) {
1264 r = std::max(r,f(i));
1265 }
1266 return r;
1267}
1268
1269template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1270T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
1271{
1272 return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
1273}
1274
1275template <typename T, typename N, typename F,
1276 typename M=std::enable_if_t<std::is_integral_v<N>> >
1277std::pair<T,T> MinMax (N n, F const& f)
1278{
1279 T r_min = std::numeric_limits<T>::max();
1280 T r_max = std::numeric_limits<T>::lowest();
1281#ifdef AMREX_USE_OMP
1282#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
1283#endif
1284 for (N i = 0; i < n; ++i) {
1285 T tmp = f(i);
1286 r_min = std::min(r_min,tmp);
1287 r_max = std::max(r_max,tmp);
1288 }
1289 return std::make_pair(r_min,r_max);
1290}
1291
1292template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1293std::pair<T,T> MinMax (N n, T const* v)
1294{
1295 return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
1296}
1297
1298template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1299bool AnyOf (N n, T const* v, P&& pred)
1300{
1301 return std::any_of(v, v+n, std::forward<P>(pred));
1302}
1303
1304template <typename P, int dim>
1305bool AnyOf (BoxND<dim> const& box, P const& pred)
1306{
1307 for (auto iv : box.iterator()) { // NOLINT(readability-use-anyofallof)
1308 if (Reduce::detail::call_f_intvect(pred, iv)) { return true; }
1309 }
1310 return false;
1311}
1312
1313}
1314
1315#endif
1316
1321template <typename... Ts, typename... Ps>
1323constexpr GpuTuple<Ts...>
1324IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
1325{
1326 GpuTuple<Ts...> r{};
1327 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1328 return r;
1329}
1330
1335template <typename... Ts, typename... Ps>
1337constexpr GpuTuple<Ts...>
1338IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
1339{
1340 GpuTuple<Ts...> r{};
1341 Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
1342 return r;
1343}
1344
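// Usage sketch for IdentityTuple: build a GpuTuple whose components are already
// set to the identity element of each reduce operation (0 for sum, the type's
// max() for min, lowest() for max), ready to be folded into.
//
//   GpuTuple<Real,Real> t0 =
//       IdentityTuple(GpuTuple<Real,Real>{}, ReduceOps<ReduceOpSum,ReduceOpMin>{});
//   // amrex::get<0>(t0) == 0
//   // amrex::get<1>(t0) == std::numeric_limits<Real>::max()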
1345namespace Reduce {
1346
1348template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1349T Sum (N n, T* v, T init_val = 0)
1350{
1351 return Sum(n, (T const*)v, init_val);
1352}
1353
1355template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1356T Min (N n, T* v, T init_val = std::numeric_limits<T>::max())
1357{
1358 return Min(n, (T const*)v, init_val);
1359}
1360
1362template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1363T Max (N n, T* v, T init_val = std::numeric_limits<T>::lowest())
1364{
1365 return Max(n, (T const*)v, init_val);
1366}
1367
1369template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
1370std::pair<T,T> MinMax (N n, T* v)
1371{
1372 return MinMax(n, (T const*)v);
1373}
1374
1376template <typename T, typename N, typename P, typename M=std::enable_if_t<std::is_integral_v<N>> >
1377bool AnyOf (N n, T* v, P const& pred)
1378{
1379 return AnyOf(n, (T const*)v, pred);
1380}
1381
1382}
1383
1384}
1385
1386#endif