Block-Structured AMR Software Framework

AMReX_Scan.H
1#ifndef AMREX_SCAN_H_
2#define AMREX_SCAN_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Extension.H>
6#include <AMReX_Gpu.H>
7#include <AMReX_Arena.H>
8
9#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
10# include <cub/cub.cuh>
11# ifdef AMREX_CUDA_CCCL_VER_GE_3
12# include <thrust/iterator/transform_iterator.h>
13# endif
14#elif defined(AMREX_USE_HIP)
15# include <rocprim/rocprim.hpp>
16#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
17# include <oneapi/dpl/execution>
18# include <oneapi/dpl/numeric>
19#endif
20
21#include <cstdint>
22#include <numeric>
23#include <type_traits>
24
25namespace amrex {
26namespace Scan {
27
28struct RetSum {
29 bool flag = true;
30 explicit operator bool() const noexcept { return flag; }
31};
32static constexpr RetSum retSum{true};
33static constexpr RetSum noRetSum{false};
34
35namespace Type {
36 static constexpr struct Inclusive {} inclusive{};
37 static constexpr struct Exclusive {} exclusive{};
38}
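// For reference: given the input {1, 2, 3, 4}, an inclusive scan produces
// {1, 3, 6, 10} and an exclusive scan produces {0, 1, 3, 6}. In both cases
// the functions below can return the total (10); passing noRetSum skips the
// device-to-host transfer of that total when it is not needed.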
39
40#if defined(AMREX_USE_GPU)
41
42namespace detail {
43
44template <typename T>
45struct STVA
46{
47 char status;
48 T value;
49};
50
51template <typename T, bool SINGLE_WORD> struct BlockStatus {};
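// Note on the two specializations below: the status byte is 'x' while a
// block has published nothing, 'a' once only its block-local aggregate is
// available, and 'p' once its full inclusive prefix is available. When
// STVA<T> fits in 64 bits, the single-word variant packs status and value
// together so both can be read or written with one atomic access.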
52
53template <typename T>
54struct BlockStatus<T, true>
55{
56 template<typename U>
57 union Data {
58 STVA<U> s;
59 uint64_t i;
60 void operator=(Data<U> const&) = delete;
61 void operator=(Data<U> &&) = delete;
62 };
63 Data<T> d;
64
66 void write (char a_status, T a_value) {
67#if defined(AMREX_USE_CUDA)
68 volatile uint64_t tmp;
69 reinterpret_cast<STVA<T> volatile&>(tmp).status = a_status;
70 reinterpret_cast<STVA<T> volatile&>(tmp).value = a_value;
71 reinterpret_cast<uint64_t&>(d.s) = tmp;
72#else
73 Data<T> tmp;
74 tmp.s = {a_status, a_value};
75 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
76 "HIP/SYCL: unsigned long long must be 64 bits");
77 Gpu::Atomic::Exch(reinterpret_cast<unsigned long long*>(&d),
78 reinterpret_cast<unsigned long long&>(tmp));
79#endif
80 }
81
83 T get_aggregate() const { return d.s.value; }
84
86 STVA<T> read () volatile {
87#if defined(AMREX_USE_CUDA)
88 volatile uint64_t tmp = reinterpret_cast<uint64_t volatile&>(d);
89 return {reinterpret_cast<STVA<T> volatile&>(tmp).status,
90 reinterpret_cast<STVA<T> volatile&>(tmp).value };
91#else
92 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
93 "HIP/SYCL: unsigned long long must be 64 bits");
94 unsigned long long tmp = Gpu::Atomic::Add
95 (reinterpret_cast<unsigned long long*>(const_cast<Data<T>*>(&d)), 0ull);
96 return (*reinterpret_cast<Data<T>*>(&tmp)).s;
97#endif
98 }
99
101 void set_status (char a_status) { d.s.status = a_status; }
102
104 STVA<T> wait () volatile {
105 STVA<T> r;
106 do {
107#if defined(AMREX_USE_SYCL)
108 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
109#else
110 __threadfence_block();
111#endif
112 r = read();
113 } while (r.status == 'x');
114 return r;
115 }
116};
117
118template <typename T>
119struct BlockStatus<T, false>
120{
121 T aggregate;
122 T inclusive;
123 char status;
124
126 void write (char a_status, T a_value) {
127 if (a_status == 'a') {
128 aggregate = a_value;
129 } else {
130 inclusive = a_value;
131 }
132#if defined(AMREX_USE_SYCL)
133 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
134#else
135 __threadfence();
136#endif
137 status = a_status;
138 }
139
141 T get_aggregate() const { return aggregate; }
142
144 STVA<T> read () volatile {
145#if defined(AMREX_USE_SYCL)
146 constexpr auto mo = sycl::memory_order::relaxed;
147 constexpr auto ms = sycl::memory_scope::device;
148 constexpr auto as = sycl::access::address_space::global_space;
149#endif
150 if (status == 'x') {
151 return {'x', 0};
152 } else if (status == 'a') {
153#if defined(AMREX_USE_SYCL)
154 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(aggregate)};
155 return {'a', ar.load()};
156#else
157 return {'a', aggregate};
158#endif
159 } else {
160#if defined(AMREX_USE_SYCL)
161 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(inclusive)};
162 return {'p', ar.load()};
163#else
164 return {'p', inclusive};
165#endif
166 }
167 }
168
170 void set_status (char a_status) { status = a_status; }
171
173 STVA<T> wait () volatile {
174 STVA<T> r;
175 do {
176 r = read();
177#if defined(AMREX_USE_SYCL)
178 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
179#else
180 __threadfence();
181#endif
182 } while (r.status == 'x');
183 return r;
184 }
185};
186
187}
188
189#if defined(AMREX_USE_SYCL)
190
191#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
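// Multi-pass scan used when more than one block is needed: the first kernel
// scans each block's elements and records the per-block sums, the second
// kernel scans those block sums with a single block, and the third kernel
// adds each block's offset to its partial results.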
192template <typename T, typename N, typename FIN, typename FOUT, typename TYPE>
193T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
194{
195 if (n <= 0) { return 0; }
196 constexpr int nwarps_per_block = 8;
197 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
198 constexpr int nchunks = 12;
199 constexpr int nelms_per_block = nthreads * nchunks;
200 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
201 std::numeric_limits<int>::max())*nelms_per_block);
202 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
203 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block);
204 auto stream = Gpu::gpuStream();
205
206 std::size_t nbytes_blockresult = Arena::align(sizeof(T)*n);
207 std::size_t nbytes_blocksum = Arena::align(sizeof(T)*nblocks);
208 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
209 auto dp = (char*)(The_Arena()->alloc(nbytes_blockresult
210 + nbytes_blocksum
211 + nbytes_totalsum));
212 T* blockresult_p = (T*)dp;
213 T* blocksum_p = (T*)(dp + nbytes_blockresult);
214 T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
215
216 amrex::launch<nthreads>(nblocks, sm, stream,
217 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
218 {
219 sycl::sub_group const& sg = gh.item->get_sub_group();
220 int lane = sg.get_local_id()[0];
221 int warp = sg.get_group_id()[0];
222 int nwarps = sg.get_group_range()[0];
223
224 int threadIdxx = gh.item->get_local_id(0);
225 int blockIdxx = gh.item->get_group_linear_id();
226 int blockDimx = gh.item->get_local_range(0);
227
228 T* shared = (T*)(gh.local);
229 T* shared2 = shared + Gpu::Device::warp_size;
230
231 // Each block processes [ibegin,iend).
232 N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
233 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
234
235 // Each block is responsible for nchunks chunks of data,
236 // where each chunk has blockDim.x elements, one for each
237 // thread in the block.
238 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
239 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
240 N offset = ibegin + ichunk*blockDimx;
241 if (offset >= iend) { break; }
242
243 offset += threadIdxx;
244 T x0 = (offset < iend) ? fin(offset) : 0;
245 T x = x0;
246 // Scan within a warp
247 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
248 T s = sycl::shift_group_right(sg, x, i);
249 if (lane >= i) { x += s; }
250 }
251
252 // x now holds the inclusive sum within the warp. The
253 // last thread in each warp holds the inclusive sum of
254 // this warp. We will store it in shared memory.
255 if (lane == Gpu::Device::warp_size - 1) {
256 shared[warp] = x;
257 }
258
259 gh.item->barrier(sycl::access::fence_space::local_space);
260
261 // The first warp then scans the warp sums for the
262 // whole block.
263 if (warp == 0) {
264 T y = (lane < nwarps) ? shared[lane] : 0;
265 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
266 T s = sycl::shift_group_right(sg, y, i);
267 if (lane >= i) { y += s; }
268 }
269
270 if (lane < nwarps) { shared2[lane] = y; }
271 }
272
273 gh.item->barrier(sycl::access::fence_space::local_space);
274
275 // shared2[0:nwarps) holds the inclusive sum of warp sums.
276
277 // Also note x still holds the inclusive sum within the
278 // warp. Given these two, we can compute the inclusive
279 // sum within this chunk.
280 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
281 T tmp_out = sum_prev_warp + sum_prev_chunk +
282 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
283 sum_prev_chunk += shared2[nwarps-1];
284
285 if (offset < iend) {
286 blockresult_p[offset] = tmp_out;
287 }
288 }
289
290 // sum_prev_chunk now holds the sum of the whole block.
291 if (threadIdxx == 0) {
292 blocksum_p[blockIdxx] = sum_prev_chunk;
293 }
294 });
295
296 amrex::launch<nthreads>(1, sm, stream,
297 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
298 {
299 sycl::sub_group const& sg = gh.item->get_sub_group();
300 int lane = sg.get_local_id()[0];
301 int warp = sg.get_group_id()[0];
302 int nwarps = sg.get_group_range()[0];
303
304 int threadIdxx = gh.item->get_local_id(0);
305 int blockDimx = gh.item->get_local_range(0);
306
307 T* shared = (T*)(gh.local);
308 T* shared2 = shared + Gpu::Device::warp_size;
309
310 T sum_prev_chunk = 0;
311 for (int offset = threadIdxx; offset - threadIdxx < nblocks; offset += blockDimx) {
312 T x = (offset < nblocks) ? blocksum_p[offset] : 0;
313 // Scan within a warp
314 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
315 T s = sycl::shift_group_right(sg, x, i);
316 if (lane >= i) { x += s; }
317 }
318
319 // x now holds the inclusive sum within the warp. The
320 // last thread in each warp holds the inclusive sum of
321 // this warp. We will store it in shared memory.
322 if (lane == Gpu::Device::warp_size - 1) {
323 shared[warp] = x;
324 }
325
326 gh.item->barrier(sycl::access::fence_space::local_space);
327
328 // The first warp then scans the warp sums for the
329 // whole block.
330 if (warp == 0) {
331 T y = (lane < nwarps) ? shared[lane] : 0;
332 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
333 T s = sycl::shift_group_right(sg, y, i);
334 if (lane >= i) { y += s; }
335 }
336
337 if (lane < nwarps) { shared2[lane] = y; }
338 }
339
340 gh.item->barrier(sycl::access::fence_space::local_space);
341
342 // shared2[0:nwarps) holds the inclusive sum of warp sums.
343
344 // Also note x still holds the inclusive sum within the
345 // warp. Given these two, we can compute the inclusive
346 // sum within this chunk.
347 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
348 T tmp_out = sum_prev_warp + sum_prev_chunk + x;
349 sum_prev_chunk += shared2[nwarps-1];
350
351 if (offset < nblocks) {
352 blocksum_p[offset] = tmp_out;
353 }
354 }
355
356 // sum_prev_chunk now holds the total sum.
357 if (threadIdxx == 0) {
358 *totalsum_p = sum_prev_chunk;
359 }
360 });
361
362 amrex::launch<nthreads>(nblocks, 0, stream,
363 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
364 {
365 int threadIdxx = gh.item->get_local_id(0);
366 int blockIdxx = gh.item->get_group_linear_id();
367 int blockDimx = gh.item->get_local_range(0);
368
369 // Each block processes [ibegin,iend).
370 N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
371 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
372 T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
373 for (N offset = ibegin + threadIdxx; offset < iend; offset += blockDimx) {
374 fout(offset, prev_sum + blockresult_p[offset]);
375 }
376 });
377
378 T totalsum = 0;
379 if (a_ret_sum) {
380 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
381
382 Gpu::streamSynchronize();
383 The_Arena()->free(dp);
384
385 AMREX_GPU_ERROR_CHECK();
386 } else {
387 Gpu::freeAsync(The_Arena(), dp);
388 }
389
390 return totalsum;
391}
392#endif
393
394template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
395 typename M=std::enable_if_t<std::is_integral_v<N> &&
396 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
397 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
398T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum)
399{
400 if (n <= 0) { return 0; }
401 constexpr int nwarps_per_block = 8;
402 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
403 constexpr int nchunks = 12;
404 constexpr int nelms_per_block = nthreads * nchunks;
405 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
406 std::numeric_limits<int>::max())*nelms_per_block);
407 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
408
409#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
410 if (nblocks > 1) {
411 return PrefixSum_mp<T>(n, std::forward<FIN>(fin), std::forward<FOUT>(fout), type, a_ret_sum);
412 }
413#endif
414
415 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
416 auto stream = Gpu::gpuStream();
417
418 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
419 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
420
421 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
422 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
423 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
424 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
425 + nbytes_blockid
426 + nbytes_totalsum));
427 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
428 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
429 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
430
431 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
432 BlockStatusT& block_status = block_status_p[i];
433 block_status.set_status('x');
434 if (i == 0) {
435 *virtual_block_id_p = 0;
436 *totalsum_p = 0;
437 }
438 });
439
440 amrex::launch<nthreads>(nblocks, sm, stream,
441 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
442 {
443 sycl::sub_group const& sg = gh.item->get_sub_group();
444 int lane = sg.get_local_id()[0];
445 int warp = sg.get_group_id()[0];
446 int nwarps = sg.get_group_range()[0];
447
448 int threadIdxx = gh.item->get_local_id(0);
449 int blockDimx = gh.item->get_local_range(0);
450 int gridDimx = gh.item->get_group_range(0);
451
452 T* shared = (T*)(gh.local);
453 T* shared2 = shared + Gpu::Device::warp_size;
454
455 // First of all, get block virtual id. We must do this to
456 // avoid deadlock because blocks may be launched in any order.
457 // Anywhere in this function, we should not use blockIdx.
458 int virtual_block_id = 0;
459 if (gridDimx > 1) {
460 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
461 if (threadIdxx == 0) {
462 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
463 virtual_block_id_shared = bid;
464 }
465 gh.item->barrier(sycl::access::fence_space::local_space);
466 virtual_block_id = virtual_block_id_shared;
467 }
468
469 // Each block processes [ibegin,iend).
470 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
471 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
472 BlockStatusT& block_status = block_status_p[virtual_block_id];
473
474 //
475 // The overall algorithm is based on "Single-pass Parallel
476 // Prefix Scan with Decoupled Look-back" by D. Merrill &
477 // M. Garland.
478 //
479
480 // Each block is responsible for nchunks chunks of data,
481 // where each chunk has blockDim.x elements, one for each
482 // thread in the block.
483 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
484 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
485 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
486 N offset = ibegin + ichunk*blockDimx;
487 if (offset >= iend) { break; }
488
489 offset += threadIdxx;
490 T x0 = (offset < iend) ? fin(offset) : 0;
491 if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
492 *totalsum_p += x0;
493 }
494 T x = x0;
495 // Scan within a warp
496 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
497 T s = sycl::shift_group_right(sg, x, i);
498 if (lane >= i) { x += s; }
499 }
500
501 // x now holds the inclusive sum within the warp. The
502 // last thread in each warp holds the inclusive sum of
503 // this warp. We will store it in shared memory.
504 if (lane == Gpu::Device::warp_size - 1) {
505 shared[warp] = x;
506 }
507
508 gh.item->barrier(sycl::access::fence_space::local_space);
509
510 // The first warp then scans the warp sums for the
511 // whole block.
512 if (warp == 0) {
513 T y = (lane < nwarps) ? shared[lane] : 0;
514 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
515 T s = sycl::shift_group_right(sg, y, i);
516 if (lane >= i) { y += s; }
517 }
518
519 if (lane < nwarps) { shared2[lane] = y; }
520 }
521
522 gh.item->barrier(sycl::access::fence_space::local_space);
523
524 // shared2[0:nwarps) holds the inclusive sum of warp sums.
525
526 // Also note x still holds the inclusive sum within the
527 // warp. Given these two, we can compute the inclusive
528 // sum within this chunk.
529 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
530 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
531 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
532 sum_prev_chunk += shared2[nwarps-1];
533 }
534
535 // sum_prev_chunk now holds the sum of the whole block.
536 if (threadIdxx == 0 && gridDimx > 1) {
537 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
538 sum_prev_chunk);
539 }
540
541 if (virtual_block_id == 0) {
542 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
543 N offset = ibegin + ichunk*blockDimx + threadIdxx;
544 if (offset >= iend) { break; }
545 fout(offset, tmp_out[ichunk]);
546 if (offset == n-1) {
547 *totalsum_p += tmp_out[ichunk];
548 }
549 }
550 } else if (virtual_block_id > 0) {
551
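// Decoupled look-back: the first warp walks backwards over the preceding
// blocks, one warp_size-wide window at a time. Blocks that have already
// published an inclusive prefix ('p') terminate the walk; blocks that only
// have an aggregate ('a') contribute it and the walk continues.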
552 if (warp == 0) {
553 T exclusive_prefix = 0;
554 BlockStatusT volatile* pbs = block_status_p;
555 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
556 {
557 int iblock = iblock0-lane;
558 detail::STVA<T> stva{'p', 0};
559 if (iblock >= 0) {
560 stva = pbs[iblock].wait();
561 }
562
563 T x = stva.value;
564
565 // implement our own __ballot
566 unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
567 for (int i = 1; i < Gpu::Device::warp_size; i *= 2) {
568 status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);
569 }
570
571 bool stop_lookback = status_bf & 0x1u;
572 if (stop_lookback == false) {
573 if (status_bf != 0) {
574 T y = x;
575 if (lane > 0) { x = 0; }
576 unsigned int bit_mask = 0x1u;
577 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
578 bit_mask <<= 1;
579 if (i == lane) { x = y; }
580 if (status_bf & bit_mask) {
581 stop_lookback = true;
582 break;
583 }
584 }
585 }
586
587 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
588 x += sycl::shift_group_left(sg, x,i);
589 }
590 }
591
592 if (lane == 0) { exclusive_prefix += x; }
593 if (stop_lookback) { break; }
594 }
595
596 if (lane == 0) {
597 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
598 shared[0] = exclusive_prefix;
599 }
600 }
601
602 gh.item->barrier(sycl::access::fence_space::local_space);
603
604 T exclusive_prefix = shared[0];
605
606 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
607 N offset = ibegin + ichunk*blockDimx + threadIdxx;
608 if (offset >= iend) { break; }
609 T t = tmp_out[ichunk] + exclusive_prefix;
610 fout(offset, t);
611 if (offset == n-1) {
612 *totalsum_p += t;
613 }
614 }
615 }
616 });
617
618 T totalsum = 0;
619 if (a_ret_sum) {
620 // xxxxx SYCL todo: Should test if using pinned memory and thus
621 // avoiding memcpy is faster.
622 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
623
624 Gpu::streamSynchronize();
625 The_Arena()->free(dp);
626
627 AMREX_GPU_ERROR_CHECK();
628 } else {
629 Gpu::freeAsync(The_Arena(), dp);
630 }
631
632 return totalsum;
633}
634
635#elif defined(AMREX_USE_HIP)
636
637template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
638 typename M=std::enable_if_t<std::is_integral_v<N> &&
639 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
640 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
641T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
642{
643 if (n <= 0) { return 0; }
644 constexpr int nwarps_per_block = 4;
645 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
646 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
647 constexpr int nelms_per_block = nthreads * nelms_per_thread;
648 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
649 std::numeric_limits<int>::max())*nelms_per_block);
650 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
651 std::size_t sm = 0;
652 auto stream = Gpu::gpuStream();
653
654 using ScanTileState = rocprim::detail::lookback_scan_state<T>;
655 using OrderedBlockId = rocprim::detail::ordered_block_id<unsigned int>;
656
657#if (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6)) || \
658 (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR == 6) && \
659 defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR == 0))
660
661 std::size_t nbytes_tile_state = rocprim::detail::align_size
662 (ScanTileState::get_storage_size(nblocks));
663 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
664
665 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
666
667 ScanTileState tile_state = ScanTileState::create(dp, nblocks);
668
669#else
670
671 std::size_t nbytes_tile_state;
672 AMREX_HIP_SAFE_CALL(ScanTileState::get_storage_size(nblocks, stream, nbytes_tile_state));
673 nbytes_tile_state = rocprim::detail::align_size(nbytes_tile_state);
674
675 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
676
677 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
678
679 ScanTileState tile_state;
680 AMREX_HIP_SAFE_CALL(ScanTileState::create(tile_state, dp, nblocks, stream));
681
682#endif
683
684 auto ordered_block_id = OrderedBlockId::create
685 (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
686
687 // Init ScanTileState on device
688 amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
689 {
690 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
691 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
692 const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
693 if (gid == 0) { scan_bid.reset(); }
694 scan_tile_state.initialize_prefix(gid, nblocks);
695 });
696
697 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
698
699 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
700 [=] AMREX_GPU_DEVICE () noexcept
701 {
702 using BlockLoad = rocprim::block_load<T, nthreads, nelms_per_thread,
703 rocprim::block_load_method::block_load_transpose>;
704 using BlockScan = rocprim::block_scan<T, nthreads,
705 rocprim::block_scan_algorithm::using_warp_scan>;
706 using BlockExchange = rocprim::block_exchange<T, nthreads, nelms_per_thread>;
707 using LookbackScanPrefixOp = rocprim::detail::lookback_scan_prefix_op
708 <T, rocprim::plus<T>, ScanTileState>;
709
710 __shared__ struct TempStorage {
711 typename OrderedBlockId::storage_type ordered_bid;
712 union {
713 typename BlockLoad::storage_type load;
714 typename BlockExchange::storage_type exchange;
715 typename BlockScan::storage_type scan;
716 };
717 } temp_storage;
718
719 // Lambda captured tile_state is const. We have to cast the const away.
720 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
721 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
722
723 auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);
724
725 // Each block processes [ibegin,iend).
726 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
727 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
728
729 auto input_begin = rocprim::make_transform_iterator(
730 rocprim::make_counting_iterator(N(0)),
731 [&] (N i) -> T { return fin(i+ibegin); });
732
733 T data[nelms_per_thread];
734 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
735 BlockLoad().load(input_begin, data, temp_storage.load);
736 } else {
737 // padding with 0
738 BlockLoad().load(input_begin, data, iend-ibegin, 0, temp_storage.load);
739 }
740
741 __syncthreads();
742
743 constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
744
745 if (virtual_block_id == 0) {
746 T block_agg;
747 AMREX_IF_CONSTEXPR(is_exclusive) {
748 BlockScan().exclusive_scan(data, data, T{0}, block_agg, temp_storage.scan);
749 } else {
750 BlockScan().inclusive_scan(data, data, block_agg, temp_storage.scan);
751 }
752 if (threadIdx.x == 0) {
753 if (nblocks > 1) {
754 scan_tile_state.set_complete(0, block_agg);
755 } else if (nblocks == 1 && totalsum_p) {
756 *totalsum_p = block_agg;
757 }
758 }
759 } else {
760 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
761
762 LookbackScanPrefixOp prefix_op(virtual_block_id, rocprim::plus<T>(), scan_tile_state);
763 AMREX_IF_CONSTEXPR(is_exclusive) {
764 BlockScan().exclusive_scan(data, data, temp_storage.scan, prefix_op,
765 rocprim::plus<T>());
766 } else {
767 BlockScan().inclusive_scan(data, data, temp_storage.scan, prefix_op,
768 rocprim::plus<T>());
769 }
770 if (totalsum_p) {
771 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
772 T tsum = data[nelms_per_thread-1];
773 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
774 *totalsum_p = tsum;
775 }
776 }
777 }
778
779 __syncthreads();
780
781 BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
782
783 for (int i = 0; i < nelms_per_thread; ++i) {
784 N offset = ibegin + i*nthreads + threadIdx.x;
785 if (offset < iend) { fout(offset, data[i]); }
786 }
787 });
788
789 if (totalsum_p) {
790 Gpu::streamSynchronize();
791 AMREX_GPU_ERROR_CHECK();
792
793 The_Arena()->free(dp);
794 } else {
795 Gpu::freeAsync(The_Arena(), dp);
796 }
797
798 T ret = (a_ret_sum) ? *totalsum_p : T(0);
799 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
800
801 return ret;
802}
803
804#elif defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
805
806template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
807 typename M=std::enable_if_t<std::is_integral_v<N> &&
808 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
809 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
810T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
811{
812 if (n <= 0) { return 0; }
813 constexpr int nwarps_per_block = 8;
814 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
815 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
816 constexpr int nelms_per_block = nthreads * nelms_per_thread;
817 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
818 std::numeric_limits<int>::max())*nelms_per_block);
819 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
820 std::size_t sm = 0;
821 auto stream = Gpu::gpuStream();
822
823 using ScanTileState = cub::ScanTileState<T>;
824 std::size_t tile_state_size = 0;
825 ScanTileState::AllocationSize(nblocks, tile_state_size);
826
827 std::size_t nbytes_tile_state = Arena::align(tile_state_size);
828 auto tile_state_p = (char*)(The_Arena()->alloc(nbytes_tile_state));
829
830 ScanTileState tile_state;
831 tile_state.Init(nblocks, tile_state_p, tile_state_size); // Init ScanTileState on host
832
833 if (nblocks > 1) {
834 // Init ScanTileState on device
835 amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
836 {
837 const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
838 });
839 }
840
841 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
842
843 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
844 [=] AMREX_GPU_DEVICE () noexcept
845 {
846 using BlockLoad = cub::BlockLoad<T, nthreads, nelms_per_thread, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
847 using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
848 using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;
849
850#ifdef AMREX_CUDA_CCCL_VER_GE_3
851 using Sum = cuda::std::plus<T>;
852#else
853 using Sum = cub::Sum;
854#endif
855 using TilePrefixCallbackOp = cub::TilePrefixCallbackOp<T, Sum, ScanTileState>;
856
857 __shared__ union TempStorage
858 {
859 typename BlockLoad::TempStorage load;
860 typename BlockExchange::TempStorage exchange;
861 struct ScanStorage {
862 typename BlockScan::TempStorage scan;
863 typename TilePrefixCallbackOp::TempStorage prefix;
864 } scan_storeage;
865 } temp_storage;
866
867 // Lambda captured tile_state is const. We have to cast the const away.
868 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
869
870 int virtual_block_id = blockIdx.x;
871
872 // Each block processes [ibegin,iend).
873 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
874 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
875
876 auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
877#ifdef AMREX_CUDA_CCCL_VER_GE_3
878 thrust::transform_iterator<decltype(input_lambda),thrust::counting_iterator<N> >
879 input_begin(thrust::counting_iterator<N>(0), input_lambda);
880#else
881 cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
882 input_begin(cub::CountingInputIterator<N>(0), input_lambda);
883#endif
884
885 T data[nelms_per_thread];
886 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
887 BlockLoad(temp_storage.load).Load(input_begin, data);
888 } else {
889 BlockLoad(temp_storage.load).Load(input_begin, data, iend-ibegin, 0); // padding with 0
890 }
891
892 __syncthreads();
893
894 constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
895
896 if (virtual_block_id == 0) {
897 T block_agg;
898 AMREX_IF_CONSTEXPR(is_exclusive) {
899 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, block_agg);
900 } else {
901 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, block_agg);
902 }
903 if (threadIdx.x == 0) {
904 if (nblocks > 1) {
905 scan_tile_state.SetInclusive(0, block_agg);
906 } else if (nblocks == 1 && totalsum_p) {
907 *totalsum_p = block_agg;
908 }
909 }
910 } else {
911 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
912
913 TilePrefixCallbackOp prefix_op(scan_tile_state, temp_storage.scan_storeage.prefix,
914 Sum{}, virtual_block_id);
915 AMREX_IF_CONSTEXPR(is_exclusive) {
916 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, prefix_op);
917 } else {
918 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
919 }
920 if (totalsum_p) {
921 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
922 T tsum = data[nelms_per_thread-1];
923 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
924 *totalsum_p = tsum;
925 }
926 }
927 }
928
929 __syncthreads();
930
931 BlockExchange(temp_storage.exchange).BlockedToStriped(data);
932
933 for (int i = 0; i < nelms_per_thread; ++i) {
934 N offset = ibegin + i*nthreads + threadIdx.x;
935 if (offset < iend) { fout(offset, data[i]); }
936 }
937 });
938
939 if (totalsum_p) {
940 Gpu::streamSynchronize();
941 AMREX_GPU_ERROR_CHECK();
942
943 The_Arena()->free(tile_state_p);
944 } else {
945 Gpu::freeAsync(The_Arena(), tile_state_p);
946 }
947
948 T ret = (a_ret_sum) ? *totalsum_p : T(0);
949 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
950
951 return ret;
952}
953
954#else
955
956template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
957 typename M=std::enable_if_t<std::is_integral_v<N> &&
958 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
959 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
960T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
961{
962 if (n <= 0) { return 0; }
963 constexpr int nwarps_per_block = 4;
964 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
965 constexpr int nchunks = 12;
966 constexpr int nelms_per_block = nthreads * nchunks;
967 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
968 std::numeric_limits<int>::max())*nelms_per_block);
969 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
970 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
971 auto stream = Gpu::gpuStream();
972
973 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
974 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
975
976 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
977 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
978 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
979 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
980 + nbytes_blockid
981 + nbytes_totalsum));
982 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
983 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
984 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
985
986 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
987 BlockStatusT& block_status = block_status_p[i];
988 block_status.set_status('x');
989 if (i == 0) {
990 *virtual_block_id_p = 0;
991 *totalsum_p = 0;
992 }
993 });
994
995 amrex::launch<nthreads>(nblocks, sm, stream,
996 [=] AMREX_GPU_DEVICE () noexcept
997 {
998 int lane = threadIdx.x % Gpu::Device::warp_size;
999 int warp = threadIdx.x / Gpu::Device::warp_size;
1000 int nwarps = nthreads / Gpu::Device::warp_size;
1001
1002 Gpu::SharedMemory<T> gsm;
1003 T* shared = gsm.dataPtr();
1004 T* shared2 = shared + Gpu::Device::warp_size;
1005
1006 // First of all, get block virtual id. We must do this to
1007 // avoid deadlock because CUDA may launch blocks in any order.
1008 // Anywhere in this function, we should not use blockIdx.
1009 int virtual_block_id = 0;
1010 if (gridDim.x > 1) {
1011 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
1012 if (threadIdx.x == 0) {
1013 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
1014 virtual_block_id_shared = bid;
1015 }
1016 __syncthreads();
1017 virtual_block_id = virtual_block_id_shared;
1018 }
1019
1020 // Each block processes [ibegin,iend).
1021 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
1022 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
1023 BlockStatusT& block_status = block_status_p[virtual_block_id];
1024
1025 //
1026 // The overall algorithm is based on "Single-pass Parallel
1027 // Prefix Scan with Decoupled Look-back" by D. Merrill &
1028 // M. Garland.
1029 //
1030
1031 // Each block is responsible for nchunks chunks of data,
1032 // where each chunk has blockDim.x elements, one for each
1033 // thread in the block.
1034 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
1035 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
1036 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1037 N offset = ibegin + ichunk*nthreads;
1038 if (offset >= iend) { break; }
1039
1040 offset += threadIdx.x;
1041 T x0 = (offset < iend) ? fin(offset) : 0;
1042 if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
1043 *totalsum_p += x0;
1044 }
1045 T x = x0;
1046 // Scan within a warp
1047 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
1048 AMREX_HIP_OR_CUDA( T s = __shfl_up(x,i);,
1049 T s = __shfl_up_sync(0xffffffff, x, i); )
1050 if (lane >= i) { x += s; }
1051 }
1052
1053 // x now holds the inclusive sum within the warp. The
1054 // last thread in each warp holds the inclusive sum of
1055 // this warp. We will store it in shared memory.
1056 if (lane == Gpu::Device::warp_size - 1) {
1057 shared[warp] = x;
1058 }
1059
1060 __syncthreads();
1061
1062 // The first warp then scans the warp sums for the
1063 // whole block. Not all threads in the warp need to
1064 // participate.
1065#ifdef AMREX_USE_CUDA
1066 if (warp == 0 && lane < nwarps) {
1067 T y = shared[lane];
1068 int mask = (1 << nwarps) - 1;
1069 for (int i = 1; i <= nwarps; i *= 2) {
1070 T s = __shfl_up_sync(mask, y, i, nwarps);
1071 if (lane >= i) { y += s; }
1072 }
1073 shared2[lane] = y;
1074 }
1075#else
1076 if (warp == 0) {
1077 T y = 0;
1078 if (lane < nwarps) {
1079 y = shared[lane];
1080 }
1081 for (int i = 1; i <= nwarps; i *= 2) {
1082 T s = __shfl_up(y, i, nwarps);
1083 if (lane >= i) { y += s; }
1084 }
1085 if (lane < nwarps) {
1086 shared2[lane] = y;
1087 }
1088 }
1089#endif
1090
1091 __syncthreads();
1092
1093 // shared2[0:nwarps) holds the inclusive sum of warp sums.
1094
1095 // Also note x still holds the inclusive sum within the
1096 // warp. Given these two, we can compute the inclusive
1097 // sum within this chunk.
1098 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
1099 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
1100 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
1101 sum_prev_chunk += shared2[nwarps-1];
1102 }
1103
1104 // sum_prev_chunk now holds the sum of the whole block.
1105 if (threadIdx.x == 0 && gridDim.x > 1) {
1106 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
1107 sum_prev_chunk);
1108 }
1109
1110 if (virtual_block_id == 0) {
1111 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1112 N offset = ibegin + ichunk*nthreads + threadIdx.x;
1113 if (offset >= iend) { break; }
1114 fout(offset, tmp_out[ichunk]);
1115 if (offset == n-1) {
1116 *totalsum_p += tmp_out[ichunk];
1117 }
1118 }
1119 } else if (virtual_block_id > 0) {
1120
1121 if (warp == 0) {
1122 T exclusive_prefix = 0;
1123 BlockStatusT volatile* pbs = block_status_p;
1124 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
1125 {
1126 int iblock = iblock0-lane;
1127 detail::STVA<T> stva{'p', 0};
1128 if (iblock >= 0) {
1129 stva = pbs[iblock].wait();
1130 }
1131
1132 T x = stva.value;
1133
1134 AMREX_HIP_OR_CUDA( uint64_t const status_bf = __ballot(stva.status == 'p');,
1135 unsigned const status_bf = __ballot_sync(0xffffffff, stva.status == 'p'));
1136 bool stop_lookback = status_bf & 0x1u;
1137 if (stop_lookback == false) {
1138 if (status_bf != 0) {
1139 T y = x;
1140 if (lane > 0) { x = 0; }
1141 AMREX_HIP_OR_CUDA(uint64_t bit_mask = 0x1ull;,
1142 unsigned bit_mask = 0x1u);
1143 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
1144 bit_mask <<= 1;
1145 if (i == lane) { x = y; }
1146 if (status_bf & bit_mask) {
1147 stop_lookback = true;
1148 break;
1149 }
1150 }
1151 }
1152
1153 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
1154 AMREX_HIP_OR_CUDA( x += __shfl_down(x,i);,
1155 x += __shfl_down_sync(0xffffffff, x, i); )
1156 }
1157 }
1158
1159 if (lane == 0) { exclusive_prefix += x; }
1160 if (stop_lookback) { break; }
1161 }
1162
1163 if (lane == 0) {
1164 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
1165 shared[0] = exclusive_prefix;
1166 }
1167 }
1168
1169 __syncthreads();
1170
1171 T exclusive_prefix = shared[0];
1172
1173 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1174 N offset = ibegin + ichunk*nthreads + threadIdx.x;
1175 if (offset >= iend) { break; }
1176 T t = tmp_out[ichunk] + exclusive_prefix;
1177 fout(offset, t);
1178 if (offset == n-1) {
1179 *totalsum_p += t;
1180 }
1181 }
1182 }
1183 });
1184
1185 T totalsum = 0;
1186 if (a_ret_sum) {
1187 // xxxxx CUDA < 11 todo: Should test if using pinned memory and thus
1188 // avoiding memcpy is faster.
1189 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
1190
1191 Gpu::streamSynchronize();
1192 The_Arena()->free(dp);
1193
1194 AMREX_GPU_ERROR_CHECK();
1195 } else {
1196 Gpu::freeAsync(The_Arena(), dp);
1197 }
1198
1199 return totalsum;
1200}
1201
1202#endif
1203
1204// The return value is the total sum if a_ret_sum is true.
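// A minimal usage sketch (illustrative only; d_in and d_out are assumed to be
// device-accessible arrays of length n):
//
//     auto total = amrex::Scan::InclusiveSum(n, d_in, d_out);
//     // skip the device-to-host copy of the total if it is not needed:
//     amrex::Scan::InclusiveSum(n, d_in, d_out, amrex::Scan::noRetSum);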
1205template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1206T InclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1207{
1208#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1209 void* d_temp = nullptr;
1210 std::size_t temp_bytes = 0;
1211 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1212 Gpu::gpuStream()));
1213 d_temp = The_Arena()->alloc(temp_bytes);
1214 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1215 Gpu::gpuStream()));
1216 T totalsum = 0;
1217 if (a_ret_sum) {
1218 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1219 }
1220 Gpu::streamSynchronize();
1221 The_Arena()->free(d_temp);
1222 AMREX_GPU_ERROR_CHECK();
1223 return totalsum;
1224#elif defined(AMREX_USE_HIP)
1225 void* d_temp = nullptr;
1226 std::size_t temp_bytes = 0;
1227 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1228 rocprim::plus<T>(), Gpu::gpuStream()));
1229 d_temp = The_Arena()->alloc(temp_bytes);
1230 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1231 rocprim::plus<T>(), Gpu::gpuStream()));
1232 T totalsum = 0;
1233 if (a_ret_sum) {
1234 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1235 }
1236 Gpu::streamSynchronize();
1237 The_Arena()->free(d_temp);
1238 AMREX_GPU_ERROR_CHECK();
1239 return totalsum;
1240#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1241 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1242 std::inclusive_scan(policy, in, in+n, out, std::plus<T>(), T(0));
1243 T totalsum = 0;
1244 if (a_ret_sum) {
1245 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1246 }
1247 Gpu::streamSynchronize();
1248 AMREX_GPU_ERROR_CHECK();
1249 return totalsum;
1250#else
1251 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1252 return PrefixSum<T>(static_cast<int>(n),
1253 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1254 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1255 Type::inclusive, a_ret_sum);
1256 } else {
1257 return PrefixSum<T>(n,
1258 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1259 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1260 Type::inclusive, a_ret_sum);
1261 }
1262#endif
1263}
1264
1265// The return value is the total sum if a_ret_sum is true.
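// A minimal usage sketch (illustrative only; d_cnt is assumed to be a device
// array of per-item counts and d_off the array of offsets to be filled):
//
//     auto ntot = amrex::Scan::ExclusiveSum(n, d_cnt, d_off);
//     // d_off[i] now holds the sum of d_cnt[0] ... d_cnt[i-1],
//     // and ntot holds the total of all counts.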
1266template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1267T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1268{
1269 if (n <= 0) { return 0; }
1270#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1271 T in_last = 0;
1272 if (a_ret_sum) {
1273 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1274 }
1275 void* d_temp = nullptr;
1276 std::size_t temp_bytes = 0;
1277 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1278 Gpu::gpuStream()));
1279 d_temp = The_Arena()->alloc(temp_bytes);
1280 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1281 Gpu::gpuStream()));
1282 T out_last = 0;
1283 if (a_ret_sum) {
1284 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1285 }
1286 Gpu::streamSynchronize();
1287 The_Arena()->free(d_temp);
1288 AMREX_GPU_ERROR_CHECK();
1289 return in_last+out_last;
1290#elif defined(AMREX_USE_HIP)
1291 T in_last = 0;
1292 if (a_ret_sum) {
1293 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1294 }
1295 void* d_temp = nullptr;
1296 std::size_t temp_bytes = 0;
1297 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1298 rocprim::plus<T>(), Gpu::gpuStream()));
1299 d_temp = The_Arena()->alloc(temp_bytes);
1300 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1301 rocprim::plus<T>(), Gpu::gpuStream()));
1302 T out_last = 0;
1303 if (a_ret_sum) {
1304 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1305 }
1306 Gpu::streamSynchronize();
1307 The_Arena()->free(d_temp);
1308 AMREX_GPU_ERROR_CHECK();
1309 return in_last+out_last;
1310#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1311 T in_last = 0;
1312 if (a_ret_sum) {
1313 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1314 }
1315 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1316 std::exclusive_scan(policy, in, in+n, out, T(0), std::plus<T>());
1317 T out_last = 0;
1318 if (a_ret_sum) {
1319 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1320 }
1321 Gpu::streamSynchronize();
1322 AMREX_GPU_ERROR_CHECK();
1323 return in_last+out_last;
1324#else
1325 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1326 return PrefixSum<T>(static_cast<int>(n),
1327 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1328 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1329 Type::exclusive, a_ret_sum);
1330 } else {
1331 return PrefixSum<T>(n,
1332 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1333 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1334 Type::exclusive, a_ret_sum);
1335 }
1336#endif
1337}
1338
1339#else
1340// !defined(AMREX_USE_GPU)
1341template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
1342 typename M=std::enable_if_t<std::is_integral_v<N> &&
1343 (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ||
1344 std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>)> >
1345T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum)
1346{
1347 if (n <= 0) { return 0; }
1348 T totalsum = 0;
1349 for (N i = 0; i < n; ++i) {
1350 T x = fin(i);
1351 T y = totalsum;
1352 totalsum += x;
1353 AMREX_IF_CONSTEXPR (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive>) {
1354 y += x;
1355 }
1356 fout(i, y);
1357 }
1358 return totalsum;
1359}
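// A minimal sketch of the lambda interface (illustrative only; d_x and d_pos
// are assumed to be device-accessible arrays of length n):
//
//     // Exclusive scan of an indicator: d_pos[i] becomes the number of
//     // positive entries before i, i.e. the output slot for a compaction.
//     auto ntot = amrex::Scan::PrefixSum<int>(n,
//         [=] AMREX_GPU_DEVICE (int i) -> int { return d_x[i] > 0; },
//         [=] AMREX_GPU_DEVICE (int i, int const& s) { d_pos[i] = s; },
//         amrex::Scan::Type::exclusive, amrex::Scan::retSum);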
1360
1361// The return value is the total sum.
1362template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1363T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1364{
1365#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1366 // GCC's __cplusplus is not a reliable indication for C++17 support
1367 std::inclusive_scan(in, in+n, out);
1368#else
1369 std::partial_sum(in, in+n, out);
1370#endif
1371 return (n > 0) ? out[n-1] : T(0);
1372}
1373
1374// The return value is the total sum.
1375template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1376T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1377{
1378 if (n <= 0) { return 0; }
1379
1380 auto in_last = in[n-1];
1381#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1382 // GCC's __cplusplus is not a reliable indication for C++17 support
1383 std::exclusive_scan(in, in+n, out, T(0));
1384#else
1385 out[0] = 0;
1386 std::partial_sum(in, in+n-1, out+1);
1387#endif
1388 return in_last + out[n-1];
1389}
1390
1391#endif
1392
1393}
1394
1395namespace Gpu
1396{
1397 template<class InIter, class OutIter>
1398 OutIter inclusive_scan (InIter begin, InIter end, OutIter result)
1399 {
1400#if defined(AMREX_USE_GPU)
1401 auto N = std::distance(begin, end);
1402 Scan::InclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1403 OutIter result_end = result;
1404 std::advance(result_end, N);
1405 return result_end;
1406#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1407 // GCC's __cplusplus is not a reliable indication for C++17 support
1408 return std::inclusive_scan(begin, end, result);
1409#else
1410 return std::partial_sum(begin, end, result);
1411#endif
1412 }
1413
1414 template<class InIter, class OutIter>
1415 OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
1416 {
1417#if defined(AMREX_USE_GPU)
1418 auto N = std::distance(begin, end);
1419 Scan::ExclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1420 OutIter result_end = result;
1421 std::advance(result_end, N);
1422 return result_end;
1423#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1424 // GCC's __cplusplus is not a reliable indication for C++17 support
1425 return std::exclusive_scan(begin, end, result, typename std::iterator_traits<InIter>::value_type{});
1426#else
1427 if (begin == end) { return result; }
1428
1429 typename std::iterator_traits<InIter>::value_type sum = *begin;
1430 *result++ = sum - *begin;
1431
1432 while (++begin != end) {
1433 sum = std::move(sum) + *begin;
1434 *result++ = sum - *begin;
1435 }
1436 return result;
1437#endif
1438 }
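// A minimal usage sketch of the iterator interface (illustrative only; v is
// assumed to be a filled amrex::Gpu::DeviceVector<int>):
//
//     amrex::Gpu::DeviceVector<int> w(v.size());
//     amrex::Gpu::inclusive_scan(v.begin(), v.end(), w.begin());
//     // exclusive_scan has the same signature but shifts the result by one
//     // element, starting from zero.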
1439
1440}}
1441
1442#endif