Block-Structured AMR Software Framework
 
AMReX_Scan.H
1#ifndef AMREX_SCAN_H_
2#define AMREX_SCAN_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Extension.H>
6#include <AMReX_Gpu.H>
7#include <AMReX_Arena.H>
8
9#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
10# include <cub/cub.cuh>
11# ifdef AMREX_CUDA_CCCL_VER_GE_3
12# include <thrust/iterator/transform_iterator.h>
13# endif
14#elif defined(AMREX_USE_HIP)
15# include <rocprim/rocprim.hpp>
16#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
17# include <oneapi/dpl/execution>
18# include <oneapi/dpl/numeric>
19#endif
20
21#include <cstdint>
22#include <numeric>
23#include <type_traits>
24
25namespace amrex {
26namespace Scan {
27
28struct RetSum {
29 bool flag = true;
30 explicit operator bool() const noexcept { return flag; }
31};
32static constexpr RetSum retSum{true};
33static constexpr RetSum noRetSum{false};
34
35namespace Type {
36 static constexpr struct Inclusive {} inclusive{};
37 static constexpr struct Exclusive {} exclusive{};
38}
39
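// The two tag objects select the scan flavor at compile time, and RetSum
// controls whether the total is copied back to the host. A minimal sketch
// (illustrative only; assumes device pointers d_in/d_out of length n):
//
//   // The total inclusive sum is returned because retSum (the default) is passed.
//   T total = amrex::Scan::InclusiveSum(n, d_in, d_out, amrex::Scan::retSum);
//
//   // Pass noRetSum when the total is not needed; the GPU implementations
//   // then skip the device-to-host copy and simply return zero.
//   amrex::Scan::ExclusiveSum(n, d_in, d_out, amrex::Scan::noRetSum);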
40#if defined(AMREX_USE_GPU)
41
43namespace detail {
44
45template <typename T>
46struct STVA
47{
48 char status;
49 T value;
50};
51
52template <typename T, bool SINGLE_WORD> struct BlockStatus {};
53
54template <typename T>
55struct BlockStatus<T, true>
56{
57 template<typename U>
58 union Data {
59 STVA<U> s;
60 uint64_t i;
61 void operator=(Data<U> const&) = delete;
62 void operator=(Data<U> &&) = delete;
63 };
64 Data<T> d;
65
67 void write (char a_status, T a_value) {
68#if defined(AMREX_USE_CUDA)
69 volatile uint64_t tmp;
70 reinterpret_cast<STVA<T> volatile&>(tmp).status = a_status;
71 reinterpret_cast<STVA<T> volatile&>(tmp).value = a_value;
72 reinterpret_cast<uint64_t&>(d.s) = tmp;
73#else
74 Data<T> tmp;
75 tmp.s = {a_status, a_value};
76 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
77 "HIP/SYCL: unsigned long long must be 64 bits");
78 Gpu::Atomic::Exch(reinterpret_cast<unsigned long long*>(&d),
79 reinterpret_cast<unsigned long long&>(tmp));
80#endif
81 }
82
84 T get_aggregate() const { return d.s.value; }
85
87 STVA<T> read () volatile {
88#if defined(AMREX_USE_CUDA)
89 volatile uint64_t tmp = reinterpret_cast<uint64_t volatile&>(d);
90 return {reinterpret_cast<STVA<T> volatile&>(tmp).status,
91 reinterpret_cast<STVA<T> volatile&>(tmp).value };
92#else
93 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
94 "HIP/SYCL: unsigned long long must be 64 bits");
95 unsigned long long tmp = Gpu::Atomic::Add
96 (reinterpret_cast<unsigned long long*>(const_cast<Data<T>*>(&d)), 0ull);
97 return (*reinterpret_cast<Data<T>*>(&tmp)).s;
98#endif
99 }
100
102 void set_status (char a_status) { d.s.status = a_status; }
103
105 STVA<T> wait () volatile {
106 STVA<T> r;
107 do {
108#if defined(AMREX_USE_SYCL)
109 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
110#else
111 __threadfence_block();
112#endif
113 r = read();
114 } while (r.status == 'x');
115 return r;
116 }
117};
118
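// Note on the specialization above: when STVA<T> fits in 64 bits, the status
// byte and the value are packed into a single word so that write() and read()
// can publish and observe both together (a volatile 64-bit access on CUDA, an
// atomic exchange/add on unsigned long long for HIP and SYCL). The wider
// specialization below stores them separately and relies on memory fences.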
119template <typename T>
120struct BlockStatus<T, false>
121{
122 T aggregate;
123 T inclusive;
124 char status;
125
127 void write (char a_status, T a_value) {
128 if (a_status == 'a') {
129 aggregate = a_value;
130 } else {
131 inclusive = a_value;
132 }
133#if defined(AMREX_USE_SYCL)
134 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
135#else
136 __threadfence();
137#endif
138 status = a_status;
139 }
140
142 T get_aggregate() const { return aggregate; }
143
145 STVA<T> read () volatile {
146#if defined(AMREX_USE_SYCL)
147 constexpr auto mo = sycl::memory_order::relaxed;
148 constexpr auto ms = sycl::memory_scope::device;
149 constexpr auto as = sycl::access::address_space::global_space;
150#endif
151 if (status == 'x') {
152 return {'x', 0};
153 } else if (status == 'a') {
154#if defined(AMREX_USE_SYCL)
155 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(aggregate)};
156 return {'a', ar.load()};
157#else
158 return {'a', aggregate};
159#endif
160 } else {
161#if defined(AMREX_USE_SYCL)
162 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(inclusive)};
163 return {'p', ar.load()};
164#else
165 return {'p', inclusive};
166#endif
167 }
168 }
169
171 void set_status (char a_status) { status = a_status; }
172
174 STVA<T> wait () volatile {
175 STVA<T> r;
176 do {
177 r = read();
178#if defined(AMREX_USE_SYCL)
179 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
180#else
181 __threadfence();
182#endif
183 } while (r.status == 'x');
184 return r;
185 }
186};
187
188}
190
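// The BlockStatus records above drive the decoupled look-back used by the scan
// kernels below: each block starts in state 'x' (no result yet), posts 'a'
// once its local aggregate is known, and finally 'p' once its inclusive prefix
// (aggregate plus all preceding blocks) is available. wait() spins, with a
// fence, until the state leaves 'x'.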
191#if defined(AMREX_USE_SYCL)
192
193#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
194template <typename T, typename N, typename FIN, typename FOUT, typename TYPE>
195T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
196{
197 if (n <= 0) { return 0; }
198 constexpr int nwarps_per_block = 8;
199 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
200 constexpr int nchunks = 12;
201 constexpr int nelms_per_block = nthreads * nchunks;
202 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
203 std::numeric_limits<int>::max())*nelms_per_block);
204 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
205 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block);
206 auto stream = Gpu::gpuStream();
207
208 std::size_t nbytes_blockresult = Arena::align(sizeof(T)*n);
209 std::size_t nbytes_blocksum = Arena::align(sizeof(T)*nblocks);
210 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
211 auto dp = (char*)(The_Arena()->alloc(nbytes_blockresult
212 + nbytes_blocksum
213 + nbytes_totalsum));
214 T* blockresult_p = (T*)dp;
215 T* blocksum_p = (T*)(dp + nbytes_blockresult);
216 T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
217
218 amrex::launch<nthreads>(nblocks, sm, stream,
219 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
220 {
221 sycl::sub_group const& sg = gh.item->get_sub_group();
222 int lane = sg.get_local_id()[0];
223 int warp = sg.get_group_id()[0];
224 int nwarps = sg.get_group_range()[0];
225
226 int threadIdxx = gh.item->get_local_id(0);
227 int blockIdxx = gh.item->get_group_linear_id();
228 int blockDimx = gh.item->get_local_range(0);
229
230 T* shared = (T*)(gh.local);
231 T* shared2 = shared + Gpu::Device::warp_size;
232
233 // Each block processes [ibegin,iend).
234 N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
235 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
236
237 // Each block is responsible for nchunks chunks of data,
238 // where each chunk has blockDim.x elements, one for each
239 // thread in the block.
240 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
241 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
242 N offset = ibegin + ichunk*blockDimx;
243 if (offset >= iend) { break; }
244
245 offset += threadIdxx;
246 T x0 = (offset < iend) ? fin(offset) : 0;
247 T x = x0;
248 // Scan within a warp
249 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
250 T s = sycl::shift_group_right(sg, x, i);
251 if (lane >= i) { x += s; }
252 }
253
254 // x now holds the inclusive sum within the warp. The
255 // last thread in each warp holds the inclusive sum of
256 // this warp. We will store it in shared memory.
257 if (lane == Gpu::Device::warp_size - 1) {
258 shared[warp] = x;
259 }
260
261 gh.item->barrier(sycl::access::fence_space::local_space);
262
263 // The first warp will do a scan of the warp sums for the
264 // whole block.
265 if (warp == 0) {
266 T y = (lane < nwarps) ? shared[lane] : 0;
267 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
268 T s = sycl::shift_group_right(sg, y, i);
269 if (lane >= i) { y += s; }
270 }
271
272 if (lane < nwarps) { shared2[lane] = y; }
273 }
274
275 gh.item->barrier(sycl::access::fence_space::local_space);
276
277 // shared2[0:nwarps) holds the inclusive sum of warp sums.
278
279 // Also note x still holds the inclusive sum within the
280 // warp. Given these two, we can compute the inclusive
281 // sum within this chunk.
282 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
283 T tmp_out = sum_prev_warp + sum_prev_chunk +
284 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
285 sum_prev_chunk += shared2[nwarps-1];
286
287 if (offset < iend) {
288 blockresult_p[offset] = tmp_out;
289 }
290 }
291
292 // sum_prev_chunk now holds the sum of the whole block.
293 if (threadIdxx == 0) {
294 blocksum_p[blockIdxx] = sum_prev_chunk;
295 }
296 });
297
298 amrex::launch<nthreads>(1, sm, stream,
299 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
300 {
301 sycl::sub_group const& sg = gh.item->get_sub_group();
302 int lane = sg.get_local_id()[0];
303 int warp = sg.get_group_id()[0];
304 int nwarps = sg.get_group_range()[0];
305
306 int threadIdxx = gh.item->get_local_id(0);
307 int blockDimx = gh.item->get_local_range(0);
308
309 T* shared = (T*)(gh.local);
310 T* shared2 = shared + Gpu::Device::warp_size;
311
312 T sum_prev_chunk = 0;
313 for (int offset = threadIdxx; offset - threadIdxx < nblocks; offset += blockDimx) {
314 T x = (offset < nblocks) ? blocksum_p[offset] : 0;
315 // Scan within a warp
316 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
317 T s = sycl::shift_group_right(sg, x, i);
318 if (lane >= i) { x += s; }
319 }
320
321 // x now holds the inclusive sum within the warp. The
322 // last thread in each warp holds the inclusive sum of
323 // this warp. We will store it in shared memory.
324 if (lane == Gpu::Device::warp_size - 1) {
325 shared[warp] = x;
326 }
327
328 gh.item->barrier(sycl::access::fence_space::local_space);
329
330 // The first warp will do a scan of the warp sums for the
331 // whole block.
332 if (warp == 0) {
333 T y = (lane < nwarps) ? shared[lane] : 0;
334 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
335 T s = sycl::shift_group_right(sg, y, i);
336 if (lane >= i) { y += s; }
337 }
338
339 if (lane < nwarps) { shared2[lane] = y; }
340 }
341
342 gh.item->barrier(sycl::access::fence_space::local_space);
343
344 // shared2[0:nwarps) holds the inclusive sum of warp sums.
345
346 // Also note x still holds the inclusive sum within the
347 // warp. Given these two, we can compute the inclusive
348 // sum within this chunk.
349 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
350 T tmp_out = sum_prev_warp + sum_prev_chunk + x;
351 sum_prev_chunk += shared2[nwarps-1];
352
353 if (offset < nblocks) {
354 blocksum_p[offset] = tmp_out;
355 }
356 }
357
358 // sum_prev_chunk now holds the total sum.
359 if (threadIdxx == 0) {
360 *totalsum_p = sum_prev_chunk;
361 }
362 });
363
364 amrex::launch<nthreads>(nblocks, 0, stream,
365 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
366 {
367 int threadIdxx = gh.item->get_local_id(0);
368 int blockIdxx = gh.item->get_group_linear_id();
369 int blockDimx = gh.item->get_local_range(0);
370
371 // Each block processes [ibegin,iend).
372 N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
373 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
374 T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
375 for (N offset = ibegin + threadIdxx; offset < iend; offset += blockDimx) {
376 fout(offset, prev_sum + blockresult_p[offset]);
377 }
378 });
379
380 T totalsum = 0;
381 if (a_ret_sum) {
382 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
383
384 Gpu::streamSynchronize();
385 The_Arena()->free(dp);
386
388 } else {
389 Gpu::freeAsync(The_Arena(), dp);
390 }
391
392 return totalsum;
393}
394#endif
395
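// PrefixSum_mp above is a three-kernel fallback: kernel 1 scans each block's
// elements and records one sum per block, kernel 2 scans those block sums in a
// single block (also producing the grand total), and kernel 3 adds the
// preceding block's sum to every element and writes the result through fout.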
396template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
397 typename M=std::enable_if_t<std::is_integral_v<N> &&
398 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
399 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
400T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum)
401{
402 if (n <= 0) { return 0; }
403 constexpr int nwarps_per_block = 8;
404 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
405 constexpr int nchunks = 12;
406 constexpr int nelms_per_block = nthreads * nchunks;
407 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
408 std::numeric_limits<int>::max())*nelms_per_block);
409 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
410
411#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
412 if (nblocks > 1) {
413 return PrefixSum_mp<T>(n, std::forward<FIN>(fin), std::forward<FOUT>(fout), type, a_ret_sum);
414 }
415#endif
416
417 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
418 auto stream = Gpu::gpuStream();
419
420 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
421 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
422
423 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
424 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
425 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
426 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
427 + nbytes_blockid
428 + nbytes_totalsum));
429 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
430 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
431 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
432
433 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
434 BlockStatusT& block_status = block_status_p[i];
435 block_status.set_status('x');
436 if (i == 0) {
437 *virtual_block_id_p = 0;
438 *totalsum_p = 0;
439 }
440 });
441
442 amrex::launch<nthreads>(nblocks, sm, stream,
443 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
444 {
445 sycl::sub_group const& sg = gh.item->get_sub_group();
446 int lane = sg.get_local_id()[0];
447 int warp = sg.get_group_id()[0];
448 int nwarps = sg.get_group_range()[0];
449
450 int threadIdxx = gh.item->get_local_id(0);
451 int blockDimx = gh.item->get_local_range(0);
452 int gridDimx = gh.item->get_group_range(0);
453
454 T* shared = (T*)(gh.local);
455 T* shared2 = shared + Gpu::Device::warp_size;
456
457 // First of all, get block virtual id. We must do this to
458 // avoid deadlock because blocks may be launched in any order.
459 // Anywhere in this function, we should not use blockIdx.
460 int virtual_block_id = 0;
461 if (gridDimx > 1) {
462 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
463 if (threadIdxx == 0) {
464 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
465 virtual_block_id_shared = bid;
466 }
467 gh.item->barrier(sycl::access::fence_space::local_space);
468 virtual_block_id = virtual_block_id_shared;
469 }
470
471 // Each block processes [ibegin,iend).
472 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
473 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
474 BlockStatusT& block_status = block_status_p[virtual_block_id];
475
476 //
477 // The overall algorithm is based on "Single-pass Parallel
478 // Prefix Scan with Decoupled Look-back" by D. Merrill &
479 // M. Garland.
480 //
481
482 // Each block is responsible for nchunks chunks of data,
483 // where each chunk has blockDim.x elements, one for each
484 // thread in the block.
485 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
486 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
487 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
488 N offset = ibegin + ichunk*blockDimx;
489 if (offset >= iend) { break; }
490
491 offset += threadIdxx;
492 T x0 = (offset < iend) ? fin(offset) : 0;
493 if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
494 *totalsum_p += x0;
495 }
496 T x = x0;
497 // Scan within a warp
498 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
499 T s = sycl::shift_group_right(sg, x, i);
500 if (lane >= i) { x += s; }
501 }
502
503 // x now holds the inclusive sum within the warp. The
504 // last thread in each warp holds the inclusive sum of
505 // this warp. We will store it in shared memory.
506 if (lane == Gpu::Device::warp_size - 1) {
507 shared[warp] = x;
508 }
509
510 gh.item->barrier(sycl::access::fence_space::local_space);
511
512 // The first warp will do a scan of the warp sums for the
513 // whole block.
514 if (warp == 0) {
515 T y = (lane < nwarps) ? shared[lane] : 0;
516 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
517 T s = sycl::shift_group_right(sg, y, i);
518 if (lane >= i) { y += s; }
519 }
520
521 if (lane < nwarps) { shared2[lane] = y; }
522 }
523
524 gh.item->barrier(sycl::access::fence_space::local_space);
525
526 // shared2[0:nwarps) holds the inclusive sum of warp sums.
527
528 // Also note x still holds the inclusive sum within the
529 // warp. Given these two, we can compute the inclusive
530 // sum within this chunk.
531 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
532 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
533 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
534 sum_prev_chunk += shared2[nwarps-1];
535 }
536
537 // sum_prev_chunk now holds the sum of the whole block.
538 if (threadIdxx == 0 && gridDimx > 1) {
539 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
540 sum_prev_chunk);
541 }
542
543 if (virtual_block_id == 0) {
544 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
545 N offset = ibegin + ichunk*blockDimx + threadIdxx;
546 if (offset >= iend) { break; }
547 fout(offset, tmp_out[ichunk]);
548 if (offset == n-1) {
549 *totalsum_p += tmp_out[ichunk];
550 }
551 }
552 } else if (virtual_block_id > 0) {
553
554 if (warp == 0) {
555 T exclusive_prefix = 0;
556 BlockStatusT volatile* pbs = block_status_p;
557 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
558 {
559 int iblock = iblock0-lane;
560 detail::STVA<T> stva{'p', 0};
561 if (iblock >= 0) {
562 stva = pbs[iblock].wait();
563 }
564
565 T x = stva.value;
566
567 // implement our own __ballot
568 unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
569 for (int i = 1; i < Gpu::Device::warp_size; i *= 2) {
570 status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);
571 }
572
573 bool stop_lookback = status_bf & 0x1u;
574 if (stop_lookback == false) {
575 if (status_bf != 0) {
576 T y = x;
577 if (lane > 0) { x = 0; }
578 unsigned int bit_mask = 0x1u;
579 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
580 bit_mask <<= 1;
581 if (i == lane) { x = y; }
582 if (status_bf & bit_mask) {
583 stop_lookback = true;
584 break;
585 }
586 }
587 }
588
589 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
590 x += sycl::shift_group_left(sg, x,i);
591 }
592 }
593
594 if (lane == 0) { exclusive_prefix += x; }
595 if (stop_lookback) { break; }
596 }
597
598 if (lane == 0) {
599 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
600 shared[0] = exclusive_prefix;
601 }
602 }
603
604 gh.item->barrier(sycl::access::fence_space::local_space);
605
606 T exclusive_prefix = shared[0];
607
608 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
609 N offset = ibegin + ichunk*blockDimx + threadIdxx;
610 if (offset >= iend) { break; }
611 T t = tmp_out[ichunk] + exclusive_prefix;
612 fout(offset, t);
613 if (offset == n-1) {
614 *totalsum_p += t;
615 }
616 }
617 }
618 });
619
620 T totalsum = 0;
621 if (a_ret_sum) {
622 // xxxxx SYCL todo: Should test if using pinned memory and thus
623 // avoiding memcpy is faster.
624 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
625
626 Gpu::streamSynchronize();
627 The_Arena()->free(dp);
628
630 } else {
631 Gpu::freeAsync(The_Arena(), dp);
632 }
633
634 return totalsum;
635}
636
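// Note: unless AMREX_SYCL_NO_MULTIPASS_SCAN is defined, the function above
// forwards any multi-block problem to PrefixSum_mp, so its decoupled
// look-back path only ever runs with a single block.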
637#elif defined(AMREX_USE_HIP)
638
639template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
640 typename M=std::enable_if_t<std::is_integral_v<N> &&
641 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
642 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
643T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
644{
645 if (n <= 0) { return 0; }
646 constexpr int nwarps_per_block = 4;
647 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
648 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
649 constexpr int nelms_per_block = nthreads * nelms_per_thread;
650 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
651 std::numeric_limits<int>::max())*nelms_per_block);
652 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
653 std::size_t sm = 0;
654 auto stream = Gpu::gpuStream();
655
656 using ScanTileState = rocprim::detail::lookback_scan_state<T>;
657 using OrderedBlockId = rocprim::detail::ordered_block_id<unsigned int>;
658
659#if (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6)) || \
660 (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR == 6) && \
661 defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR == 0))
662
663 std::size_t nbytes_tile_state = rocprim::detail::align_size
664 (ScanTileState::get_storage_size(nblocks));
665 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
666
667 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
668
669 ScanTileState tile_state = ScanTileState::create(dp, nblocks);
670
671#else
672
673 std::size_t nbytes_tile_state;
674 AMREX_HIP_SAFE_CALL(ScanTileState::get_storage_size(nblocks, stream, nbytes_tile_state));
675 nbytes_tile_state = rocprim::detail::align_size(nbytes_tile_state);
676
677 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
678
679 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
680
681 ScanTileState tile_state;
682 AMREX_HIP_SAFE_CALL(ScanTileState::create(tile_state, dp, nblocks, stream));
683
684#endif
685
686 auto ordered_block_id = OrderedBlockId::create
687 (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
688
689 // Init ScanTileState on device
690 amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
691 {
692 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
693 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
694 const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
695 if (gid == 0) { scan_bid.reset(); }
696 scan_tile_state.initialize_prefix(gid, nblocks);
697 });
698
699 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
700
701 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
702 [=] AMREX_GPU_DEVICE () noexcept
703 {
704 using BlockLoad = rocprim::block_load<T, nthreads, nelms_per_thread,
705 rocprim::block_load_method::block_load_transpose>;
706 using BlockScan = rocprim::block_scan<T, nthreads,
707 rocprim::block_scan_algorithm::using_warp_scan>;
708 using BlockExchange = rocprim::block_exchange<T, nthreads, nelms_per_thread>;
709 using LookbackScanPrefixOp = rocprim::detail::lookback_scan_prefix_op
710 <T, rocprim::plus<T>, ScanTileState>;
711
712 __shared__ struct TempStorage {
713 typename OrderedBlockId::storage_type ordered_bid;
714 union {
715 typename BlockLoad::storage_type load;
716 typename BlockExchange::storage_type exchange;
717 typename BlockScan::storage_type scan;
718 };
719 } temp_storage;
720
721 // The lambda-captured tile_state is const. We have to cast the const away.
722 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
723 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
724
725 auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);
726
727 // Each block processes [ibegin,iend).
728 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
729 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
730
731 auto input_begin = rocprim::make_transform_iterator(
732 rocprim::make_counting_iterator(N(0)),
733 [&] (N i) -> T { return fin(i+ibegin); });
734
735 T data[nelms_per_thread];
736 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
737 BlockLoad().load(input_begin, data, temp_storage.load);
738 } else {
739 // padding with 0
740 BlockLoad().load(input_begin, data, iend-ibegin, 0, temp_storage.load);
741 }
742
743 __syncthreads();
744
745 constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
746
747 if (virtual_block_id == 0) {
748 T block_agg;
749 AMREX_IF_CONSTEXPR(is_exclusive) {
750 BlockScan().exclusive_scan(data, data, T{0}, block_agg, temp_storage.scan);
751 } else {
752 BlockScan().inclusive_scan(data, data, block_agg, temp_storage.scan);
753 }
754 if (threadIdx.x == 0) {
755 if (nblocks > 1) {
756 scan_tile_state.set_complete(0, block_agg);
757 } else if (nblocks == 1 && totalsum_p) {
758 *totalsum_p = block_agg;
759 }
760 }
761 } else {
762 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
763
764 LookbackScanPrefixOp prefix_op(virtual_block_id, rocprim::plus<T>(), scan_tile_state);
765 AMREX_IF_CONSTEXPR(is_exclusive) {
766 BlockScan().exclusive_scan(data, data, temp_storage.scan, prefix_op,
767 rocprim::plus<T>());
768 } else {
769 BlockScan().inclusive_scan(data, data, temp_storage.scan, prefix_op,
770 rocprim::plus<T>());
771 }
772 if (totalsum_p) {
773 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
774 T tsum = data[nelms_per_thread-1];
775 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
776 *totalsum_p = tsum;
777 }
778 }
779 }
780
781 __syncthreads();
782
783 BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
784
785 for (int i = 0; i < nelms_per_thread; ++i) {
786 N offset = ibegin + i*nthreads + threadIdx.x;
787 if (offset < iend) { fout(offset, data[i]); }
788 }
789 });
790
791 if (totalsum_p) {
792 Gpu::streamSynchronize();
794
795 The_Arena()->free(dp);
796 } else {
797 Gpu::freeAsync(The_Arena(), dp);
798 }
799
800 T ret = (a_ret_sum) ? *totalsum_p : T(0);
801 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
802
803 return ret;
804}
805
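// The HIP path above assembles the same single-pass decoupled look-back scan
// from rocprim building blocks (block_load/block_scan/block_exchange,
// lookback_scan_state, lookback_scan_prefix_op and ordered_block_id) instead
// of the hand-written version used in the generic branch further below.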
806#elif defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
807
808template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
809 typename M=std::enable_if_t<std::is_integral_v<N> &&
810 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
811 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
812T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
813{
814 if (n <= 0) { return 0; }
815 constexpr int nwarps_per_block = 8;
816 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
817 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
818 constexpr int nelms_per_block = nthreads * nelms_per_thread;
819 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
820 std::numeric_limits<int>::max())*nelms_per_block);
821 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
822 std::size_t sm = 0;
823 auto stream = Gpu::gpuStream();
824
825 using ScanTileState = cub::ScanTileState<T>;
826 std::size_t tile_state_size = 0;
827 ScanTileState::AllocationSize(nblocks, tile_state_size);
828
829 std::size_t nbytes_tile_state = Arena::align(tile_state_size);
830 auto tile_state_p = (char*)(The_Arena()->alloc(nbytes_tile_state));
831
832 ScanTileState tile_state;
833 tile_state.Init(nblocks, tile_state_p, tile_state_size); // Init ScanTileState on host
834
835 if (nblocks > 1) {
836 // Init ScanTileState on device
837 amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
838 {
839 const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
840 });
841 }
842
843 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
844
845 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
846 [=] AMREX_GPU_DEVICE () noexcept
847 {
848 using BlockLoad = cub::BlockLoad<T, nthreads, nelms_per_thread, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
849 using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
850 using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;
851
852#ifdef AMREX_CUDA_CCCL_VER_GE_3
853 using Sum = cuda::std::plus<T>;
854#else
855 using Sum = cub::Sum;
856#endif
857 using TilePrefixCallbackOp = cub::TilePrefixCallbackOp<T, Sum, ScanTileState>;
858
859 __shared__ union TempStorage
860 {
861 typename BlockLoad::TempStorage load;
862 typename BlockExchange::TempStorage exchange;
863 struct ScanStorage {
864 typename BlockScan::TempStorage scan;
865 typename TilePrefixCallbackOp::TempStorage prefix;
866 } scan_storeage;
867 } temp_storage;
868
869 // The lambda-captured tile_state is const. We have to cast the const away.
870 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
871
872 int virtual_block_id = blockIdx.x;
873
874 // Each block processes [ibegin,iend).
875 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
876 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
877
878 auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
879#ifdef AMREX_CUDA_CCCL_VER_GE_3
880 thrust::transform_iterator<decltype(input_lambda),thrust::counting_iterator<N> >
881 input_begin(thrust::counting_iterator<N>(0), input_lambda);
882#else
883 cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
884 input_begin(cub::CountingInputIterator<N>(0), input_lambda);
885#endif
886
887 T data[nelms_per_thread];
888 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
889 BlockLoad(temp_storage.load).Load(input_begin, data);
890 } else {
891 BlockLoad(temp_storage.load).Load(input_begin, data, iend-ibegin, 0); // padding with 0
892 }
893
894 __syncthreads();
895
896 constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
897
898 if (virtual_block_id == 0) {
899 T block_agg;
900 AMREX_IF_CONSTEXPR(is_exclusive) {
901 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, block_agg);
902 } else {
903 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, block_agg);
904 }
905 if (threadIdx.x == 0) {
906 if (nblocks > 1) {
907 scan_tile_state.SetInclusive(0, block_agg);
908 } else if (nblocks == 1 && totalsum_p) {
909 *totalsum_p = block_agg;
910 }
911 }
912 } else {
913 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
914
915 TilePrefixCallbackOp prefix_op(scan_tile_state, temp_storage.scan_storeage.prefix,
916 Sum{}, virtual_block_id);
917 AMREX_IF_CONSTEXPR(is_exclusive) {
918 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, prefix_op);
919 } else {
920 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
921 }
922 if (totalsum_p) {
923 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
924 T tsum = data[nelms_per_thread-1];
925 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
926 *totalsum_p = tsum;
927 }
928 }
929 }
930
931 __syncthreads();
932
933 BlockExchange(temp_storage.exchange).BlockedToStriped(data);
934
935 for (int i = 0; i < nelms_per_thread; ++i) {
936 N offset = ibegin + i*nthreads + threadIdx.x;
937 if (offset < iend) { fout(offset, data[i]); }
938 }
939 });
940
941 if (totalsum_p) {
942 Gpu::streamSynchronize();
944
945 The_Arena()->free(tile_state_p);
946 } else {
947 Gpu::freeAsync(The_Arena(), tile_state_p);
948 }
949
950 T ret = (a_ret_sum) ? *totalsum_p : T(0);
951 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
952
953 return ret;
954}
955
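// The CUDA path above is the cub counterpart of the HIP path: BlockLoad,
// BlockScan and BlockExchange do the per-block work, while ScanTileState and
// TilePrefixCallbackOp provide the decoupled look-back. When
// AMREX_CUDA_CCCL_VER_GE_3 is defined, the input iterator switches from
// cub::TransformInputIterator/cub::CountingInputIterator to the thrust
// equivalents.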
956#else
957
959template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
960 typename M=std::enable_if_t<std::is_integral_v<N> &&
961 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
962 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
963T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
964{
965 if (n <= 0) { return 0; }
966 constexpr int nwarps_per_block = 4;
967 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
968 constexpr int nchunks = 12;
969 constexpr int nelms_per_block = nthreads * nchunks;
970 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
971 std::numeric_limits<int>::max())*nelms_per_block);
972 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
973 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
974 auto stream = Gpu::gpuStream();
975
976 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
977 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
978
979 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
980 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
981 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
982 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
983 + nbytes_blockid
984 + nbytes_totalsum));
985 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
986 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
987 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
988
989 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
990 BlockStatusT& block_status = block_status_p[i];
991 block_status.set_status('x');
992 if (i == 0) {
993 *virtual_block_id_p = 0;
994 *totalsum_p = 0;
995 }
996 });
997
998 amrex::launch<nthreads>(nblocks, sm, stream,
999 [=] AMREX_GPU_DEVICE () noexcept
1000 {
1001 int lane = threadIdx.x % Gpu::Device::warp_size;
1002 int warp = threadIdx.x / Gpu::Device::warp_size;
1003 int nwarps = nthreads / Gpu::Device::warp_size;
1004
1005 Gpu::SharedMemory<T> gsm;
1006 T* shared = gsm.dataPtr();
1007 T* shared2 = shared + Gpu::Device::warp_size;
1008
1009 // First of all, get block virtual id. We must do this to
1010 // avoid deadlock because CUDA may launch blocks in any order.
1011 // Anywhere in this function, we should not use blockIdx.
1012 int virtual_block_id = 0;
1013 if (gridDim.x > 1) {
1014 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
1015 if (threadIdx.x == 0) {
1016 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
1017 virtual_block_id_shared = bid;
1018 }
1019 __syncthreads();
1020 virtual_block_id = virtual_block_id_shared;
1021 }
1022
1023 // Each block processes [ibegin,iend).
1024 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
1025 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
1026 BlockStatusT& block_status = block_status_p[virtual_block_id];
1027
1028 //
1029 // The overall algorithm is based on "Single-pass Parallel
1030 // Prefix Scan with Decoupled Look-back" by D. Merrill &
1031 // M. Garland.
1032 //
1033
1034 // Each block is responsible for nchunks chunks of data,
1035 // where each chunk has blockDim.x elements, one for each
1036 // thread in the block.
1037 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
1038 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
1039 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1040 N offset = ibegin + ichunk*nthreads;
1041 if (offset >= iend) { break; }
1042
1043 offset += threadIdx.x;
1044 T x0 = (offset < iend) ? fin(offset) : 0;
1045 if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
1046 *totalsum_p += x0;
1047 }
1048 T x = x0;
1049 // Scan within a warp
1050 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
1051 AMREX_HIP_OR_CUDA( T s = __shfl_up(x,i);,
1052 T s = __shfl_up_sync(0xffffffff, x, i); )
1053 if (lane >= i) { x += s; }
1054 }
1055
1056 // x now holds the inclusive sum within the warp. The
1057 // last thread in each warp holds the inclusive sum of
1058 // this warp. We will store it in shared memory.
1059 if (lane == Gpu::Device::warp_size - 1) {
1060 shared[warp] = x;
1061 }
1062
1063 __syncthreads();
1064
1065 // The first warp will do a scan of the warp sums for the
1066 // whole block. Not all threads in the warp need to
1067 // participate.
1068#ifdef AMREX_USE_CUDA
1069 if (warp == 0 && lane < nwarps) {
1070 T y = shared[lane];
1071 int mask = (1 << nwarps) - 1;
1072 for (int i = 1; i <= nwarps; i *= 2) {
1073 T s = __shfl_up_sync(mask, y, i, nwarps);
1074 if (lane >= i) { y += s; }
1075 }
1076 shared2[lane] = y;
1077 }
1078#else
1079 if (warp == 0) {
1080 T y = 0;
1081 if (lane < nwarps) {
1082 y = shared[lane];
1083 }
1084 for (int i = 1; i <= nwarps; i *= 2) {
1085 T s = __shfl_up(y, i, nwarps);
1086 if (lane >= i) { y += s; }
1087 }
1088 if (lane < nwarps) {
1089 shared2[lane] = y;
1090 }
1091 }
1092#endif
1093
1094 __syncthreads();
1095
1096 // shared2[0:nwarps) holds the inclusive sum of warp sums.
1097
1098 // Also note x still holds the inclusive sum within the
1099 // warp. Given these two, we can compute the inclusive
1100 // sum within this chunk.
1101 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
1102 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
1103 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
1104 sum_prev_chunk += shared2[nwarps-1];
1105 }
1106
1107 // sum_prev_chunk now holds the sum of the whole block.
1108 if (threadIdx.x == 0 && gridDim.x > 1) {
1109 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
1110 sum_prev_chunk);
1111 }
1112
1113 if (virtual_block_id == 0) {
1114 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1115 N offset = ibegin + ichunk*nthreads + threadIdx.x;
1116 if (offset >= iend) { break; }
1117 fout(offset, tmp_out[ichunk]);
1118 if (offset == n-1) {
1119 *totalsum_p += tmp_out[ichunk];
1120 }
1121 }
1122 } else if (virtual_block_id > 0) {
1123
1124 if (warp == 0) {
1125 T exclusive_prefix = 0;
1126 BlockStatusT volatile* pbs = block_status_p;
1127 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
1128 {
1129 int iblock = iblock0-lane;
1130 detail::STVA<T> stva{'p', 0};
1131 if (iblock >= 0) {
1132 stva = pbs[iblock].wait();
1133 }
1134
1135 T x = stva.value;
1136
1137 AMREX_HIP_OR_CUDA( uint64_t const status_bf = __ballot(stva.status == 'p');,
1138 unsigned const status_bf = __ballot_sync(0xffffffff, stva.status == 'p'));
1139 bool stop_lookback = status_bf & 0x1u;
1140 if (stop_lookback == false) {
1141 if (status_bf != 0) {
1142 T y = x;
1143 if (lane > 0) { x = 0; }
1144 AMREX_HIP_OR_CUDA(uint64_t bit_mask = 0x1ull;,
1145 unsigned bit_mask = 0x1u);
1146 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
1147 bit_mask <<= 1;
1148 if (i == lane) { x = y; }
1149 if (status_bf & bit_mask) {
1150 stop_lookback = true;
1151 break;
1152 }
1153 }
1154 }
1155
1156 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
1157 AMREX_HIP_OR_CUDA( x += __shfl_down(x,i);,
1158 x += __shfl_down_sync(0xffffffff, x, i); )
1159 }
1160 }
1161
1162 if (lane == 0) { exclusive_prefix += x; }
1163 if (stop_lookback) { break; }
1164 }
1165
1166 if (lane == 0) {
1167 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
1168 shared[0] = exclusive_prefix;
1169 }
1170 }
1171
1172 __syncthreads();
1173
1174 T exclusive_prefix = shared[0];
1175
1176 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1177 N offset = ibegin + ichunk*nthreads + threadIdx.x;
1178 if (offset >= iend) { break; }
1179 T t = tmp_out[ichunk] + exclusive_prefix;
1180 fout(offset, t);
1181 if (offset == n-1) {
1182 *totalsum_p += t;
1183 }
1184 }
1185 }
1186 });
1187
1188 T totalsum = 0;
1189 if (a_ret_sum) {
1190 // xxxxx CUDA < 11 todo: Should test if using pinned memory and thus
1191 // avoiding memcpy is faster.
1192 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
1193
1194 Gpu::streamSynchronize();
1195 The_Arena()->free(dp);
1196
1198 } else {
1199 Gpu::freeAsync(The_Arena(), dp);
1200 }
1201
1202 return totalsum;
1203}
1204
1205#endif
1206
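// Usage sketch for PrefixSum (illustrative only; assumes
// amrex::Gpu::DeviceVector from AMReX_GpuContainers.H): stream compaction with
// an exclusive scan, where flag[i] is 0 or 1 and off[i] becomes the output
// slot of element i.
//
//   amrex::Gpu::DeviceVector<int> flag(n), off(n);
//   // ... fill flag ...
//   int const* pf = flag.data();
//   int* po = off.data();
//   int num_selected = amrex::Scan::PrefixSum<int>(n,
//       [=] AMREX_GPU_DEVICE (int i) -> int { return pf[i]; },
//       [=] AMREX_GPU_DEVICE (int i, int const& s) { po[i] = s; },
//       amrex::Scan::Type::exclusive, amrex::Scan::retSum);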
1218template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1219T InclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1220{
1221#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1222 void* d_temp = nullptr;
1223 std::size_t temp_bytes = 0;
1224 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1225 Gpu::gpuStream()));
1226 d_temp = The_Arena()->alloc(temp_bytes);
1227 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1228 Gpu::gpuStream()));
1229 T totalsum = 0;
1230 if (a_ret_sum) {
1231 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1232 }
1233 Gpu::streamSynchronize();
1234 The_Arena()->free(d_temp);
1236 return totalsum;
1237#elif defined(AMREX_USE_HIP)
1238 void* d_temp = nullptr;
1239 std::size_t temp_bytes = 0;
1240 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1241 rocprim::plus<T>(), Gpu::gpuStream()));
1242 d_temp = The_Arena()->alloc(temp_bytes);
1243 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1244 rocprim::plus<T>(), Gpu::gpuStream()));
1245 T totalsum = 0;
1246 if (a_ret_sum) {
1247 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1248 }
1249 Gpu::streamSynchronize();
1250 The_Arena()->free(d_temp);
1252 return totalsum;
1253#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1254 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1255 std::inclusive_scan(policy, in, in+n, out, std::plus<T>(), T(0));
1256 T totalsum = 0;
1257 if (a_ret_sum) {
1258 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1259 }
1260 Gpu::streamSynchronize();
1262 return totalsum;
1263#else
1264 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1265 return PrefixSum<T>(static_cast<int>(n),
1266 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1267 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1268 Type::inclusive, a_ret_sum);
1269 } else {
1270 return PrefixSum<T>(n,
1271 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1272 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1273 Type::inclusive, a_ret_sum);
1274 }
1275#endif
1276}
1277
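// Illustrative call (device pointers d_in and d_out of length n):
//   T total = amrex::Scan::InclusiveSum(n, d_in, d_out);
// With the default retSum, the last output element is copied back, so `total`
// holds the sum of all n inputs.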
1289template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1290T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1291{
1292 if (n <= 0) { return 0; }
1293#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1294 T in_last = 0;
1295 if (a_ret_sum) {
1296 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1297 }
1298 void* d_temp = nullptr;
1299 std::size_t temp_bytes = 0;
1300 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1301 Gpu::gpuStream()));
1302 d_temp = The_Arena()->alloc(temp_bytes);
1303 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1304 Gpu::gpuStream()));
1305 T out_last = 0;
1306 if (a_ret_sum) {
1307 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1308 }
1309 Gpu::streamSynchronize();
1310 The_Arena()->free(d_temp);
1312 return in_last+out_last;
1313#elif defined(AMREX_USE_HIP)
1314 T in_last = 0;
1315 if (a_ret_sum) {
1316 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1317 }
1318 void* d_temp = nullptr;
1319 std::size_t temp_bytes = 0;
1320 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1321 rocprim::plus<T>(), Gpu::gpuStream()));
1322 d_temp = The_Arena()->alloc(temp_bytes);
1323 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1324 rocprim::plus<T>(), Gpu::gpuStream()));
1325 T out_last = 0;
1326 if (a_ret_sum) {
1327 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1328 }
1329 Gpu::streamSynchronize();
1330 The_Arena()->free(d_temp);
1332 return in_last+out_last;
1333#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1334 T in_last = 0;
1335 if (a_ret_sum) {
1336 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1337 }
1338 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1339 std::exclusive_scan(policy, in, in+n, out, T(0), std::plus<T>());
1340 T out_last = 0;
1341 if (a_ret_sum) {
1342 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1343 }
1346 return in_last+out_last;
1347#else
1348 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1349 return PrefixSum<T>(static_cast<int>(n),
1350 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1351 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1352 Type::exclusive, a_ret_sum);
1353 } else {
1354 return PrefixSum<T>(n,
1355 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1356 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1357 Type::exclusive, a_ret_sum);
1358 }
1359#endif
1360}
1361
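// Worked example (illustrative): for in = {1,2,3}, ExclusiveSum writes
// out = {0,1,3} and returns in[n-1] + out[n-1] = 3 + 3 = 6, i.e. the sum of
// all inputs, matching what InclusiveSum would return.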
1362#else
1363// !defined(AMREX_USE_GPU)
1364template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
1365 typename M=std::enable_if_t<std::is_integral_v<N> &&
1366 (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ||
1367 std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>)> >
1368T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum)
1369{
1370 if (n <= 0) { return 0; }
1371 T totalsum = 0;
1372 for (N i = 0; i < n; ++i) {
1373 T x = fin(i);
1374 T y = totalsum;
1375 totalsum += x;
1376 AMREX_IF_CONSTEXPR (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive>) {
1377 y += x;
1378 }
1379 fout(i, y);
1380 }
1381 return totalsum;
1382}
1383
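// Worked example (illustrative) for the serial loop above: if fin(i) yields
// {1,2,3,4}, Type::Inclusive passes {1,3,6,10} to fout while Type::Exclusive
// passes {0,1,3,6}; both return the total 10.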
1384// The return value is the total sum.
1385template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1386T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1387{
1388#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1389 // GCC's __cplusplus is not a reliable indication for C++17 support
1390 std::inclusive_scan(in, in+n, out);
1391#else
1392 std::partial_sum(in, in+n, out);
1393#endif
1394 return (n > 0) ? out[n-1] : T(0);
1395}
1396
1397// The return value is the total sum.
1398template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1399T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1400{
1401 if (n <= 0) { return 0; }
1402
1403 auto in_last = in[n-1];
1404#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1405 // GCC's __cplusplus is not a reliable indication for C++17 support
1406 std::exclusive_scan(in, in+n, out, 0);
1407#else
1408 out[0] = 0;
1409 std::partial_sum(in, in+n-1, out+1);
1410#endif
1411 return in_last + out[n-1];
1412}
1413
1414#endif
1415
1416}
1417
1418namespace Gpu
1419{
1421 template<class InIter, class OutIter>
1422 OutIter inclusive_scan (InIter begin, InIter end, OutIter result)
1423 {
1424#if defined(AMREX_USE_GPU)
1425 auto N = std::distance(begin, end);
1426 Scan::InclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1427 OutIter result_end = result;
1428 std::advance(result_end, N);
1429 return result_end;
1430#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1431 // GCC's __cplusplus is not a reliable indication for C++17 support
1432 return std::inclusive_scan(begin, end, result);
1433#else
1434 return std::partial_sum(begin, end, result);
1435#endif
1436 }
1437
1439 template<class InIter, class OutIter>
1440 OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
1441 {
1442#if defined(AMREX_USE_GPU)
1443 auto N = std::distance(begin, end);
1444 Scan::ExclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1445 OutIter result_end = result;
1446 std::advance(result_end, N);
1447 return result_end;
1448#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1449 // GCC's __cplusplus is not a reliable indication for C++17 support
1450 return std::exclusive_scan(begin, end, result, 0);
1451#else
1452 if (begin == end) { return result; }
1453
1454 typename std::iterator_traits<InIter>::value_type sum = *begin;
1455 *result++ = sum - *begin;
1456
1457 while (++begin != end) {
1458 sum = std::move(sum) + *begin;
1459 *result++ = sum - *begin;
1460 }
1461 return result;
1462#endif
1463 }
1464
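// Usage sketch (illustrative; assumes contiguous device iterators such as
// those of amrex::Gpu::DeviceVector, since the GPU path takes &(*begin)):
//   amrex::Gpu::DeviceVector<Long> v(n), r(n);
//   amrex::Gpu::exclusive_scan(v.begin(), v.end(), r.begin());
// Unlike Scan::ExclusiveSum, these iterator overloads do not return the total
// (they pass noRetSum internally).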
1465}}
1466
1467#endif