Block-Structured AMR Software Framework
 
AMReX_Scan.H
1#ifndef AMREX_SCAN_H_
2#define AMREX_SCAN_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Extension.H>
6#include <AMReX_Gpu.H>
7#include <AMReX_Arena.H>
8
9#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
10# include <cub/cub.cuh>
11#elif defined(AMREX_USE_HIP)
12# include <rocprim/rocprim.hpp>
13#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
14# include <oneapi/dpl/execution>
15# include <oneapi/dpl/numeric>
16#endif
17
18#include <cstdint>
19#include <numeric>
20#include <type_traits>
21
22namespace amrex {
23namespace Scan {
24
25struct RetSum {
26 bool flag = true;
27 explicit operator bool() const noexcept { return flag; }
28};
29static constexpr RetSum retSum{true};
30static constexpr RetSum noRetSum{false};
31
32namespace Type {
33 static constexpr struct Inclusive {} inclusive{};
34 static constexpr struct Exclusive {} exclusive{};
35}
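// Example (sketch): the tags above select the scan flavor and whether the
// total sum is copied back to the host. A typical call, where p_in and p_out
// stand for hypothetical device pointers holding n elements:
//
//     Long total = amrex::Scan::PrefixSum<Long>(n,
//         [=] AMREX_GPU_DEVICE (int i) -> Long { return p_in[i]; },
//         [=] AMREX_GPU_DEVICE (int i, Long const& x) { p_out[i] = x; },
//         amrex::Scan::Type::exclusive, amrex::Scan::retSum);
//
// Passing amrex::Scan::noRetSum skips the device-to-host copy of the total.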
36
37#if defined(AMREX_USE_GPU)
38
39namespace detail {
40
41template <typename T>
42struct STVA
43{
44 char status;
45 T value;
46};
47
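// Note: the status byte in STVA encodes a block's state for the decoupled
// look-back: 'x' = not ready yet, 'a' = only this block's aggregate is
// available, 'p' = the inclusive prefix (this block plus all preceding blocks)
// is available. When STVA<T> fits in 64 bits, BlockStatus<T,true> packs
// {status, value} into a single word so it can be read and written atomically.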
48template <typename T, bool SINGLE_WORD> struct BlockStatus {};
49
50template <typename T>
51struct BlockStatus<T, true>
52{
53 template<typename U>
54 union Data {
55 STVA<U> s;
56 uint64_t i;
57 void operator=(Data<U> const&) = delete;
58 void operator=(Data<U> &&) = delete;
59 };
60 Data<T> d;
61
62 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
63 void write (char a_status, T a_value) {
64#if defined(AMREX_USE_CUDA)
65 volatile uint64_t tmp;
66 reinterpret_cast<STVA<T> volatile&>(tmp).status = a_status;
67 reinterpret_cast<STVA<T> volatile&>(tmp).value = a_value;
68 reinterpret_cast<uint64_t&>(d.s) = tmp;
69#else
70 Data<T> tmp;
71 tmp.s = {a_status, a_value};
72 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
73 "HIP/SYCL: unsigned long long must be 64 bits");
74 Gpu::Atomic::Exch(reinterpret_cast<unsigned long long*>(&d),
75 reinterpret_cast<unsigned long long&>(tmp));
76#endif
77 }
78
79 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
80 T get_aggregate() const { return d.s.value; }
81
82 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
83 STVA<T> read () volatile {
84#if defined(AMREX_USE_CUDA)
85 volatile uint64_t tmp = reinterpret_cast<uint64_t volatile&>(d);
86 return {reinterpret_cast<STVA<T> volatile&>(tmp).status,
87 reinterpret_cast<STVA<T> volatile&>(tmp).value };
88#else
89 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
90 "HIP/SYCL: unsigned long long must be 64 bits");
91 unsigned long long tmp = Gpu::Atomic::Add
92 (reinterpret_cast<unsigned long long*>(const_cast<Data<T>*>(&d)), 0ull);
93 return (*reinterpret_cast<Data<T>*>(&tmp)).s;
94#endif
95 }
96
97 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
98 void set_status (char a_status) { d.s.status = a_status; }
99
100 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
101 STVA<T> wait () volatile {
102 STVA<T> r;
103 do {
104#if defined(AMREX_USE_SYCL)
105 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
106#else
107 __threadfence_block();
108#endif
109 r = read();
110 } while (r.status == 'x');
111 return r;
112 }
113};
114
115template <typename T>
116struct BlockStatus<T, false>
117{
118 T aggregate;
119 T inclusive;
120 char status;
121
122 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
123 void write (char a_status, T a_value) {
124 if (a_status == 'a') {
125 aggregate = a_value;
126 } else {
127 inclusive = a_value;
128 }
129#if defined(AMREX_USE_SYCL)
130 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
131#else
132 __threadfence();
133#endif
134 status = a_status;
135 }
136
137 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
138 T get_aggregate() const { return aggregate; }
139
140 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
141 STVA<T> read () volatile {
142#if defined(AMREX_USE_SYCL)
143 constexpr auto mo = sycl::memory_order::relaxed;
144 constexpr auto ms = sycl::memory_scope::device;
145 constexpr auto as = sycl::access::address_space::global_space;
146#endif
147 if (status == 'x') {
148 return {'x', 0};
149 } else if (status == 'a') {
150#if defined(AMREX_USE_SYCL)
151 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(aggregate)};
152 return {'a', ar.load()};
153#else
154 return {'a', aggregate};
155#endif
156 } else {
157#if defined(AMREX_USE_SYCL)
158 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(inclusive)};
159 return {'p', ar.load()};
160#else
161 return {'p', inclusive};
162#endif
163 }
164 }
165
166 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
167 void set_status (char a_status) { status = a_status; }
168
169 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
170 STVA<T> wait () volatile {
171 STVA<T> r;
172 do {
173 r = read();
174#if defined(AMREX_USE_SYCL)
175 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
176#else
177 __threadfence();
178#endif
179 } while (r.status == 'x');
180 return r;
181 }
182};
183
184}
185
186#if defined(AMREX_USE_SYCL)
187
188#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
189template <typename T, typename N, typename FIN, typename FOUT, typename TYPE>
190T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
191{
192 if (n <= 0) { return 0; }
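    // Note: this multi-pass fallback runs three kernels: (1) a block-local scan
    // that writes per-element partial results (blockresult_p) and per-block sums
    // (blocksum_p), (2) a single-block scan over the block sums that also records
    // the total, and (3) a pass that adds each block's exclusive prefix to its
    // local results and hands them to fout.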
193 constexpr int nwarps_per_block = 8;
194 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
195 constexpr int nchunks = 12;
196 constexpr int nelms_per_block = nthreads * nchunks;
197 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
198 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
199 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block);
200 auto stream = Gpu::gpuStream();
201
202 std::size_t nbytes_blockresult = Arena::align(sizeof(T)*n);
203 std::size_t nbytes_blocksum = Arena::align(sizeof(T)*nblocks);
204 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
205 auto dp = (char*)(The_Arena()->alloc(nbytes_blockresult
206 + nbytes_blocksum
207 + nbytes_totalsum));
208 T* blockresult_p = (T*)dp;
209 T* blocksum_p = (T*)(dp + nbytes_blockresult);
210 T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
211
212 amrex::launch(nblocks, nthreads, sm, stream,
213 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
214 {
215 sycl::sub_group const& sg = gh.item->get_sub_group();
216 int lane = sg.get_local_id()[0];
217 int warp = sg.get_group_id()[0];
218 int nwarps = sg.get_group_range()[0];
219
220 int threadIdxx = gh.item->get_local_id(0);
221 int blockIdxx = gh.item->get_group_linear_id();
222 int blockDimx = gh.item->get_local_range(0);
223
224 T* shared = (T*)(gh.local);
225 T* shared2 = shared + Gpu::Device::warp_size;
226
227 // Each block processes [ibegin,iend).
228 N ibegin = nelms_per_block * blockIdxx;
229 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
230
231 // Each block is responsible for nchunks chunks of data,
232 // where each chunk has blockDim.x elements, one for each
233 // thread in the block.
234 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
235 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
236 N offset = ibegin + ichunk*blockDimx;
237 if (offset >= iend) { break; }
238
239 offset += threadIdxx;
240 T x0 = (offset < iend) ? fin(offset) : 0;
241 T x = x0;
242 // Scan within a warp
243 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
244 T s = sycl::shift_group_right(sg, x, i);
245 if (lane >= i) { x += s; }
246 }
247
248 // x now holds the inclusive sum within the warp. The
249 // last thread in each warp holds the inclusive sum of
250 // this warp. We will store it in shared memory.
251 if (lane == Gpu::Device::warp_size - 1) {
252 shared[warp] = x;
253 }
254
255 gh.item->barrier(sycl::access::fence_space::local_space);
256
257 // The first warp will do scan on the warp sums for the
258 // whole block.
259 if (warp == 0) {
260 T y = (lane < nwarps) ? shared[lane] : 0;
261 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
262 T s = sycl::shift_group_right(sg, y, i);
263 if (lane >= i) { y += s; }
264 }
265
266 if (lane < nwarps) { shared2[lane] = y; }
267 }
268
269 gh.item->barrier(sycl::access::fence_space::local_space);
270
271 // shared[0:nwarps) holds the inclusive sum of warp sums.
272
273 // Also note x still holds the inclusive sum within the
274 // warp. Given these two, we can compute the inclusive
275 // sum within this chunk.
276 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
277 T tmp_out = sum_prev_warp + sum_prev_chunk +
278 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
279 sum_prev_chunk += shared2[nwarps-1];
280
281 if (offset < iend) {
282 blockresult_p[offset] = tmp_out;
283 }
284 }
285
286 // sum_prev_chunk now holds the sum of the whole block.
287 if (threadIdxx == 0) {
288 blocksum_p[blockIdxx] = sum_prev_chunk;
289 }
290 });
291
292 amrex::launch(1, nthreads, sm, stream,
293 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
294 {
295 sycl::sub_group const& sg = gh.item->get_sub_group();
296 int lane = sg.get_local_id()[0];
297 int warp = sg.get_group_id()[0];
298 int nwarps = sg.get_group_range()[0];
299
300 int threadIdxx = gh.item->get_local_id(0);
301 int blockDimx = gh.item->get_local_range(0);
302
303 T* shared = (T*)(gh.local);
304 T* shared2 = shared + Gpu::Device::warp_size;
305
306 T sum_prev_chunk = 0;
307 for (int offset = threadIdxx; offset - threadIdxx < nblocks; offset += blockDimx) {
308 T x = (offset < nblocks) ? blocksum_p[offset] : 0;
309 // Scan within a warp
310 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
311 T s = sycl::shift_group_right(sg, x, i);
312 if (lane >= i) { x += s; }
313 }
314
315 // x now holds the inclusive sum within the warp. The
316 // last thread in each warp holds the inclusive sum of
317 // this warp. We will store it in shared memory.
318 if (lane == Gpu::Device::warp_size - 1) {
319 shared[warp] = x;
320 }
321
322 gh.item->barrier(sycl::access::fence_space::local_space);
323
324 // The first warp will do scan on the warp sums for the
325 // whole block.
326 if (warp == 0) {
327 T y = (lane < nwarps) ? shared[lane] : 0;
328 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
329 T s = sycl::shift_group_right(sg, y, i);
330 if (lane >= i) { y += s; }
331 }
332
333 if (lane < nwarps) { shared2[lane] = y; }
334 }
335
336 gh.item->barrier(sycl::access::fence_space::local_space);
337
338 // shared[0:nwarps) holds the inclusive sum of warp sums.
339
340 // Also note x still holds the inclusive sum within the
341 // warp. Given these two, we can compute the inclusive
342 // sum within this chunk.
343 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
344 T tmp_out = sum_prev_warp + sum_prev_chunk + x;
345 sum_prev_chunk += shared2[nwarps-1];
346
347 if (offset < nblocks) {
348 blocksum_p[offset] = tmp_out;
349 }
350 }
351
352 // sum_prev_chunk now holds the total sum.
353 if (threadIdxx == 0) {
354 *totalsum_p = sum_prev_chunk;
355 }
356 });
357
358 amrex::launch(nblocks, nthreads, 0, stream,
359 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
360 {
361 int threadIdxx = gh.item->get_local_id(0);
362 int blockIdxx = gh.item->get_group_linear_id();
363 int blockDimx = gh.item->get_local_range(0);
364
365 // Each block processes [ibegin,iend).
366 N ibegin = nelms_per_block * blockIdxx;
367 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
368 T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
369 for (N offset = ibegin + threadIdxx; offset < iend; offset += blockDimx) {
370 fout(offset, prev_sum + blockresult_p[offset]);
371 }
372 });
373
374 T totalsum = 0;
375 if (a_ret_sum) {
376 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
377 }
378 Gpu::streamSynchronize();
379 The_Arena()->free(dp);
380
381 AMREX_GPU_ERROR_CHECK();
382
383 return totalsum;
384}
385#endif
386
387template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
388 typename M=std::enable_if_t<std::is_integral<N>::value &&
389 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
390 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
391T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum)
392{
393 if (n <= 0) { return 0; }
394 constexpr int nwarps_per_block = 8;
395 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
396 constexpr int nchunks = 12;
397 constexpr int nelms_per_block = nthreads * nchunks;
398 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
399 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
400
401#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
402 if (nblocks > 1) {
403 return PrefixSum_mp<T>(n, std::forward<FIN>(fin), std::forward<FOUT>(fout), type, a_ret_sum);
404 }
405#endif
406
407 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
408 auto stream = Gpu::gpuStream();
409
410 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
411 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
412
413 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
414 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
415 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
416 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
417 + nbytes_blockid
418 + nbytes_totalsum));
419 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
420 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
421 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
422
423 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
424 BlockStatusT& block_status = block_status_p[i];
425 block_status.set_status('x');
426 if (i == 0) {
427 *virtual_block_id_p = 0;
428 *totalsum_p = 0;
429 }
430 });
431
432 amrex::launch(nblocks, nthreads, sm, stream,
433 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
434 {
435 sycl::sub_group const& sg = gh.item->get_sub_group();
436 int lane = sg.get_local_id()[0];
437 int warp = sg.get_group_id()[0];
438 int nwarps = sg.get_group_range()[0];
439
440 int threadIdxx = gh.item->get_local_id(0);
441 int blockDimx = gh.item->get_local_range(0);
442 int gridDimx = gh.item->get_group_range(0);
443
444 T* shared = (T*)(gh.local);
445 T* shared2 = shared + Gpu::Device::warp_size;
446
447 // First of all, get block virtual id. We must do this to
448 // avoid deadlock because blocks may be launched in any order.
449 // Anywhere in this function, we should not use blockIdx.
450 int virtual_block_id = 0;
451 if (gridDimx > 1) {
452 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
453 if (threadIdxx == 0) {
454 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
455 virtual_block_id_shared = bid;
456 }
457 gh.item->barrier(sycl::access::fence_space::local_space);
458 virtual_block_id = virtual_block_id_shared;
459 }
460
461 // Each block processes [ibegin,iend).
462 N ibegin = nelms_per_block * virtual_block_id;
463 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
464 BlockStatusT& block_status = block_status_p[virtual_block_id];
465
466 //
467 // The overall algorithm is based on "Single-pass Parallel
468 // Prefix Scan with Decoupled Look-back" by D. Merrill &
469 // M. Garland.
470 //
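    // Note: each block publishes its aggregate with status 'a' as soon as it is
    // known and later upgrades it to an inclusive prefix with status 'p'. Blocks
    // other than the first do not wait at a global barrier; instead the look-back
    // loop further below walks backwards over predecessor BlockStatus records,
    // accumulating their values until it reaches one marked 'p'.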
471
472 // Each block is responsible for nchunks chunks of data,
473 // where each chunk has blockDim.x elements, one for each
474 // thread in the block.
475 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
476 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
477 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
478 N offset = ibegin + ichunk*blockDimx;
479 if (offset >= iend) { break; }
480
481 offset += threadIdxx;
482 T x0 = (offset < iend) ? fin(offset) : 0;
483 if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
484 *totalsum_p += x0;
485 }
486 T x = x0;
487 // Scan within a warp
488 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
489 T s = sycl::shift_group_right(sg, x, i);
490 if (lane >= i) { x += s; }
491 }
492
493 // x now holds the inclusive sum within the warp. The
494 // last thread in each warp holds the inclusive sum of
495 // this warp. We will store it in shared memory.
496 if (lane == Gpu::Device::warp_size - 1) {
497 shared[warp] = x;
498 }
499
500 gh.item->barrier(sycl::access::fence_space::local_space);
501
502 // The first warp will do scan on the warp sums for the
503 // whole block.
504 if (warp == 0) {
505 T y = (lane < nwarps) ? shared[lane] : 0;
506 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
507 T s = sycl::shift_group_right(sg, y, i);
508 if (lane >= i) { y += s; }
509 }
510
511 if (lane < nwarps) { shared2[lane] = y; }
512 }
513
514 gh.item->barrier(sycl::access::fence_space::local_space);
515
516 // shared[0:nwarps) holds the inclusive sum of warp sums.
517
518 // Also note x still holds the inclusive sum within the
519 // warp. Given these two, we can compute the inclusive
520 // sum within this chunk.
521 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
522 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
523 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
524 sum_prev_chunk += shared2[nwarps-1];
525 }
526
527 // sum_prev_chunk now holds the sum of the whole block.
528 if (threadIdxx == 0 && gridDimx > 1) {
529 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
530 sum_prev_chunk);
531 }
532
533 if (virtual_block_id == 0) {
534 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
535 N offset = ibegin + ichunk*blockDimx + threadIdxx;
536 if (offset >= iend) { break; }
537 fout(offset, tmp_out[ichunk]);
538 if (offset == n-1) {
539 *totalsum_p += tmp_out[ichunk];
540 }
541 }
542 } else if (virtual_block_id > 0) {
543
544 if (warp == 0) {
545 T exclusive_prefix = 0;
546 BlockStatusT volatile* pbs = block_status_p;
547 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
548 {
549 int iblock = iblock0-lane;
550 detail::STVA<T> stva{'p', 0};
551 if (iblock >= 0) {
552 stva = pbs[iblock].wait();
553 }
554
555 T x = stva.value;
556
557 // implement our own __ballot
558 unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
559 for (int i = 1; i < Gpu::Device::warp_size; i *= 2) {
560 status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);
561 }
562
563 bool stop_lookback = status_bf & 0x1u;
564 if (stop_lookback == false) {
565 if (status_bf != 0) {
566 T y = x;
567 if (lane > 0) { x = 0; }
568 unsigned int bit_mask = 0x1u;
569 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
570 bit_mask <<= 1;
571 if (i == lane) { x = y; }
572 if (status_bf & bit_mask) {
573 stop_lookback = true;
574 break;
575 }
576 }
577 }
578
579 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
580 x += sycl::shift_group_left(sg, x,i);
581 }
582 }
583
584 if (lane == 0) { exclusive_prefix += x; }
585 if (stop_lookback) { break; }
586 }
587
588 if (lane == 0) {
589 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
590 shared[0] = exclusive_prefix;
591 }
592 }
593
594 gh.item->barrier(sycl::access::fence_space::local_space);
595
596 T exclusive_prefix = shared[0];
597
598 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
599 N offset = ibegin + ichunk*blockDimx + threadIdxx;
600 if (offset >= iend) { break; }
601 T t = tmp_out[ichunk] + exclusive_prefix;
602 fout(offset, t);
603 if (offset == n-1) {
604 *totalsum_p += t;
605 }
606 }
607 }
608 });
609
610 T totalsum = 0;
611 if (a_ret_sum) {
612 // xxxxx SYCL todo: Should test if using pinned memory and thus
613 // avoiding memcpy is faster.
614 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
615 }
616 Gpu::streamSynchronize();
617 The_Arena()->free(dp);
618
619 AMREX_GPU_ERROR_CHECK();
620
621 return totalsum;
622}
623
624#elif defined(AMREX_USE_HIP)
625
626template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
627 typename M=std::enable_if_t<std::is_integral<N>::value &&
628 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
629 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
630T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
631{
632 if (n <= 0) { return 0; }
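    // Note: this HIP path delegates the decoupled look-back bookkeeping to
    // rocPrim (lookback_scan_state / ordered_block_id) and only adds the
    // block-local load, scan, and store around it.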
633 constexpr int nwarps_per_block = 4;
634 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
635 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
636 constexpr int nelms_per_block = nthreads * nelms_per_thread;
637 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
638 std::size_t sm = 0;
639 auto stream = Gpu::gpuStream();
640
641 using ScanTileState = rocprim::detail::lookback_scan_state<T>;
642 using OrderedBlockId = rocprim::detail::ordered_block_id<unsigned int>;
643
644#if (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6)) || \
645 (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR == 6) && \
646 defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR == 0))
647
648 std::size_t nbytes_tile_state = rocprim::detail::align_size
649 (ScanTileState::get_storage_size(nblocks));
650 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
651
652 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
653
654 ScanTileState tile_state = ScanTileState::create(dp, nblocks);
655
656#else
657
658 std::size_t nbytes_tile_state;
659 AMREX_HIP_SAFE_CALL(ScanTileState::get_storage_size(nblocks, stream, nbytes_tile_state));
660 nbytes_tile_state = rocprim::detail::align_size(nbytes_tile_state);
661
662 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
663
664 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
665
666 ScanTileState tile_state;
667 AMREX_HIP_SAFE_CALL(ScanTileState::create(tile_state, dp, nblocks, stream));
668
669#endif
670
671 auto ordered_block_id = OrderedBlockId::create
672 (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
673
674 // Init ScanTileState on device
675 amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
676 {
677 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
678 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
679 const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
680 if (gid == 0) { scan_bid.reset(); }
681 scan_tile_state.initialize_prefix(gid, nblocks);
682 });
683
684 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
685
686 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
687 [=] AMREX_GPU_DEVICE () noexcept
688 {
689 using BlockLoad = rocprim::block_load<T, nthreads, nelms_per_thread,
690 rocprim::block_load_method::block_load_transpose>;
691 using BlockScan = rocprim::block_scan<T, nthreads,
692 rocprim::block_scan_algorithm::using_warp_scan>;
693 using BlockExchange = rocprim::block_exchange<T, nthreads, nelms_per_thread>;
694 using LookbackScanPrefixOp = rocprim::detail::lookback_scan_prefix_op
695 <T, rocprim::plus<T>, ScanTileState>;
696
697 __shared__ struct TempStorage {
698 typename OrderedBlockId::storage_type ordered_bid;
699 union {
700 typename BlockLoad::storage_type load;
701 typename BlockExchange::storage_type exchange;
702 typename BlockScan::storage_type scan;
703 };
704 } temp_storage;
705
706 // Lambda captured tile_state is const. We have to cast the const away.
707 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
708 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
709
710 auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);
711
712 // Each block processes [ibegin,iend).
713 N ibegin = nelms_per_block * virtual_block_id;
714 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
715
716 auto input_begin = rocprim::make_transform_iterator(
717 rocprim::make_counting_iterator(N(0)),
718 [&] (N i) -> T { return fin(i+ibegin); });
719
720 T data[nelms_per_thread];
721 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
722 BlockLoad().load(input_begin, data, temp_storage.load);
723 } else {
724 // padding with 0
725 BlockLoad().load(input_begin, data, iend-ibegin, 0, temp_storage.load);
726 }
727
728 __syncthreads();
729
730 constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
731
732 if (virtual_block_id == 0) {
733 T block_agg;
734 AMREX_IF_CONSTEXPR(is_exclusive) {
735 BlockScan().exclusive_scan(data, data, T{0}, block_agg, temp_storage.scan);
736 } else {
737 BlockScan().inclusive_scan(data, data, block_agg, temp_storage.scan);
738 }
739 if (threadIdx.x == 0) {
740 if (nblocks > 1) {
741 scan_tile_state.set_complete(0, block_agg);
742 } else if (nblocks == 1 && totalsum_p) {
743 *totalsum_p = block_agg;
744 }
745 }
746 } else {
747 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
748
749 LookbackScanPrefixOp prefix_op(virtual_block_id, rocprim::plus<T>(), scan_tile_state);
750 AMREX_IF_CONSTEXPR(is_exclusive) {
751 BlockScan().exclusive_scan(data, data, temp_storage.scan, prefix_op,
752 rocprim::plus<T>());
753 } else {
754 BlockScan().inclusive_scan(data, data, temp_storage.scan, prefix_op,
755 rocprim::plus<T>());
756 }
757 if (totalsum_p) {
758 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
759 T tsum = data[nelms_per_thread-1];
760 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
761 *totalsum_p = tsum;
762 }
763 }
764 }
765
766 __syncthreads();
767
768 BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
769
770 for (int i = 0; i < nelms_per_thread; ++i) {
771 N offset = ibegin + i*nthreads + threadIdx.x;
772 if (offset < iend) { fout(offset, data[i]); }
773 }
774 });
775
776 Gpu::streamSynchronize();
777 AMREX_GPU_ERROR_CHECK();
778
779 The_Arena()->free(dp);
780
781 T ret = (a_ret_sum) ? *totalsum_p : T(0);
782 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
783
784 return ret;
785}
786
787#elif defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
788
789template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
790 typename M=std::enable_if_t<std::is_integral<N>::value &&
791 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
792 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
793T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
794{
795 if (n <= 0) { return 0; }
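    // Note: this CUDA path delegates the decoupled look-back bookkeeping to CUB
    // (ScanTileState / TilePrefixCallbackOp), mirroring the rocPrim path above;
    // the kernel below only does the block-local load, scan, and store.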
796 constexpr int nwarps_per_block = 8;
797 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
798 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
799 constexpr int nelms_per_block = nthreads * nelms_per_thread;
800 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
801 std::size_t sm = 0;
802 auto stream = Gpu::gpuStream();
803
804 using ScanTileState = cub::ScanTileState<T>;
805 std::size_t tile_state_size = 0;
806 ScanTileState::AllocationSize(nblocks, tile_state_size);
807
808 std::size_t nbytes_tile_state = Arena::align(tile_state_size);
809 auto tile_state_p = (char*)(The_Arena()->alloc(nbytes_tile_state));
810
811 ScanTileState tile_state;
812 tile_state.Init(nblocks, tile_state_p, tile_state_size); // Init ScanTileState on host
813
814 if (nblocks > 1) {
815 // Init ScanTileState on device
816 amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
817 {
818 const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
819 });
820 }
821
822 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
823
824 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
825 [=] AMREX_GPU_DEVICE () noexcept
826 {
827 using BlockLoad = cub::BlockLoad<T, nthreads, nelms_per_thread, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
828 using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
829 using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;
830 using TilePrefixCallbackOp = cub::TilePrefixCallbackOp<T, cub::Sum, ScanTileState>;
831
832 __shared__ union TempStorage
833 {
834 typename BlockLoad::TempStorage load;
835 typename BlockExchange::TempStorage exchange;
836 struct ScanStorage {
837 typename BlockScan::TempStorage scan;
838 typename TilePrefixCallbackOp::TempStorage prefix;
839 } scan_storeage;
840 } temp_storage;
841
842 // Lambda captured tile_state is const. We have to cast the const away.
843 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
844
845 int virtual_block_id = blockIdx.x;
846
847 // Each block processes [ibegin,iend).
848 N ibegin = nelms_per_block * virtual_block_id;
849 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
850
851 auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
852 cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
853 input_begin(cub::CountingInputIterator<N>(0), input_lambda);
854
855 T data[nelms_per_thread];
856 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
857 BlockLoad(temp_storage.load).Load(input_begin, data);
858 } else {
859 BlockLoad(temp_storage.load).Load(input_begin, data, iend-ibegin, 0); // padding with 0
860 }
861
862 __syncthreads();
863
864 constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
865
866 if (virtual_block_id == 0) {
867 T block_agg;
868 AMREX_IF_CONSTEXPR(is_exclusive) {
869 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, block_agg);
870 } else {
871 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, block_agg);
872 }
873 if (threadIdx.x == 0) {
874 if (nblocks > 1) {
875 scan_tile_state.SetInclusive(0, block_agg);
876 } else if (nblocks == 1 && totalsum_p) {
877 *totalsum_p = block_agg;
878 }
879 }
880 } else {
881 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
882
883 TilePrefixCallbackOp prefix_op(scan_tile_state, temp_storage.scan_storeage.prefix,
884 cub::Sum{}, virtual_block_id);
885 AMREX_IF_CONSTEXPR(is_exclusive) {
886 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, prefix_op);
887 } else {
888 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
889 }
890 if (totalsum_p) {
891 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
892 T tsum = data[nelms_per_thread-1];
893 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
894 *totalsum_p = tsum;
895 }
896 }
897 }
898
899 __syncthreads();
900
901 BlockExchange(temp_storage.exchange).BlockedToStriped(data);
902
903 for (int i = 0; i < nelms_per_thread; ++i) {
904 N offset = ibegin + i*nthreads + threadIdx.x;
905 if (offset < iend) { fout(offset, data[i]); }
906 }
907 });
908
909 Gpu::streamSynchronize();
910 AMREX_GPU_ERROR_CHECK();
911
912 The_Arena()->free(tile_state_p);
913
914 T ret = (a_ret_sum) ? *totalsum_p : T(0);
915 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
916
917 return ret;
918}
919
920#else
921
922template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
923 typename M=std::enable_if_t<std::is_integral<N>::value &&
924 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
925 std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
926T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
927{
928 if (n <= 0) { return 0; }
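    // Note: this generic fallback implements the single-pass scan by hand. Each
    // block scans its chunks with warp shuffle intrinsics, and the cross-block
    // exclusive prefix is resolved by the decoupled look-back loop below using
    // the BlockStatus records from namespace detail.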
929 constexpr int nwarps_per_block = 4;
930 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
931 constexpr int nchunks = 12;
932 constexpr int nelms_per_block = nthreads * nchunks;
933 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
934 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
935 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
936 auto stream = Gpu::gpuStream();
937
938 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
939 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
940
941 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
942 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
943 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
944 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
945 + nbytes_blockid
946 + nbytes_totalsum));
947 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
948 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
949 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
950
951 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
952 BlockStatusT& block_status = block_status_p[i];
953 block_status.set_status('x');
954 if (i == 0) {
955 *virtual_block_id_p = 0;
956 *totalsum_p = 0;
957 }
958 });
959
960 amrex::launch(nblocks, nthreads, sm, stream,
961 [=] AMREX_GPU_DEVICE () noexcept
962 {
963 int lane = threadIdx.x % Gpu::Device::warp_size;
964 int warp = threadIdx.x / Gpu::Device::warp_size;
965 int nwarps = nthreads / Gpu::Device::warp_size;
966
967 Gpu::SharedMemory<T> gsm;
968 T* shared = gsm.dataPtr();
969 T* shared2 = shared + Gpu::Device::warp_size;
970
971 // First of all, get block virtual id. We must do this to
972 // avoid deadlock because CUDA may launch blocks in any order.
973 // Anywhere in this function, we should not use blockIdx.
974 int virtual_block_id = 0;
975 if (gridDim.x > 1) {
976 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
977 if (threadIdx.x == 0) {
978 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
979 virtual_block_id_shared = bid;
980 }
981 __syncthreads();
982 virtual_block_id = virtual_block_id_shared;
983 }
984
985 // Each block processes [ibegin,iend).
986 N ibegin = nelms_per_block * virtual_block_id;
987 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
988 BlockStatusT& block_status = block_status_p[virtual_block_id];
989
990 //
991 // The overall algorithm is based on "Single-pass Parallel
992 // Prefix Scan with Decoupled Look-back" by D. Merrill &
993 // M. Garland.
994 //
995
996 // Each block is responsible for nchunks chunks of data,
997 // where each chunk has blockDim.x elements, one for each
998 // thread in the block.
999 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
1000 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
1001 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1002 N offset = ibegin + ichunk*nthreads;
1003 if (offset >= iend) { break; }
1004
1005 offset += threadIdx.x;
1006 T x0 = (offset < iend) ? fin(offset) : 0;
1007 if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
1008 *totalsum_p += x0;
1009 }
1010 T x = x0;
1011 // Scan within a warp
1012 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
1013 AMREX_HIP_OR_CUDA( T s = __shfl_up(x,i);,
1014 T s = __shfl_up_sync(0xffffffff, x, i); )
1015 if (lane >= i) { x += s; }
1016 }
1017
1018 // x now holds the inclusive sum within the warp. The
1019 // last thread in each warp holds the inclusive sum of
1020 // this warp. We will store it in shared memory.
1021 if (lane == Gpu::Device::warp_size - 1) {
1022 shared[warp] = x;
1023 }
1024
1025 __syncthreads();
1026
1027 // The first warp will do scan on the warp sums for the
1028 // whole block. Not all threads in the warp need to
1029 // participate.
1030#ifdef AMREX_USE_CUDA
1031 if (warp == 0 && lane < nwarps) {
1032 T y = shared[lane];
1033 int mask = (1 << nwarps) - 1;
1034 for (int i = 1; i <= nwarps; i *= 2) {
1035 T s = __shfl_up_sync(mask, y, i, nwarps);
1036 if (lane >= i) { y += s; }
1037 }
1038 shared2[lane] = y;
1039 }
1040#else
1041 if (warp == 0) {
1042 T y = 0;
1043 if (lane < nwarps) {
1044 y = shared[lane];
1045 }
1046 for (int i = 1; i <= nwarps; i *= 2) {
1047 T s = __shfl_up(y, i, nwarps);
1048 if (lane >= i) { y += s; }
1049 }
1050 if (lane < nwarps) {
1051 shared2[lane] = y;
1052 }
1053 }
1054#endif
1055
1056 __syncthreads();
1057
1058 // shared[0:nwarps) holds the inclusive sum of warp sums.
1059
1060 // Also note x still holds the inclusive sum within the
1061 // warp. Given these two, we can compute the inclusive
1062 // sum within this chunk.
1063 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
1064 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
1065 (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
1066 sum_prev_chunk += shared2[nwarps-1];
1067 }
1068
1069 // sum_prev_chunk now holds the sum of the whole block.
1070 if (threadIdx.x == 0 && gridDim.x > 1) {
1071 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
1072 sum_prev_chunk);
1073 }
1074
1075 if (virtual_block_id == 0) {
1076 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1077 N offset = ibegin + ichunk*nthreads + threadIdx.x;
1078 if (offset >= iend) { break; }
1079 fout(offset, tmp_out[ichunk]);
1080 if (offset == n-1) {
1081 *totalsum_p += tmp_out[ichunk];
1082 }
1083 }
1084 } else if (virtual_block_id > 0) {
1085
1086 if (warp == 0) {
1087 T exclusive_prefix = 0;
1088 BlockStatusT volatile* pbs = block_status_p;
1089 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
1090 {
1091 int iblock = iblock0-lane;
1092 detail::STVA<T> stva{'p', 0};
1093 if (iblock >= 0) {
1094 stva = pbs[iblock].wait();
1095 }
1096
1097 T x = stva.value;
1098
1099 AMREX_HIP_OR_CUDA( uint64_t const status_bf = __ballot(stva.status == 'p');,
1100 unsigned const status_bf = __ballot_sync(0xffffffff, stva.status == 'p'));
1101 bool stop_lookback = status_bf & 0x1u;
1102 if (stop_lookback == false) {
1103 if (status_bf != 0) {
1104 T y = x;
1105 if (lane > 0) { x = 0; }
1106 AMREX_HIP_OR_CUDA(uint64_t bit_mask = 0x1ull;,
1107 unsigned bit_mask = 0x1u);
1108 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
1109 bit_mask <<= 1;
1110 if (i == lane) { x = y; }
1111 if (status_bf & bit_mask) {
1112 stop_lookback = true;
1113 break;
1114 }
1115 }
1116 }
1117
1118 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
1119 AMREX_HIP_OR_CUDA( x += __shfl_down(x,i);,
1120 x += __shfl_down_sync(0xffffffff, x, i); )
1121 }
1122 }
1123
1124 if (lane == 0) { exclusive_prefix += x; }
1125 if (stop_lookback) { break; }
1126 }
1127
1128 if (lane == 0) {
1129 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
1130 shared[0] = exclusive_prefix;
1131 }
1132 }
1133
1134 __syncthreads();
1135
1136 T exclusive_prefix = shared[0];
1137
1138 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1139 N offset = ibegin + ichunk*nthreads + threadIdx.x;
1140 if (offset >= iend) { break; }
1141 T t = tmp_out[ichunk] + exclusive_prefix;
1142 fout(offset, t);
1143 if (offset == n-1) {
1144 *totalsum_p += t;
1145 }
1146 }
1147 }
1148 });
1149
1150 T totalsum = 0;
1151 if (a_ret_sum) {
1152 // xxxxx CUDA < 11 todo: Should test if using pinned memory and thus
1153 // avoiding memcpy is faster.
1154 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
1155 }
1156 Gpu::streamSynchronize();
1157 The_Arena()->free(dp);
1158
1159 AMREX_GPU_ERROR_CHECK();
1160
1161 return totalsum;
1162}
1163
1164#endif
1165
1166// The return value is the total sum if a_ret_sum is true.
1167template <typename N, typename T, typename M=std::enable_if_t<std::is_integral<N>::value> >
1168T InclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1169{
1170#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1171 void* d_temp = nullptr;
1172 std::size_t temp_bytes = 0;
1173 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1174 Gpu::gpuStream()));
1175 d_temp = The_Arena()->alloc(temp_bytes);
1176 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1177 Gpu::gpuStream()));
1178 T totalsum = 0;
1179 if (a_ret_sum) {
1180 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1181 }
1182 Gpu::streamSynchronize();
1183 The_Arena()->free(d_temp);
1184 AMREX_GPU_ERROR_CHECK();
1185 return totalsum;
1186#elif defined(AMREX_USE_HIP)
1187 void* d_temp = nullptr;
1188 std::size_t temp_bytes = 0;
1189 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1190 rocprim::plus<T>(), Gpu::gpuStream()));
1191 d_temp = The_Arena()->alloc(temp_bytes);
1192 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1193 rocprim::plus<T>(), Gpu::gpuStream()));
1194 T totalsum = 0;
1195 if (a_ret_sum) {
1196 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1197 }
1198 Gpu::streamSynchronize();
1199 The_Arena()->free(d_temp);
1200 AMREX_GPU_ERROR_CHECK();
1201 return totalsum;
1202#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1203 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1204 std::inclusive_scan(policy, in, in+n, out, std::plus<T>(), T(0));
1205 T totalsum = 0;
1206 if (a_ret_sum) {
1207 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1208 }
1209 Gpu::streamSynchronize();
1210 AMREX_GPU_ERROR_CHECK();
1211 return totalsum;
1212#else
1213 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1214 return PrefixSum<T>(static_cast<int>(n),
1215 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1216 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1217 Type::inclusive, a_ret_sum);
1218 } else {
1219 return PrefixSum<T>(n,
1220 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1221 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1222 Type::inclusive, a_ret_sum);
1223 }
1224#endif
1225}
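// Example (sketch): scanning a device array directly, where d_in and d_out
// stand for hypothetical device pointers to Long with n valid elements:
//
//     Long total = amrex::Scan::InclusiveSum(n, d_in, d_out);
//     amrex::Scan::InclusiveSum(n, d_in, d_out, amrex::Scan::noRetSum); // skip the D2H copy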
1226
1227// The return value is the total sum if a_ret_sum is true.
1228template <typename N, typename T, typename M=std::enable_if_t<std::is_integral<N>::value> >
1229T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1230{
1231 if (n <= 0) { return 0; }
1232#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1233 T in_last = 0;
1234 if (a_ret_sum) {
1235 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1236 }
1237 void* d_temp = nullptr;
1238 std::size_t temp_bytes = 0;
1239 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1240 Gpu::gpuStream()));
1241 d_temp = The_Arena()->alloc(temp_bytes);
1242 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1243 Gpu::gpuStream()));
1244 T out_last = 0;
1245 if (a_ret_sum) {
1246 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1247 }
1248 Gpu::streamSynchronize();
1249 The_Arena()->free(d_temp);
1250 AMREX_GPU_ERROR_CHECK();
1251 return in_last+out_last;
1252#elif defined(AMREX_USE_HIP)
1253 T in_last = 0;
1254 if (a_ret_sum) {
1255 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1256 }
1257 void* d_temp = nullptr;
1258 std::size_t temp_bytes = 0;
1259 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1260 rocprim::plus<T>(), Gpu::gpuStream()));
1261 d_temp = The_Arena()->alloc(temp_bytes);
1262 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1263 rocprim::plus<T>(), Gpu::gpuStream()));
1264 T out_last = 0;
1265 if (a_ret_sum) {
1266 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1267 }
1268 Gpu::streamSynchronize();
1269 The_Arena()->free(d_temp);
1270 AMREX_GPU_ERROR_CHECK();
1271 return in_last+out_last;
1272#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1273 T in_last = 0;
1274 if (a_ret_sum) {
1275 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1276 }
1277 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1278 std::exclusive_scan(policy, in, in+n, out, T(0), std::plus<T>());
1279 T out_last = 0;
1280 if (a_ret_sum) {
1281 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1282 }
1283 Gpu::streamSynchronize();
1284 AMREX_GPU_ERROR_CHECK();
1285 return in_last+out_last;
1286#else
1287 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1288 return PrefixSum<T>(static_cast<int>(n),
1289 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1290 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1291 Type::exclusive, a_ret_sum);
1292 } else {
1293 return PrefixSum<T>(n,
1294 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1295 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1296 Type::exclusive, a_ret_sum);
1297 }
1298#endif
1299}
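// Example (sketch): for an exclusive scan, out[0] becomes 0 and the return
// value equals in[n-1] + out[n-1], i.e. the sum of all n inputs; d_in and
// d_out are hypothetical device pointers to Long:
//
//     Long total = amrex::Scan::ExclusiveSum(n, d_in, d_out);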
1300
1301#else
1302// !defined(AMREX_USE_GPU)
1303template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
1304 typename M=std::enable_if_t<std::is_integral_v<N> &&
1305 (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ||
1306 std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>)> >
1307T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum)
1308{
1309 if (n <= 0) { return 0; }
1310 T totalsum = 0;
1311 for (N i = 0; i < n; ++i) {
1312 T x = fin(i);
1313 T y = totalsum;
1314 totalsum += x;
1315 AMREX_IF_CONSTEXPR (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive>) {
1316 y += x;
1317 }
1318 fout(i, y);
1319 }
1320 return totalsum;
1321}
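// Worked example: if fin yields {3, 1, 4}, the exclusive variant passes
// {0, 3, 4} to fout and the inclusive variant passes {3, 4, 8}; both return
// the total sum 8.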
1322
1323// The return value is the total sum.
1324template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1325T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1326{
1327#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1328 // GCC's __cplusplus is not a reliable indication for C++17 support
1329 std::inclusive_scan(in, in+n, out);
1330#else
1331 std::partial_sum(in, in+n, out);
1332#endif
1333 return (n > 0) ? out[n-1] : T(0);
1334}
1335
1336// The return value is the total sum.
1337template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1338T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1339{
1340 if (n <= 0) { return 0; }
1341
1342 auto in_last = in[n-1];
1343#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1344 // GCC's __cplusplus is not a reliable indication for C++17 support
1345 std::exclusive_scan(in, in+n, out, 0);
1346#else
1347 out[0] = 0;
1348 std::partial_sum(in, in+n-1, out+1);
1349#endif
1350 return in_last + out[n-1];
1351}
1352
1353#endif
1354
1355}
1356
1357namespace Gpu
1358{
1359 template<class InIter, class OutIter>
1360 OutIter inclusive_scan (InIter begin, InIter end, OutIter result)
1361 {
1362#if defined(AMREX_USE_GPU)
1363 auto N = std::distance(begin, end);
1364 Scan::InclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1365 OutIter result_end = result;
1366 std::advance(result_end, N);
1367 return result_end;
1368#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1369 // GCC's __cplusplus is not a reliable indication for C++17 support
1370 return std::inclusive_scan(begin, end, result);
1371#else
1372 return std::partial_sum(begin, end, result);
1373#endif
1374 }
1375
1376 template<class InIter, class OutIter>
1377 OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
1378 {
1379#if defined(AMREX_USE_GPU)
1380 auto N = std::distance(begin, end);
1381 Scan::ExclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1382 OutIter result_end = result;
1383 std::advance(result_end, N);
1384 return result_end;
1385#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1386 // GCC's __cplusplus is not a reliable indication for C++17 support
1387 return std::exclusive_scan(begin, end, result, 0);
1388#else
1389 if (begin == end) { return result; }
1390
1391 typename std::iterator_traits<InIter>::value_type sum = *begin;
1392 *result++ = sum - *begin;
1393
1394 while (++begin != end) {
1395 sum = std::move(sum) + *begin;
1396 *result++ = sum - *begin;
1397 }
1398 return ++result;
1399#endif
1400 }
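    // Example (sketch): the iterator interface mirrors the corresponding std::
    // algorithms; assuming v_in and v_out are amrex::Gpu::DeviceVector<int> of
    // equal size:
    //
    //     Gpu::inclusive_scan(v_in.begin(), v_in.end(), v_out.begin());
    //     Gpu::exclusive_scan(v_in.begin(), v_in.end(), v_out.begin());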
1401
1402}}
1403
1404#endif