Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_Scan.H
Go to the documentation of this file.
1#ifndef AMREX_SCAN_H_
2#define AMREX_SCAN_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Extension.H>
6#include <AMReX_Gpu.H>
7#include <AMReX_Arena.H>
8
9#if defined(AMREX_USE_CUDA)
10# include <cub/cub.cuh>
11# ifdef AMREX_CUDA_CCCL_VER_GE_2_8
12# include <thrust/iterator/transform_iterator.h>
13# endif
14#elif defined(AMREX_USE_HIP)
15# include <rocprim/rocprim.hpp>
16#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
17# include <oneapi/dpl/execution>
18# include <oneapi/dpl/numeric>
19#endif
20
21#include <concepts>
22#include <cstdint>
23#include <numeric>
24#include <type_traits>
25
26namespace amrex {
27namespace Scan {
28
29struct RetSum {
30 bool flag = true;
31 explicit operator bool() const noexcept { return flag; }
32};
33static constexpr RetSum retSum{true};
34static constexpr RetSum noRetSum{false};
35
36namespace Type {
37 static constexpr struct Inclusive {} inclusive{};
38 static constexpr struct Exclusive {} exclusive{};
39}
40
41#if defined(AMREX_USE_GPU)
42
44namespace detail {
45
46template <typename T>
47struct STVA
48{
49 char status;
50 T value;
51};
52
53template <typename T, bool SINGLE_WORD> struct BlockStatus {};
54
55template <typename T>
56struct BlockStatus<T, true>
57{
58 template<typename U>
59 union Data {
60 STVA<U> s;
61 uint64_t i;
62 void operator=(Data<U> const&) = delete;
63 void operator=(Data<U> &&) = delete;
64 };
65 Data<T> d;
66
68 void write (char a_status, T a_value) {
69#if defined(AMREX_USE_CUDA)
70 volatile uint64_t tmp;
71 reinterpret_cast<STVA<T> volatile&>(tmp).status = a_status;
72 reinterpret_cast<STVA<T> volatile&>(tmp).value = a_value;
73 reinterpret_cast<uint64_t&>(d.s) = tmp;
74#else
75 Data<T> tmp;
76 tmp.s = {a_status, a_value};
77 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
78 "HIP/SYCL: unsigned long long must be 64 bits");
79 Gpu::Atomic::Exch(reinterpret_cast<unsigned long long*>(&d),
80 reinterpret_cast<unsigned long long&>(tmp));
81#endif
82 }
83
85 T get_aggregate() const { return d.s.value; }
86
88 STVA<T> read () volatile {
89#if defined(AMREX_USE_CUDA)
90 volatile uint64_t tmp = reinterpret_cast<uint64_t volatile&>(d);
91 return {reinterpret_cast<STVA<T> volatile&>(tmp).status,
92 reinterpret_cast<STVA<T> volatile&>(tmp).value };
93#else
94 static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
95 "HIP/SYCL: unsigned long long must be 64 bits");
96 unsigned long long tmp = Gpu::Atomic::Add
97 (reinterpret_cast<unsigned long long*>(const_cast<Data<T>*>(&d)), 0ull);
98 return (*reinterpret_cast<Data<T>*>(&tmp)).s;
99#endif
100 }
101
103 void set_status (char a_status) { d.s.status = a_status; }
104
106 STVA<T> wait () volatile {
107 STVA<T> r;
108 do {
109#if defined(AMREX_USE_SYCL)
110 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
111#else
112 __threadfence_block();
113#endif
114 r = read();
115 } while (r.status == 'x');
116 return r;
117 }
118};
119
120template <typename T>
121struct BlockStatus<T, false>
122{
123 T aggregate;
124 T inclusive;
125 char status;
126
128 void write (char a_status, T a_value) {
129 if (a_status == 'a') {
130 aggregate = a_value;
131 } else {
132 inclusive = a_value;
133 }
134#if defined(AMREX_USE_SYCL)
135 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
136#else
137 __threadfence();
138#endif
139 status = a_status;
140 }
141
143 T get_aggregate() const { return aggregate; }
144
146 STVA<T> read () volatile {
147#if defined(AMREX_USE_SYCL)
148 constexpr auto mo = sycl::memory_order::relaxed;
149 constexpr auto ms = sycl::memory_scope::device;
150 constexpr auto as = sycl::access::address_space::global_space;
151#endif
152 if (status == 'x') {
153 return {'x', 0};
154 } else if (status == 'a') {
155#if defined(AMREX_USE_SYCL)
156 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(aggregate)};
157 return {'a', ar.load()};
158#else
159 return {'a', aggregate};
160#endif
161 } else {
162#if defined(AMREX_USE_SYCL)
163 sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(inclusive)};
164 return {'p', ar.load()};
165#else
166 return {'p', inclusive};
167#endif
168 }
169 }
170
172 void set_status (char a_status) { status = a_status; }
173
175 STVA<T> wait () volatile {
176 STVA<T> r;
177 do {
178 r = read();
179#if defined(AMREX_USE_SYCL)
180 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
181#else
182 __threadfence();
183#endif
184 } while (r.status == 'x');
185 return r;
186 }
187};
188
189}
191
192#if defined(AMREX_USE_SYCL)
193
194#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
195template <typename T, std::integral N, typename FIN, typename FOUT, typename TYPE>
196T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
197{
198 if (n <= 0) { return 0; }
199 constexpr int nwarps_per_block = 8;
200 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
201 constexpr int nchunks = 12;
202 constexpr int nelms_per_block = nthreads * nchunks;
203 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
204 std::numeric_limits<int>::max())*nelms_per_block);
205 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
206 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block);
207 auto stream = Gpu::gpuStream();
208
209 std::size_t nbytes_blockresult = Arena::align(sizeof(T)*n);
210 std::size_t nbytes_blocksum = Arena::align(sizeof(T)*nblocks);
211 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
212 auto dp = (char*)(The_Arena()->alloc(nbytes_blockresult
213 + nbytes_blocksum
214 + nbytes_totalsum));
215 T* blockresult_p = (T*)dp;
216 T* blocksum_p = (T*)(dp + nbytes_blockresult);
217 T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
218
219 amrex::launch<nthreads>(nblocks, sm, stream,
220 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
221 {
222 sycl::sub_group const& sg = gh.item->get_sub_group();
223 int lane = sg.get_local_id()[0];
224 int warp = sg.get_group_id()[0];
225 int nwarps = sg.get_group_range()[0];
226
227 int threadIdxx = gh.item->get_local_id(0);
228 int blockIdxx = gh.item->get_group_linear_id();
229 int blockDimx = gh.item->get_local_range(0);
230
231 T* shared = (T*)(gh.local);
232 T* shared2 = shared + Gpu::Device::warp_size;
233
234 // Each block processes [ibegin,iend).
235 N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
236 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
237
238 // Each block is responsible for nchunks chunks of data,
239 // where each chunk has blockDim.x elements, one for each
240 // thread in the block.
241 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
242 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
243 N offset = ibegin + ichunk*blockDimx;
244 if (offset >= iend) { break; }
245
246 offset += threadIdxx;
247 T x0 = (offset < iend) ? fin(offset) : 0;
248 T x = x0;
249 // Scan within a warp
250 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
251 T s = sycl::shift_group_right(sg, x, i);
252 if (lane >= i) { x += s; }
253 }
254
255 // x now holds the inclusive sum within the warp. The
256 // last thread in each warp holds the inclusive sum of
257 // this warp. We will store it in shared memory.
258 if (lane == Gpu::Device::warp_size - 1) {
259 shared[warp] = x;
260 }
261
262 gh.item->barrier(sycl::access::fence_space::local_space);
263
264 // The first warp will do scan on the warp sums for the
265 // whole block.
266 if (warp == 0) {
267 T y = (lane < nwarps) ? shared[lane] : 0;
268 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
269 T s = sycl::shift_group_right(sg, y, i);
270 if (lane >= i) { y += s; }
271 }
272
273 if (lane < nwarps) { shared2[lane] = y; }
274 }
275
276 gh.item->barrier(sycl::access::fence_space::local_space);
277
278 // shared[0:nwarps) holds the inclusive sum of warp sums.
279
280 // Also note x still holds the inclusive sum within the
281 // warp. Given these two, we can compute the inclusive
282 // sum within this chunk.
283 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
284 T tmp_out = sum_prev_warp + sum_prev_chunk +
285 (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ? x : x-x0);
286 sum_prev_chunk += shared2[nwarps-1];
287
288 if (offset < iend) {
289 blockresult_p[offset] = tmp_out;
290 }
291 }
292
293 // sum_prev_chunk now holds the sum of the whole block.
294 if (threadIdxx == 0) {
295 blocksum_p[blockIdxx] = sum_prev_chunk;
296 }
297 });
298
299 amrex::launch<nthreads>(1, sm, stream,
300 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
301 {
302 sycl::sub_group const& sg = gh.item->get_sub_group();
303 int lane = sg.get_local_id()[0];
304 int warp = sg.get_group_id()[0];
305 int nwarps = sg.get_group_range()[0];
306
307 int threadIdxx = gh.item->get_local_id(0);
308 int blockDimx = gh.item->get_local_range(0);
309
310 T* shared = (T*)(gh.local);
311 T* shared2 = shared + Gpu::Device::warp_size;
312
313 T sum_prev_chunk = 0;
314 for (int offset = threadIdxx; offset - threadIdxx < nblocks; offset += blockDimx) {
315 T x = (offset < nblocks) ? blocksum_p[offset] : 0;
316 // Scan within a warp
317 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
318 T s = sycl::shift_group_right(sg, x, i);
319 if (lane >= i) { x += s; }
320 }
321
322 // x now holds the inclusive sum within the warp. The
323 // last thread in each warp holds the inclusive sum of
324 // this warp. We will store it in shared memory.
325 if (lane == Gpu::Device::warp_size - 1) {
326 shared[warp] = x;
327 }
328
329 gh.item->barrier(sycl::access::fence_space::local_space);
330
331 // The first warp will do scan on the warp sums for the
332 // whole block.
333 if (warp == 0) {
334 T y = (lane < nwarps) ? shared[lane] : 0;
335 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
336 T s = sycl::shift_group_right(sg, y, i);
337 if (lane >= i) { y += s; }
338 }
339
340 if (lane < nwarps) { shared2[lane] = y; }
341 }
342
343 gh.item->barrier(sycl::access::fence_space::local_space);
344
345 // shared[0:nwarps) holds the inclusive sum of warp sums.
346
347 // Also note x still holds the inclusive sum within the
348 // warp. Given these two, we can compute the inclusive
349 // sum within this chunk.
350 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
351 T tmp_out = sum_prev_warp + sum_prev_chunk + x;
352 sum_prev_chunk += shared2[nwarps-1];
353
354 if (offset < nblocks) {
355 blocksum_p[offset] = tmp_out;
356 }
357 }
358
359 // sum_prev_chunk now holds the total sum.
360 if (threadIdxx == 0) {
361 *totalsum_p = sum_prev_chunk;
362 }
363 });
364
365 amrex::launch<nthreads>(nblocks, 0, stream,
366 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
367 {
368 int threadIdxx = gh.item->get_local_id(0);
369 int blockIdxx = gh.item->get_group_linear_id();
370 int blockDimx = gh.item->get_local_range(0);
371
372 // Each block processes [ibegin,iend).
373 N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
374 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
375 T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
376 for (N offset = ibegin + threadIdxx; offset < iend; offset += blockDimx) {
377 fout(offset, prev_sum + blockresult_p[offset]);
378 }
379 });
380
381 T totalsum = 0;
382 if (a_ret_sum) {
383 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
384
386 The_Arena()->free(dp);
387
389 } else {
391 }
392
393 return totalsum;
394}
395#endif
396
397template <typename T, std::integral N, typename FIN, typename FOUT, typename TYPE>
398requires (std::same_as<std::decay_t<TYPE>,Type::Inclusive> ||
399 std::same_as<std::decay_t<TYPE>,Type::Exclusive>)
400T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum)
401{
402 if (n <= 0) { return 0; }
403 constexpr int nwarps_per_block = 8;
404 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
405 constexpr int nchunks = 12;
406 constexpr int nelms_per_block = nthreads * nchunks;
407 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
408 std::numeric_limits<int>::max())*nelms_per_block);
409 int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
410
411#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
412 if (nblocks > 1) {
413 return PrefixSum_mp<T>(n, std::forward<FIN>(fin), std::forward<FOUT>(fout), type, a_ret_sum);
414 }
415#endif
416
417 std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
418 auto stream = Gpu::gpuStream();
419
420 using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
421 detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
422
423 std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
424 std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
425 std::size_t nbytes_totalsum = Arena::align(sizeof(T));
426 auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
427 + nbytes_blockid
428 + nbytes_totalsum));
429 BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
430 unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
431 T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
432
433 amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
434 BlockStatusT& block_status = block_status_p[i];
435 block_status.set_status('x');
436 if (i == 0) {
437 *virtual_block_id_p = 0;
438 *totalsum_p = 0;
439 }
440 });
441
442 amrex::launch<nthreads>(nblocks, sm, stream,
443 [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
444 {
445 sycl::sub_group const& sg = gh.item->get_sub_group();
446 int lane = sg.get_local_id()[0];
447 int warp = sg.get_group_id()[0];
448 int nwarps = sg.get_group_range()[0];
449
450 int threadIdxx = gh.item->get_local_id(0);
451 int blockDimx = gh.item->get_local_range(0);
452 int gridDimx = gh.item->get_group_range(0);
453
454 T* shared = (T*)(gh.local);
455 T* shared2 = shared + Gpu::Device::warp_size;
456
457 // First of all, get block virtual id. We must do this to
458 // avoid deadlock because blocks may be launched in any order.
459 // Anywhere in this function, we should not use blockIdx.
460 int virtual_block_id = 0;
461 if (gridDimx > 1) {
462 int& virtual_block_id_shared = *((int*)(shared2+nwarps));
463 if (threadIdxx == 0) {
464 unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
465 virtual_block_id_shared = bid;
466 }
467 gh.item->barrier(sycl::access::fence_space::local_space);
468 virtual_block_id = virtual_block_id_shared;
469 }
470
471 // Each block processes [ibegin,iend).
472 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
473 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
474 BlockStatusT& block_status = block_status_p[virtual_block_id];
475
476 //
477 // The overall algorithm is based on "Single-pass Parallel
478 // Prefix Scan with Decoupled Look-back" by D. Merrill &
479 // M. Garland.
480 //
481
482 // Each block is responsible for nchunks chunks of data,
483 // where each chunk has blockDim.x elements, one for each
484 // thread in the block.
485 T sum_prev_chunk = 0; // inclusive sum from previous chunks.
486 T tmp_out[nchunks]; // block-wide inclusive sum for chunks
487 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
488 N offset = ibegin + ichunk*blockDimx;
489 if (offset >= iend) { break; }
490
491 offset += threadIdxx;
492 T x0 = (offset < iend) ? fin(offset) : 0;
493 if (std::is_same_v<std::decay_t<TYPE>,Type::Exclusive> && offset == n-1) {
494 *totalsum_p += x0;
495 }
496 T x = x0;
497 // Scan within a warp
498 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
499 T s = sycl::shift_group_right(sg, x, i);
500 if (lane >= i) { x += s; }
501 }
502
503 // x now holds the inclusive sum within the warp. The
504 // last thread in each warp holds the inclusive sum of
505 // this warp. We will store it in shared memory.
506 if (lane == Gpu::Device::warp_size - 1) {
507 shared[warp] = x;
508 }
509
510 gh.item->barrier(sycl::access::fence_space::local_space);
511
512 // The first warp will do scan on the warp sums for the
513 // whole block.
514 if (warp == 0) {
515 T y = (lane < nwarps) ? shared[lane] : 0;
516 for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
517 T s = sycl::shift_group_right(sg, y, i);
518 if (lane >= i) { y += s; }
519 }
520
521 if (lane < nwarps) { shared2[lane] = y; }
522 }
523
524 gh.item->barrier(sycl::access::fence_space::local_space);
525
526 // shared[0:nwarps) holds the inclusive sum of warp sums.
527
528 // Also note x still holds the inclusive sum within the
529 // warp. Given these two, we can compute the inclusive
530 // sum within this chunk.
531 T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
532 tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
533 (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ? x : x-x0);
534 sum_prev_chunk += shared2[nwarps-1];
535 }
536
537 // sum_prev_chunk now holds the sum of the whole block.
538 if (threadIdxx == 0 && gridDimx > 1) {
539 block_status.write((virtual_block_id == 0) ? 'p' : 'a',
540 sum_prev_chunk);
541 }
542
543 if (virtual_block_id == 0) {
544 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
545 N offset = ibegin + ichunk*blockDimx + threadIdxx;
546 if (offset >= iend) { break; }
547 fout(offset, tmp_out[ichunk]);
548 if (offset == n-1) {
549 *totalsum_p += tmp_out[ichunk];
550 }
551 }
552 } else if (virtual_block_id > 0) {
553
554 if (warp == 0) {
555 T exclusive_prefix = 0;
556 BlockStatusT volatile* pbs = block_status_p;
557 for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
558 {
559 int iblock = iblock0-lane;
560 detail::STVA<T> stva{'p', 0};
561 if (iblock >= 0) {
562 stva = pbs[iblock].wait();
563 }
564
565 T x = stva.value;
566
567 // implement our own __ballot
568 unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
569 for (int i = 1; i < Gpu::Device::warp_size; i *= 2) {
570 status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);
571 }
572
573 bool stop_lookback = status_bf & 0x1u;
574 if (stop_lookback == false) {
575 if (status_bf != 0) {
576 T y = x;
577 if (lane > 0) { x = 0; }
578 unsigned int bit_mask = 0x1u;
579 for (int i = 1; i < Gpu::Device::warp_size; ++i) {
580 bit_mask <<= 1;
581 if (i == lane) { x = y; }
582 if (status_bf & bit_mask) {
583 stop_lookback = true;
584 break;
585 }
586 }
587 }
588
589 for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
590 x += sycl::shift_group_left(sg, x,i);
591 }
592 }
593
594 if (lane == 0) { exclusive_prefix += x; }
595 if (stop_lookback) { break; }
596 }
597
598 if (lane == 0) {
599 block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
600 shared[0] = exclusive_prefix;
601 }
602 }
603
604 gh.item->barrier(sycl::access::fence_space::local_space);
605
606 T exclusive_prefix = shared[0];
607
608 for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
609 N offset = ibegin + ichunk*blockDimx + threadIdxx;
610 if (offset >= iend) { break; }
611 T t = tmp_out[ichunk] + exclusive_prefix;
612 fout(offset, t);
613 if (offset == n-1) {
614 *totalsum_p += t;
615 }
616 }
617 }
618 });
619
620 T totalsum = 0;
621 if (a_ret_sum) {
622 // xxxxx SYCL todo: Should test if using pinned memory and thus
623 // avoiding memcpy is faster.
624 Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
625
627 The_Arena()->free(dp);
628
630 } else {
632 }
633
634 return totalsum;
635}
636
637#elif defined(AMREX_USE_HIP)
638
639template <typename T, std::integral N, typename FIN, typename FOUT, typename TYPE>
640requires (std::same_as<std::decay_t<TYPE>,Type::Inclusive> ||
641 std::same_as<std::decay_t<TYPE>,Type::Exclusive>)
642T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
643{
644 if (n <= 0) { return 0; }
645 constexpr int nwarps_per_block = 4;
646 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
647 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
648 constexpr int nelms_per_block = nthreads * nelms_per_thread;
649 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
650 std::numeric_limits<int>::max())*nelms_per_block);
651 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
652 std::size_t sm = 0;
653 auto stream = Gpu::gpuStream();
654
655 using ScanTileState = rocprim::detail::lookback_scan_state<T>;
656 using OrderedBlockId = rocprim::detail::ordered_block_id<unsigned int>;
657
658#if (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6)) || \
659 (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR == 6) && \
660 defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR == 0))
661
662 std::size_t nbytes_tile_state = rocprim::detail::align_size
663 (ScanTileState::get_storage_size(nblocks));
664 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
665
666 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
667
668 ScanTileState tile_state = ScanTileState::create(dp, nblocks);
669
670#else
671
672 std::size_t nbytes_tile_state;
673 AMREX_HIP_SAFE_CALL(ScanTileState::get_storage_size(nblocks, stream, nbytes_tile_state));
674 nbytes_tile_state = rocprim::detail::align_size(nbytes_tile_state);
675
676 std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
677
678 auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
679
680 ScanTileState tile_state;
681 AMREX_HIP_SAFE_CALL(ScanTileState::create(tile_state, dp, nblocks, stream));
682
683#endif
684
685 auto ordered_block_id = OrderedBlockId::create
686 (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
687
688 // Init ScanTileState on device
689 amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
690 {
691 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
692 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
693 const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
694 if (gid == 0) { scan_bid.reset(); }
695 scan_tile_state.initialize_prefix(gid, nblocks);
696 });
697
698 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
699
700 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
701 [=] AMREX_GPU_DEVICE () noexcept
702 {
703 using BlockLoad = rocprim::block_load<T, nthreads, nelms_per_thread,
704 rocprim::block_load_method::block_load_transpose>;
705 using BlockScan = rocprim::block_scan<T, nthreads,
706 rocprim::block_scan_algorithm::using_warp_scan>;
707 using BlockExchange = rocprim::block_exchange<T, nthreads, nelms_per_thread>;
708 using LookbackScanPrefixOp = rocprim::detail::lookback_scan_prefix_op
709 <T, rocprim::plus<T>, ScanTileState>;
710
711 __shared__ struct TempStorage {
712 typename OrderedBlockId::storage_type ordered_bid;
713 union {
714 typename BlockLoad::storage_type load;
715 typename BlockExchange::storage_type exchange;
716 typename BlockScan::storage_type scan;
717 };
718 } temp_storage;
719
720 // Lambda captured tile_state is const. We have to cast the const away.
721 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
722 auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
723
724 auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);
725
726 // Each block processes [ibegin,iend).
727 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
728 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
729
730 auto input_begin = rocprim::make_transform_iterator(
731 rocprim::make_counting_iterator(N(0)),
732 [&] (N i) -> T { return fin(i+ibegin); });
733
734 T data[nelms_per_thread];
735 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
736 BlockLoad().load(input_begin, data, temp_storage.load);
737 } else {
738 // padding with 0
739 BlockLoad().load(input_begin, data, iend-ibegin, 0, temp_storage.load);
740 }
741
742 __syncthreads();
743
744 constexpr bool is_exclusive = std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>;
745
746 if (virtual_block_id == 0) {
747 T block_agg;
748 AMREX_IF_CONSTEXPR(is_exclusive) {
749 BlockScan().exclusive_scan(data, data, T{0}, block_agg, temp_storage.scan);
750 } else {
751 BlockScan().inclusive_scan(data, data, block_agg, temp_storage.scan);
752 }
753 if (threadIdx.x == 0) {
754 if (nblocks > 1) {
755 scan_tile_state.set_complete(0, block_agg);
756 } else if (nblocks == 1 && totalsum_p) {
757 *totalsum_p = block_agg;
758 }
759 }
760 } else {
761 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
762
763 LookbackScanPrefixOp prefix_op(virtual_block_id, rocprim::plus<T>(), scan_tile_state);
764 AMREX_IF_CONSTEXPR(is_exclusive) {
765 BlockScan().exclusive_scan(data, data, temp_storage.scan, prefix_op,
766 rocprim::plus<T>());
767 } else {
768 BlockScan().inclusive_scan(data, data, temp_storage.scan, prefix_op,
769 rocprim::plus<T>());
770 }
771 if (totalsum_p) {
772 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
773 T tsum = data[nelms_per_thread-1];
774 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
775 *totalsum_p = tsum;
776 }
777 }
778 }
779
780 __syncthreads();
781
782 BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
783
784 for (int i = 0; i < nelms_per_thread; ++i) {
785 N offset = ibegin + i*nthreads + threadIdx.x;
786 if (offset < iend) { fout(offset, data[i]); }
787 }
788 });
789
790 if (totalsum_p) {
793
794 The_Arena()->free(dp);
795 } else {
797 }
798
799 T ret = (a_ret_sum) ? *totalsum_p : T(0);
800 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
801
802 return ret;
803}
804
805#elif defined(AMREX_USE_CUDA)
806
807template <typename T, std::integral N, typename FIN, typename FOUT, typename TYPE>
808requires (std::same_as<std::decay_t<TYPE>,Type::Inclusive> ||
809 std::same_as<std::decay_t<TYPE>,Type::Exclusive>)
810T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
811{
812 if (n <= 0) { return 0; }
813 constexpr int nwarps_per_block = 8;
814 constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
815 constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
816 constexpr int nelms_per_block = nthreads * nelms_per_thread;
817 AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(
818 std::numeric_limits<int>::max())*nelms_per_block);
819 int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
820 std::size_t sm = 0;
821 auto stream = Gpu::gpuStream();
822
823 using ScanTileState = cub::ScanTileState<T>;
824 std::size_t tile_state_size = 0;
825 ScanTileState::AllocationSize(nblocks, tile_state_size);
826
827 std::size_t nbytes_tile_state = Arena::align(tile_state_size);
828 auto tile_state_p = (char*)(The_Arena()->alloc(nbytes_tile_state));
829
830 ScanTileState tile_state;
831 tile_state.Init(nblocks, tile_state_p, tile_state_size); // Init ScanTileState on host
832
833 if (nblocks > 1) {
834 // Init ScanTileState on device
835 amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
836 {
837 const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
838 });
839 }
840
841 T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
842
843 amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
844 [=] AMREX_GPU_DEVICE () noexcept
845 {
846 using BlockLoad = cub::BlockLoad<T, nthreads, nelms_per_thread, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
847 using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
848 using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;
849
850#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
851 using Sum = cuda::std::plus<T>;
852#else
853 using Sum = cub::Sum;
854#endif
855 using TilePrefixCallbackOp = cub::TilePrefixCallbackOp<T, Sum, ScanTileState>;
856
857 __shared__ union TempStorage
858 {
859 typename BlockLoad::TempStorage load;
860 typename BlockExchange::TempStorage exchange;
861 struct ScanStorage {
862 typename BlockScan::TempStorage scan;
863 typename TilePrefixCallbackOp::TempStorage prefix;
864 } scan_storeage;
865 } temp_storage;
866
867 // Lambda captured tile_state is const. We have to cast the const away.
868 auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
869
870 int virtual_block_id = blockIdx.x;
871
872 // Each block processes [ibegin,iend).
873 N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
874 N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
875
876 auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
877#ifdef AMREX_CUDA_CCCL_VER_GE_2_8
878 thrust::transform_iterator<decltype(input_lambda),thrust::counting_iterator<N> >
879 input_begin(thrust::counting_iterator<N>(0), input_lambda);
880#else
881 cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
882 input_begin(cub::CountingInputIterator<N>(0), input_lambda);
883#endif
884
885 T data[nelms_per_thread];
886 if (static_cast<int>(iend-ibegin) == nelms_per_block) {
887 BlockLoad(temp_storage.load).Load(input_begin, data);
888 } else {
889 BlockLoad(temp_storage.load).Load(input_begin, data, iend-ibegin, 0); // padding with 0
890 }
891
892 __syncthreads();
893
894 constexpr bool is_exclusive = std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>;
895
896 if (virtual_block_id == 0) {
897 T block_agg;
898 AMREX_IF_CONSTEXPR(is_exclusive) {
899 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, block_agg);
900 } else {
901 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, block_agg);
902 }
903 if (threadIdx.x == 0) {
904 if (nblocks > 1) {
905 scan_tile_state.SetInclusive(0, block_agg);
906 } else if (nblocks == 1 && totalsum_p) {
907 *totalsum_p = block_agg;
908 }
909 }
910 } else {
911 T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
912
913 TilePrefixCallbackOp prefix_op(scan_tile_state, temp_storage.scan_storeage.prefix,
914 Sum{}, virtual_block_id);
915 AMREX_IF_CONSTEXPR(is_exclusive) {
916 BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, prefix_op);
917 } else {
918 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
919 }
920 if (totalsum_p) {
921 if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
922 T tsum = data[nelms_per_thread-1];
923 AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
924 *totalsum_p = tsum;
925 }
926 }
927 }
928
929 __syncthreads();
930
931 BlockExchange(temp_storage.exchange).BlockedToStriped(data);
932
933 for (int i = 0; i < nelms_per_thread; ++i) {
934 N offset = ibegin + i*nthreads + threadIdx.x;
935 if (offset < iend) { fout(offset, data[i]); }
936 }
937 });
938
939 if (totalsum_p) {
942
943 The_Arena()->free(tile_state_p);
944 } else {
945 Gpu::freeAsync(The_Arena(), tile_state_p);
946 }
947
948 T ret = (a_ret_sum) ? *totalsum_p : T(0);
949 if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
950
951 return ret;
952}
953
954#endif
955
967template <std::integral N, typename T >
968T InclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
969{
970 if (n <= 0) { return 0; }
971#if defined(AMREX_USE_CUDA)
972 void* d_temp = nullptr;
973 std::size_t temp_bytes = 0;
974 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
975 Gpu::gpuStream()));
976 d_temp = The_Arena()->alloc(temp_bytes);
977 AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
978 Gpu::gpuStream()));
979 T totalsum = 0;
980 if (a_ret_sum) {
981 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
982 }
984 The_Arena()->free(d_temp);
986 return totalsum;
987#elif defined(AMREX_USE_HIP)
988 void* d_temp = nullptr;
989 std::size_t temp_bytes = 0;
990 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
991 rocprim::plus<T>(), Gpu::gpuStream()));
992 d_temp = The_Arena()->alloc(temp_bytes);
993 AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
994 rocprim::plus<T>(), Gpu::gpuStream()));
995 T totalsum = 0;
996 if (a_ret_sum) {
997 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
998 }
1000 The_Arena()->free(d_temp);
1002 return totalsum;
1003#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1004 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1005 std::inclusive_scan(policy, in, in+n, out, std::plus<T>(), T(0));
1006 T totalsum = 0;
1007 if (a_ret_sum) {
1008 Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1009 }
1012 return totalsum;
1013#else
1014 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1015 return PrefixSum<T>(static_cast<int>(n),
1016 [=] AMREX_GPU_DEVICE (int i) -> T { return in[i]; },
1017 [=] AMREX_GPU_DEVICE (int i, T const& x) { out[i] = x; },
1018 Type::inclusive, a_ret_sum);
1019 } else {
1020 return PrefixSum<T>(n,
1021 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1022 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1023 Type::inclusive, a_ret_sum);
1024 }
1025#endif
1026}
1027
1039template <std::integral N, typename T >
1040T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1041{
1042 if (n <= 0) { return 0; }
1043#if defined(AMREX_USE_CUDA)
1044 T in_last = 0;
1045 if (a_ret_sum) {
1046 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1047 }
1048 void* d_temp = nullptr;
1049 std::size_t temp_bytes = 0;
1050 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1051 Gpu::gpuStream()));
1052 d_temp = The_Arena()->alloc(temp_bytes);
1053 AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1054 Gpu::gpuStream()));
1055 T out_last = 0;
1056 if (a_ret_sum) {
1057 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1058 }
1060 The_Arena()->free(d_temp);
1062 return in_last+out_last;
1063#elif defined(AMREX_USE_HIP)
1064 T in_last = 0;
1065 if (a_ret_sum) {
1066 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1067 }
1068 void* d_temp = nullptr;
1069 std::size_t temp_bytes = 0;
1070 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1071 rocprim::plus<T>(), Gpu::gpuStream()));
1072 d_temp = The_Arena()->alloc(temp_bytes);
1073 AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1074 rocprim::plus<T>(), Gpu::gpuStream()));
1075 T out_last = 0;
1076 if (a_ret_sum) {
1077 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1078 }
1080 The_Arena()->free(d_temp);
1082 return in_last+out_last;
1083#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1084 T in_last = 0;
1085 if (a_ret_sum) {
1086 Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1087 }
1088 auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1089 std::exclusive_scan(policy, in, in+n, out, T(0), std::plus<T>());
1090 T out_last = 0;
1091 if (a_ret_sum) {
1092 Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1093 }
1096 return in_last+out_last;
1097#else
1098 if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1099 return PrefixSum<T>(static_cast<int>(n),
1100 [=] AMREX_GPU_DEVICE (int i) -> T { return in[i]; },
1101 [=] AMREX_GPU_DEVICE (int i, T const& x) { out[i] = x; },
1102 Type::exclusive, a_ret_sum);
1103 } else {
1104 return PrefixSum<T>(n,
1105 [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1106 [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1107 Type::exclusive, a_ret_sum);
1108 }
1109#endif
1110}
1111
1112#else
1113// !defined(AMREX_USE_GPU)
1114template <typename T, std::integral N, typename FIN, typename FOUT, typename TYPE>
1115requires (std::same_as<std::decay_t<TYPE>,Type::Inclusive> ||
1116 std::same_as<std::decay_t<TYPE>,Type::Exclusive>)
1117T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum)
1118{
1119 if (n <= 0) { return 0; }
1120 T totalsum = 0;
1121 for (N i = 0; i < n; ++i) {
1122 T x = fin(i);
1123 T y = totalsum;
1124 totalsum += x;
1125 AMREX_IF_CONSTEXPR (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive>) {
1126 y += x;
1127 }
1128 fout(i, y);
1129 }
1130 return totalsum;
1131}
1132
1133// The return value is the total sum.
1134template <std::integral N, typename T >
1135T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1136{
1137#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1138 // GCC's __cplusplus is not a reliable indication for C++17 support
1139 std::inclusive_scan(in, in+n, out);
1140#else
1141 std::partial_sum(in, in+n, out);
1142#endif
1143 return (n > 0) ? out[n-1] : T(0);
1144}
1145
1146// The return value is the total sum.
1147template <std::integral N, typename T >
1148T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1149{
1150 if (n <= 0) { return 0; }
1151
1152 auto in_last = in[n-1];
1153#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1154 // GCC's __cplusplus is not a reliable indication for C++17 support
1155 std::exclusive_scan(in, in+n, out, 0);
1156#else
1157 out[0] = 0;
1158 std::partial_sum(in, in+n-1, out+1);
1159#endif
1160 return in_last + out[n-1];
1161}
1162
1163#endif
1164
1165}
1166
1167namespace Gpu
1168{
1170 template<class InIter, class OutIter>
1171 OutIter inclusive_scan (InIter begin, InIter end, OutIter result)
1172 {
1173#if defined(AMREX_USE_GPU)
1174 auto N = std::distance(begin, end);
1175 if (N <= 0) { return result; }
1176 Scan::InclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1177 OutIter result_end = result;
1178 std::advance(result_end, N);
1179 return result_end;
1180#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1181 // GCC's __cplusplus is not a reliable indication for C++17 support
1182 return std::inclusive_scan(begin, end, result);
1183#else
1184 return std::partial_sum(begin, end, result);
1185#endif
1186 }
1187
1189 template<class InIter, class OutIter>
1190 OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
1191 {
1192#if defined(AMREX_USE_GPU)
1193 auto N = std::distance(begin, end);
1194 if (N <= 0) { return result; }
1195 Scan::ExclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1196 OutIter result_end = result;
1197 std::advance(result_end, N);
1198 return result_end;
1199#elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1200 // GCC's __cplusplus is not a reliable indication for C++17 support
1201 return std::exclusive_scan(begin, end, result, 0);
1202#else
1203 if (begin == end) { return result; }
1204
1205 typename std::iterator_traits<InIter>::value_type sum = *begin;
1206 *result++ = sum - *begin;
1207
1208 while (++begin != end) {
1209 sum = std::move(sum) + *begin;
1210 *result++ = sum - *begin;
1211 }
1212 return ++result;
1213#endif
1214 }
1215
1216}}
1217
1218#endif
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_RESTRICT
Definition AMReX_Extension.H:32
#define AMREX_IF_CONSTEXPR
Definition AMReX_Extension.H:270
#define AMREX_GPU_SAFE_CALL(call)
Definition AMReX_GpuError.H:63
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
static std::size_t align(std::size_t sz)
Given a minimum required arena size of sz bytes, this returns the next largest arena size that will a...
Definition AMReX_Arena.cpp:167
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
amrex_long Long
Definition AMReX_INT.H:30
OutIter exclusive_scan(InIter begin, InIter end, OutIter result)
Definition AMReX_Scan.H:1190
T InclusiveSum(N n, T const *in, T *out, RetSum a_ret_sum=retSum)
Inclusive sum.
Definition AMReX_Scan.H:968
OutIter inclusive_scan(InIter begin, InIter end, OutIter result)
Definition AMReX_Scan.H:1171
T ExclusiveSum(N n, T const *in, T *out, RetSum a_ret_sum=retSum)
Exclusive sum.
Definition AMReX_Scan.H:1040
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
__host__ __device__ AMREX_FORCE_INLINE T Add(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:200
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:345
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:435
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
static constexpr struct amrex::Scan::Type::Exclusive exclusive
static constexpr struct amrex::Scan::Type::Inclusive inclusive
static constexpr RetSum noRetSum
Definition AMReX_Scan.H:34
static constexpr RetSum retSum
Definition AMReX_Scan.H:33
T PrefixSum(N n, FIN const &fin, FOUT const &fout, TYPE, RetSum a_ret_sum=retSum)
Definition AMReX_Scan.H:810
Definition AMReX_Amr.cpp:50
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2018
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:25
const int[]
Definition AMReX_BLProfiler.cpp:1664
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2028
Definition AMReX_Scan.H:29
bool flag
Definition AMReX_Scan.H:30
Definition AMReX_Scan.H:38
Definition AMReX_Scan.H:37