    explicit operator bool() const noexcept { return flag; }

#if defined(AMREX_USE_GPU)

template <typename T, bool SINGLE_WORD> struct BlockStatus {};
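// BlockStatus is the per-block bookkeeping for the single-pass scan with decoupled
// look-back implemented below. A block's status character is 'x' while it has
// published nothing, 'a' once only its local aggregate is available, and 'p' once
// its inclusive prefix is available. The SINGLE_WORD == true specialization packs
// {status, value} into one 64-bit word so both can be read and written in a single
// atomic access; the false specialization keeps separate fields and publishes them
// with explicit fences and atomic loads.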
template <typename T>
struct BlockStatus<T, true>
{
    void operator=(Data<U> const&) = delete;
    void operator=(Data<U> &&) = delete;
    void write (char a_status, T a_value) {
#if defined(AMREX_USE_CUDA)
        volatile uint64_t tmp;
        reinterpret_cast<STVA<T> volatile&>(tmp).status = a_status;
        reinterpret_cast<STVA<T> volatile&>(tmp).value = a_value;
        reinterpret_cast<uint64_t&>(d.s) = tmp;
#else
        tmp.s = {a_status, a_value};
        static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                      "HIP/SYCL: unsigned long long must be 64 bits");
        Gpu::Atomic::Exch(reinterpret_cast<unsigned long long*>(&d),
                          reinterpret_cast<unsigned long long&>(tmp));
    T get_aggregate() const { return d.s.value; }

    STVA<T> read () volatile {
#if defined(AMREX_USE_CUDA)
        volatile uint64_t tmp = reinterpret_cast<uint64_t volatile&>(d);
        return { reinterpret_cast<STVA<T> volatile&>(tmp).status,
                 reinterpret_cast<STVA<T> volatile&>(tmp).value };
#else
        static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                      "HIP/SYCL: unsigned long long must be 64 bits");
        unsigned long long tmp = Gpu::Atomic::Add
            (reinterpret_cast<unsigned long long*>(const_cast<Data<T>*>(&d)), 0ull);
        return (*reinterpret_cast<Data<T>*>(&tmp)).s;
    void set_status (char a_status) { d.s.status = a_status; }

    STVA<T> wait () volatile {
#if defined(AMREX_USE_SYCL)
            sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
#else
            __threadfence_block();
#endif
        } while (r.status == 'x');
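// Multi-word variant: selected below when STVA<T> does not fit in a single 64-bit
// word (see the sizeof(detail::STVA<T>) <= 8 dispatch), so status, aggregate, and
// inclusive value live in separate fields and are published with explicit fences
// and atomic loads instead of a single packed write.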
template <typename T>
struct BlockStatus<T, false>
{
    void write (char a_status, T a_value) {
        if (a_status == 'a') {
#if defined(AMREX_USE_SYCL)
        sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
    T get_aggregate() const { return aggregate; }

    STVA<T> read () volatile {
#if defined(AMREX_USE_SYCL)
        constexpr auto mo = sycl::memory_order::relaxed;
        constexpr auto ms = sycl::memory_scope::device;
        constexpr auto as = sycl::access::address_space::global_space;
#endif
        } else if (status == 'a') {
#if defined(AMREX_USE_SYCL)
            sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(aggregate)};
            return {'a', ar.load()};
#else
            return {'a', aggregate};
#endif
        } else {
#if defined(AMREX_USE_SYCL)
            sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(inclusive)};
            return {'p', ar.load()};
    void set_status (char a_status) { status = a_status; }

    STVA<T> wait () volatile {
#if defined(AMREX_USE_SYCL)
            sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
#endif
        } while (r.status == 'x');
#if defined(AMREX_USE_SYCL)

#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
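// Multi-pass SYCL scan: the first kernel scans each block's chunks and records
// per-element partial results (blockresult_p) and per-block sums (blocksum_p);
// the second kernel scans the block sums in a single block and writes the grand
// total; the third kernel adds the preceding blocks' sum to each block's partial
// results before the final output.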
template <typename T, typename N, typename FIN, typename FOUT, typename TYPE>
T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
{
    if (n <= 0) { return 0; }
    constexpr int nwarps_per_block = 8;
    constexpr int nchunks = 12;
    constexpr int nelms_per_block = nthreads * nchunks;
                       std::numeric_limits<int>::max())*nelms_per_block);
    int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
    std::size_t nbytes_blockresult = Arena::align(sizeof(T)*n);
    std::size_t nbytes_blocksum = Arena::align(sizeof(T)*nblocks);

    T* blockresult_p = (T*)dp;
    T* blocksum_p = (T*)(dp + nbytes_blockresult);
    T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
    amrex::launch<nthreads>(nblocks, sm, stream,
        sycl::sub_group const& sg = gh.item->get_sub_group();
        int lane = sg.get_local_id()[0];
        int warp = sg.get_group_id()[0];
        int nwarps = sg.get_group_range()[0];

        int threadIdxx = gh.item->get_local_id(0);
        int blockIdxx = gh.item->get_group_linear_id();
        int blockDimx = gh.item->get_local_range(0);

        T* shared = (T*)(gh.local);

        N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
        N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

        T sum_prev_chunk = 0;
        for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
            N offset = ibegin + ichunk*blockDimx;
            if (offset >= iend) { break; }

                T s = sycl::shift_group_right(sg, x, i);
                if (lane >= i) { x += s; }

            gh.item->barrier(sycl::access::fence_space::local_space);

            T y = (lane < nwarps) ? shared[lane] : 0;
                T s = sycl::shift_group_right(sg, y, i);
                if (lane >= i) { y += s; }
            if (lane < nwarps) { shared2[lane] = y; }

            gh.item->barrier(sycl::access::fence_space::local_space);

            T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
            T tmp_out = sum_prev_warp + sum_prev_chunk +
                (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
            sum_prev_chunk += shared2[nwarps-1];

            blockresult_p[offset] = tmp_out;

        if (threadIdxx == 0) {
            blocksum_p[blockIdxx] = sum_prev_chunk;
    amrex::launch<nthreads>(1, sm, stream,
        sycl::sub_group const& sg = gh.item->get_sub_group();
        int lane = sg.get_local_id()[0];
        int warp = sg.get_group_id()[0];
        int nwarps = sg.get_group_range()[0];

        int threadIdxx = gh.item->get_local_id(0);
        int blockDimx = gh.item->get_local_range(0);

        T* shared = (T*)(gh.local);

        T sum_prev_chunk = 0;

                T s = sycl::shift_group_right(sg, x, i);
                if (lane >= i) { x += s; }

            gh.item->barrier(sycl::access::fence_space::local_space);

            T y = (lane < nwarps) ? shared[lane] : 0;
                T s = sycl::shift_group_right(sg, y, i);
                if (lane >= i) { y += s; }
            if (lane < nwarps) { shared2[lane] = y; }

            gh.item->barrier(sycl::access::fence_space::local_space);

            T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
            T tmp_out = sum_prev_warp + sum_prev_chunk + x;
            sum_prev_chunk += shared2[nwarps-1];

            blocksum_p[offset] = tmp_out;

        if (threadIdxx == 0) {
            *totalsum_p = sum_prev_chunk;
    amrex::launch<nthreads>(nblocks, 0, stream,
        int threadIdxx = gh.item->get_local_id(0);
        int blockIdxx = gh.item->get_group_linear_id();
        int blockDimx = gh.item->get_local_range(0);

        N ibegin = static_cast<N>(nelms_per_block) * blockIdxx;
        N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
        T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
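// Single-pass SYCL scan with decoupled look-back: each block scans its chunks
// locally, publishes its aggregate ('a') or, for the first block, its inclusive
// prefix ('p') through BlockStatus, and later blocks look back across earlier
// blocks' statuses to assemble their exclusive prefix before writing results.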
template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
          typename M=std::enable_if_t<std::is_integral_v<N> &&
                                      (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
                                       std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum)
{
    if (n <= 0) { return 0; }
    constexpr int nwarps_per_block = 8;
    constexpr int nchunks = 12;
    constexpr int nelms_per_block = nthreads * nchunks;
                       std::numeric_limits<int>::max())*nelms_per_block);
    int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;

#ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
    return PrefixSum_mp<T>(n, std::forward<FIN>(fin), std::forward<FOUT>(fout), type, a_ret_sum);
    using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
                                            detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;

    std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
    std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));

    unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
    T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);

        BlockStatusT& block_status = block_status_p[i];
        block_status.set_status('x');
        *virtual_block_id_p = 0;
    amrex::launch<nthreads>(nblocks, sm, stream,
        sycl::sub_group const& sg = gh.item->get_sub_group();
        int lane = sg.get_local_id()[0];
        int warp = sg.get_group_id()[0];
        int nwarps = sg.get_group_range()[0];

        int threadIdxx = gh.item->get_local_id(0);
        int blockDimx = gh.item->get_local_range(0);
        int gridDimx = gh.item->get_group_range(0);

        T* shared = (T*)(gh.local);

        int virtual_block_id = 0;
            int& virtual_block_id_shared = *((int*)(shared2+nwarps));
            if (threadIdxx == 0) {
                virtual_block_id_shared = bid;
            }
            gh.item->barrier(sycl::access::fence_space::local_space);
            virtual_block_id = virtual_block_id_shared;

        N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
        N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
        BlockStatusT& block_status = block_status_p[virtual_block_id];
        T sum_prev_chunk = 0;

        for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
            N offset = ibegin + ichunk*blockDimx;
            if (offset >= iend) { break; }

            if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {

                T s = sycl::shift_group_right(sg, x, i);
                if (lane >= i) { x += s; }

            gh.item->barrier(sycl::access::fence_space::local_space);

            T y = (lane < nwarps) ? shared[lane] : 0;
                T s = sycl::shift_group_right(sg, y, i);
                if (lane >= i) { y += s; }
            if (lane < nwarps) { shared2[lane] = y; }

            gh.item->barrier(sycl::access::fence_space::local_space);

            T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
            tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
                (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
            sum_prev_chunk += shared2[nwarps-1];
        }

        if (threadIdxx == 0 && gridDimx > 1) {
            block_status.write((virtual_block_id == 0) ? 'p' : 'a',
        if (virtual_block_id == 0) {
            for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
                N offset = ibegin + ichunk*blockDimx + threadIdxx;
                if (offset >= iend) { break; }
                fout(offset, tmp_out[ichunk]);
                    *totalsum_p += tmp_out[ichunk];
        } else if (virtual_block_id > 0) {

            T exclusive_prefix = 0;
            BlockStatusT volatile* pbs = block_status_p;
                int iblock = iblock0-lane;
                detail::STVA<T> stva{'p', 0};
                    stva = pbs[iblock].wait();

                unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
                    status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);

                bool stop_lookback = status_bf & 0x1u;
                if (stop_lookback == false) {
                    if (status_bf != 0) {
                        if (lane > 0) { x = 0; }
                        unsigned int bit_mask = 0x1u;
                            if (i == lane) { x = y; }
                            if (status_bf & bit_mask) {
                                stop_lookback = true;

                    x += sycl::shift_group_left(sg, x,i);

                if (lane == 0) { exclusive_prefix += x; }
                if (stop_lookback) { break; }

            block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
            shared[0] = exclusive_prefix;

        gh.item->barrier(sycl::access::fence_space::local_space);
        T exclusive_prefix = shared[0];
        for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
            N offset = ibegin + ichunk*blockDimx + threadIdxx;
            if (offset >= iend) { break; }
            T t = tmp_out[ichunk] + exclusive_prefix;
#elif defined(AMREX_USE_HIP)
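// HIP path: the decoupled look-back bookkeeping is delegated to rocprim
// (lookback_scan_state, ordered_block_id, lookback_scan_prefix_op), and the
// per-block work uses rocprim's block_load / block_scan / block_exchange.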
template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
          typename M=std::enable_if_t<std::is_integral_v<N> &&
                                      (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
                                       std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
{
    if (n <= 0) { return 0; }
    constexpr int nwarps_per_block = 4;
    constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
    constexpr int nelms_per_block = nthreads * nelms_per_thread;
                       std::numeric_limits<int>::max())*nelms_per_block);
    int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
    using ScanTileState = rocprim::detail::lookback_scan_state<T>;
    using OrderedBlockId = rocprim::detail::ordered_block_id<unsigned int>;

#if (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6)) || \
    (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR == 6) && \
     defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR == 0))

    std::size_t nbytes_tile_state = rocprim::detail::align_size
        (ScanTileState::get_storage_size(nblocks));
    std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();

    auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));

    ScanTileState tile_state = ScanTileState::create(dp, nblocks);

#else

    std::size_t nbytes_tile_state;
    AMREX_HIP_SAFE_CALL(ScanTileState::get_storage_size(nblocks, stream, nbytes_tile_state));
    nbytes_tile_state = rocprim::detail::align_size(nbytes_tile_state);

    std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();

    auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));

    ScanTileState tile_state;
    AMREX_HIP_SAFE_CALL(ScanTileState::create(tile_state, dp, nblocks, stream));

#endif

    auto ordered_block_id = OrderedBlockId::create
        (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
    amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
    {
        auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
        auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
        const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
        if (gid == 0) { scan_bid.reset(); }
        scan_tile_state.initialize_prefix(gid, nblocks);
    amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
        using BlockLoad = rocprim::block_load<T, nthreads, nelms_per_thread,
                                              rocprim::block_load_method::block_load_transpose>;
        using BlockScan = rocprim::block_scan<T, nthreads,
                                              rocprim::block_scan_algorithm::using_warp_scan>;
        using BlockExchange = rocprim::block_exchange<T, nthreads, nelms_per_thread>;
        using LookbackScanPrefixOp = rocprim::detail::lookback_scan_prefix_op
            <T, rocprim::plus<T>, ScanTileState>;

        __shared__ struct TempStorage {
            typename OrderedBlockId::storage_type ordered_bid;
            typename BlockLoad::storage_type load;
            typename BlockExchange::storage_type exchange;
            typename BlockScan::storage_type scan;
        auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
        auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);

        auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);

        N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
        N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

        auto input_begin = rocprim::make_transform_iterator(
            rocprim::make_counting_iterator(N(0)),
            [&] (N i) -> T { return fin(i+ibegin); });

        T data[nelms_per_thread];
        if (static_cast<int>(iend-ibegin) == nelms_per_block) {
            BlockLoad().load(input_begin, data, temp_storage.load);
            BlockLoad().load(input_begin, data, iend-ibegin, 0, temp_storage.load);
        constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;

        if (virtual_block_id == 0) {
                BlockScan().exclusive_scan(data, data, T{0}, block_agg, temp_storage.scan);
                BlockScan().inclusive_scan(data, data, block_agg, temp_storage.scan);
            if (threadIdx.x == 0) {
                    scan_tile_state.set_complete(0, block_agg);
                } else if (nblocks == 1 && totalsum_p) {
                    *totalsum_p = block_agg;

            T last = data[nelms_per_thread-1];
            LookbackScanPrefixOp prefix_op(virtual_block_id, rocprim::plus<T>(), scan_tile_state);
                BlockScan().exclusive_scan(data, data, temp_storage.scan, prefix_op,
                BlockScan().inclusive_scan(data, data, temp_storage.scan, prefix_op,
            if (iend == n && threadIdx.x == nthreads-1) {
                T tsum = data[nelms_per_thread-1];
        BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);

        for (int i = 0; i < nelms_per_thread; ++i) {
            N offset = ibegin + i*nthreads + threadIdx.x;

    T ret = (a_ret_sum) ? *totalsum_p : T(0);
#elif defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
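// CUDA path (CUB, nvcc >= 11): cub::ScanTileState and cub::TilePrefixCallbackOp
// provide the look-back machinery, and cub::BlockLoad / BlockScan / BlockExchange
// do the per-block work.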
template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
          typename M=std::enable_if_t<std::is_integral_v<N> &&
                                      (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
                                       std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
{
    if (n <= 0) { return 0; }
    constexpr int nwarps_per_block = 8;
    constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
    constexpr int nelms_per_block = nthreads * nelms_per_thread;
                       std::numeric_limits<int>::max())*nelms_per_block);
    int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
    using ScanTileState = cub::ScanTileState<T>;
    std::size_t tile_state_size = 0;
    ScanTileState::AllocationSize(nblocks, tile_state_size);

    std::size_t nbytes_tile_state = Arena::align(tile_state_size);
    auto tile_state_p = (char*)(The_Arena()->alloc(nbytes_tile_state));

    ScanTileState tile_state;
    tile_state.Init(nblocks, tile_state_p, tile_state_size);

    amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
    {
        const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
    amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
        using BlockLoad = cub::BlockLoad<T, nthreads, nelms_per_thread, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
        using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
        using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;

#ifdef AMREX_CUDA_CCCL_VER_GE_3
        using Sum = cuda::std::plus<T>;
#else
        using Sum = cub::Sum;
#endif
        using TilePrefixCallbackOp = cub::TilePrefixCallbackOp<T, Sum, ScanTileState>;

        __shared__ union TempStorage
        {
            typename BlockLoad::TempStorage load;
            typename BlockExchange::TempStorage exchange;
                typename BlockScan::TempStorage scan;
                typename TilePrefixCallbackOp::TempStorage prefix;
        auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);

        int virtual_block_id = blockIdx.x;

        N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
        N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

        auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
#ifdef AMREX_CUDA_CCCL_VER_GE_3
        thrust::transform_iterator<decltype(input_lambda),thrust::counting_iterator<N> >
            input_begin(thrust::counting_iterator<N>(0), input_lambda);
#else
        cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
            input_begin(cub::CountingInputIterator<N>(0), input_lambda);
#endif

        T data[nelms_per_thread];
        if (static_cast<int>(iend-ibegin) == nelms_per_block) {
            BlockLoad(temp_storage.load).Load(input_begin, data);
            BlockLoad(temp_storage.load).Load(input_begin, data, iend-ibegin, 0);
        constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;

        if (virtual_block_id == 0) {
                BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, block_agg);
                BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, block_agg);
            if (threadIdx.x == 0) {
                    scan_tile_state.SetInclusive(0, block_agg);
                } else if (nblocks == 1 && totalsum_p) {
                    *totalsum_p = block_agg;

            T last = data[nelms_per_thread-1];
            TilePrefixCallbackOp prefix_op(scan_tile_state, temp_storage.scan_storeage.prefix,
                                           Sum{}, virtual_block_id);
                BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, prefix_op);
                BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
            if (iend == n && threadIdx.x == nthreads-1) {
                T tsum = data[nelms_per_thread-1];
        BlockExchange(temp_storage.exchange).BlockedToStriped(data);

        for (int i = 0; i < nelms_per_thread; ++i) {
            N offset = ibegin + i*nthreads + threadIdx.x;

    T ret = (a_ret_sum) ? *totalsum_p : T(0);
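// Generic GPU fallback (used when none of the vendor paths above apply): a
// hand-written single-pass scan with decoupled look-back, using warp shuffles
// for the intra-warp scans and BlockStatus for cross-block communication.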
template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
          typename M=std::enable_if_t<std::is_integral_v<N> &&
                                      (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
                                       std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
{
    if (n <= 0) { return 0; }
    constexpr int nwarps_per_block = 4;
    constexpr int nchunks = 12;
    constexpr int nelms_per_block = nthreads * nchunks;
                       std::numeric_limits<int>::max())*nelms_per_block);
    int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
    using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
                                            detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;

    std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
    std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));

    unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
    T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);

        BlockStatusT& block_status = block_status_p[i];
        block_status.set_status('x');
        *virtual_block_id_p = 0;
    amrex::launch<nthreads>(nblocks, sm, stream,
        int virtual_block_id = 0;
        if (gridDim.x > 1) {
            int& virtual_block_id_shared = *((int*)(shared2+nwarps));
            if (threadIdx.x == 0) {
                unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
                virtual_block_id_shared = bid;
            }
            virtual_block_id = virtual_block_id_shared;
        }

        N ibegin = static_cast<N>(nelms_per_block) * virtual_block_id;
        N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
        BlockStatusT& block_status = block_status_p[virtual_block_id];
        T sum_prev_chunk = 0;

        for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
            N offset = ibegin + ichunk*nthreads;
            if (offset >= iend) { break; }

                T s = __shfl_up_sync(0xffffffff, x, i);
                if (lane >= i) { x += s; }

#ifdef AMREX_USE_CUDA
            if (warp == 0 && lane < nwarps) {
                int mask = (1 << nwarps) - 1;
                for (int i = 1; i <= nwarps; i *= 2) {
                    T s = __shfl_up_sync(mask, y, i, nwarps);
                    if (lane >= i) { y += s; }
#else
            if (lane < nwarps) {
                for (int i = 1; i <= nwarps; i *= 2) {
                    T s = __shfl_up(y, i, nwarps);
                    if (lane >= i) { y += s; }

            if (lane < nwarps) {
#endif

            T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
            tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
                (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
            sum_prev_chunk += shared2[nwarps-1];
        if (threadIdx.x == 0 && gridDim.x > 1) {
            block_status.write((virtual_block_id == 0) ? 'p' : 'a',

        if (virtual_block_id == 0) {
            for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
                N offset = ibegin + ichunk*nthreads + threadIdx.x;
                if (offset >= iend) { break; }
                fout(offset, tmp_out[ichunk]);
                    *totalsum_p += tmp_out[ichunk];
        } else if (virtual_block_id > 0) {

            T exclusive_prefix = 0;
            BlockStatusT volatile* pbs = block_status_p;
                int iblock = iblock0-lane;
                detail::STVA<T> stva{'p', 0};
                    stva = pbs[iblock].wait();

                unsigned const status_bf = __ballot_sync(0xffffffff, stva.status == 'p');
                bool stop_lookback = status_bf & 0x1u;
                if (stop_lookback == false) {
                    if (status_bf != 0) {
                        if (lane > 0) { x = 0; }
                        unsigned bit_mask = 0x1u;
                            if (i == lane) { x = y; }
                            if (status_bf & bit_mask) {
                                stop_lookback = true;

                    x += __shfl_down_sync(0xffffffff, x, i);

                if (lane == 0) { exclusive_prefix += x; }
                if (stop_lookback) { break; }

            block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
            shared[0] = exclusive_prefix;
        T exclusive_prefix = shared[0];
        for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
            N offset = ibegin + ichunk*nthreads + threadIdx.x;
            if (offset >= iend) { break; }
            T t = tmp_out[ichunk] + exclusive_prefix;
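// Pointer-based wrappers: on CUDA and HIP these use the vendor device-scan
// routines (with the usual temporary-storage query/allocate pattern), on
// SYCL+oneDPL they use std::inclusive_scan / std::exclusive_scan with a device
// execution policy, and otherwise they fall back to PrefixSum with simple
// load/store lambdas.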
template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
T InclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
{
#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
    void* d_temp = nullptr;
    std::size_t temp_bytes = 0;
#elif defined(AMREX_USE_HIP)
    void* d_temp = nullptr;
    std::size_t temp_bytes = 0;
#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
    auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
    std::inclusive_scan(policy, in, in+n, out, std::plus<T>(), T(0));
    if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
        return PrefixSum<T>(static_cast<int>(n),
                            Type::inclusive, a_ret_sum);

        return PrefixSum<T>(n,
                            Type::inclusive, a_ret_sum);
template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
{
    if (n <= 0) { return 0; }
#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
    void* d_temp = nullptr;
    std::size_t temp_bytes = 0;
    return in_last + out_last;
#elif defined(AMREX_USE_HIP)
    void* d_temp = nullptr;
    std::size_t temp_bytes = 0;
    return in_last + out_last;
#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
    auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
    std::exclusive_scan(policy, in, in+n, out, T(0), std::plus<T>());
    return in_last + out_last;
    if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
        return PrefixSum<T>(static_cast<int>(n),
                            Type::exclusive, a_ret_sum);

        return PrefixSum<T>(n,
                            Type::exclusive, a_ret_sum);
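// Host-side (non-GPU) implementations: a serial PrefixSum plus InclusiveSum and
// ExclusiveSum built on std::inclusive_scan / std::exclusive_scan where C++17
// scans are available, and std::partial_sum otherwise.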
template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
          typename M=std::enable_if_t<std::is_integral_v<N> &&
                                      (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ||
                                       std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>)> >
T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum)
{
    if (n <= 0) { return 0; }
    for (N i = 0; i < n; ++i) {
template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
T InclusiveSum (N n, T const* in, T * out, RetSum = retSum)
{
#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
    std::inclusive_scan(in, in+n, out);
#else
    std::partial_sum(in, in+n, out);
#endif
    return (n > 0) ? out[n-1] : T(0);
template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
T ExclusiveSum (N n, T const* in, T * out, RetSum = retSum)
{
    if (n <= 0) { return 0; }
    auto in_last = in[n-1];
#if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
    std::exclusive_scan(in, in+n, out, 0);
#else
    out[0] = 0;
    std::partial_sum(in, in+n-1, out+1);
#endif
    return in_last + out[n-1];
}
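// ---------------------------------------------------------------------------
// Usage sketch (illustrative only; it assumes these functions live in the
// amrex::Scan namespace and are called from host code with device-accessible
// data, as in AMReX):
//
//     // Inclusive scan via lambdas: fin(i) supplies element i, fout(i, s)
//     // stores the i-th result; the total is returned because retSum is passed.
//     int* p = ...;   // n device-accessible ints
//     int total = Scan::PrefixSum<int>(n,
//         [=] AMREX_GPU_DEVICE (int i) -> int { return p[i]; },
//         [=] AMREX_GPU_DEVICE (int i, int s) { p[i] = s; },
//         Scan::Type::inclusive, Scan::retSum);
//
//     // Pointer-based convenience wrappers:
//     Scan::InclusiveSum(n, in, out);              // out[i] = in[0] + ... + in[i]
//     auto tot = Scan::ExclusiveSum(n, in, out);   // out[i] = in[0] + ... + in[i-1]
// ---------------------------------------------------------------------------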