AMReX_Scan.H
1 #ifndef AMREX_SCAN_H_
2 #define AMREX_SCAN_H_
3 #include <AMReX_Config.H>
4 
5 #include <AMReX_Extension.H>
6 #include <AMReX_Gpu.H>
7 #include <AMReX_Arena.H>
8 
9 #if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
10 # include <cub/cub.cuh>
11 #elif defined(AMREX_USE_HIP)
12 # include <rocprim/rocprim.hpp>
13 #elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
14 # include <oneapi/dpl/execution>
15 # include <oneapi/dpl/numeric>
16 #endif
17 
18 #include <cstdint>
19 #include <numeric>
20 #include <type_traits>
21 
22 namespace amrex {
23 namespace Scan {
24 
25 struct RetSum {
26  bool flag = true;
27  explicit operator bool() const noexcept { return flag; }
28 };
29 static constexpr RetSum retSum{true};
30 static constexpr RetSum noRetSum{false};
31 
32 namespace Type {
33  static constexpr struct Inclusive {} inclusive{};
34  static constexpr struct Exclusive {} exclusive{};
35 }
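// The tags above select the scan flavor and whether the total is returned.
// A hypothetical call (n, fin, and fout are caller-supplied, not defined here):
//   auto total = Scan::PrefixSum<int>(n, fin, fout, Type::exclusive, retSum);
// Pass noRetSum to skip the device-to-host copy of the total on GPU builds.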
36 
37 #if defined(AMREX_USE_GPU)
38 
39 namespace detail {
40 
41 template <typename T>
42 struct STVA
43 {
44  char status;
45  T value;
46 };
47 
48 template <typename T, bool SINGLE_WORD> struct BlockStatus {};
49 
50 template <typename T>
51 struct BlockStatus<T, true>
52 {
53  template<typename U>
54  union Data {
55  STVA<U> s;
56  uint64_t i;
57  void operator=(Data<U> const&) = delete;
58  void operator=(Data<U> &&) = delete;
59  };
60  Data<T> d;
61 
62  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
63  void write (char a_status, T a_value) {
64 #if defined(AMREX_USE_CUDA)
65  volatile uint64_t tmp;
66  reinterpret_cast<STVA<T> volatile&>(tmp).status = a_status;
67  reinterpret_cast<STVA<T> volatile&>(tmp).value = a_value;
68  reinterpret_cast<uint64_t&>(d.s) = tmp;
69 #else
70  Data<T> tmp;
71  tmp.s = {a_status, a_value};
72  static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
73  "HIP/SYCL: unsigned long long must be 64 bits");
74  Gpu::Atomic::Exch(reinterpret_cast<unsigned long long*>(&d),
75  reinterpret_cast<unsigned long long&>(tmp));
76 #endif
77  }
78 
79  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
80  T get_aggregate() const { return d.s.value; }
81 
82  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
83  STVA<T> read () volatile {
84 #if defined(AMREX_USE_CUDA)
85  volatile uint64_t tmp = reinterpret_cast<uint64_t volatile&>(d);
86  return {reinterpret_cast<STVA<T> volatile&>(tmp).status,
87  reinterpret_cast<STVA<T> volatile&>(tmp).value };
88 #else
89  static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
90  "HIP/SYCL: unsigned long long must be 64 bits");
91  unsigned long long tmp = Gpu::Atomic::Add
92  (reinterpret_cast<unsigned long long*>(const_cast<Data<T>*>(&d)), 0ull);
93  return (*reinterpret_cast<Data<T>*>(&tmp)).s;
94 #endif
95  }
96 
97  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
98  void set_status (char a_status) { d.s.status = a_status; }
99 
100  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
101  STVA<T> wait () volatile {
102  STVA<T> r;
103  do {
104 #if defined(AMREX_USE_SYCL)
105  sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
106 #else
107  __threadfence_block();
108 #endif
109  r = read();
110  } while (r.status == 'x');
111  return r;
112  }
113 };
114 
115 template <typename T>
116 struct BlockStatus<T, false>
117 {
118  T aggregate;
119  T inclusive;
120  char status;
121 
122  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
123  void write (char a_status, T a_value) {
124  if (a_status == 'a') {
125  aggregate = a_value;
126  } else {
127  inclusive = a_value;
128  }
129 #if defined(AMREX_USE_SYCL)
130  sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
131 #else
132  __threadfence();
133 #endif
134  status = a_status;
135  }
136 
137  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
138  T get_aggregate() const { return aggregate; }
139 
140  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
141  STVA<T> read () volatile {
142 #if defined(AMREX_USE_SYCL)
143  constexpr auto mo = sycl::memory_order::relaxed;
144  constexpr auto ms = sycl::memory_scope::device;
145  constexpr auto as = sycl::access::address_space::global_space;
146 #endif
147  if (status == 'x') {
148  return {'x', 0};
149  } else if (status == 'a') {
150 #if defined(AMREX_USE_SYCL)
151  sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(aggregate)};
152  return {'a', ar.load()};
153 #else
154  return {'a', aggregate};
155 #endif
156  } else {
157 #if defined(AMREX_USE_SYCL)
158  sycl::atomic_ref<T,mo,ms,as> ar{const_cast<T&>(inclusive)};
159  return {'p', ar.load()};
160 #else
161  return {'p', inclusive};
162 #endif
163  }
164  }
165 
166  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
167  void set_status (char a_status) { status = a_status; }
168 
169  AMREX_GPU_DEVICE AMREX_FORCE_INLINE
170  STVA<T> wait () volatile {
171  STVA<T> r;
172  do {
173  r = read();
174 #if defined(AMREX_USE_SYCL)
175  sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
176 #else
177  __threadfence();
178 #endif
179  } while (r.status == 'x');
180  return r;
181  }
182 };
183 
184 }
185 
186 #if defined(AMREX_USE_SYCL)
187 
188 #ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
189 template <typename T, typename N, typename FIN, typename FOUT, typename TYPE>
190 T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
191 {
192  if (n <= 0) { return 0; }
193  constexpr int nwarps_per_block = 8;
194  constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
195  constexpr int nchunks = 12;
196  constexpr int nelms_per_block = nthreads * nchunks;
197  AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
198  int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
199  std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block);
200  auto stream = Gpu::gpuStream();
201 
202  std::size_t nbytes_blockresult = Arena::align(sizeof(T)*n);
203  std::size_t nbytes_blocksum = Arena::align(sizeof(T)*nblocks);
204  std::size_t nbytes_totalsum = Arena::align(sizeof(T));
205  auto dp = (char*)(The_Arena()->alloc(nbytes_blockresult
206  + nbytes_blocksum
207  + nbytes_totalsum));
208  T* blockresult_p = (T*)dp;
209  T* blocksum_p = (T*)(dp + nbytes_blockresult);
210  T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
211 
212  amrex::launch(nblocks, nthreads, sm, stream,
213  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
214  {
215  sycl::sub_group const& sg = gh.item->get_sub_group();
216  int lane = sg.get_local_id()[0];
217  int warp = sg.get_group_id()[0];
218  int nwarps = sg.get_group_range()[0];
219 
220  int threadIdxx = gh.item->get_local_id(0);
221  int blockIdxx = gh.item->get_group_linear_id();
222  int blockDimx = gh.item->get_local_range(0);
223 
224  T* shared = (T*)(gh.local);
225  T* shared2 = shared + Gpu::Device::warp_size;
226 
227  // Each block processes [ibegin,iend).
228  N ibegin = nelms_per_block * blockIdxx;
229  N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
230 
231  // Each block is responsible for nchunks chunks of data,
232  // where each chunk has blockDim.x elements, one for each
233  // thread in the block.
234  T sum_prev_chunk = 0; // inclusive sum from previous chunks.
235  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
236  N offset = ibegin + ichunk*blockDimx;
237  if (offset >= iend) { break; }
238 
239  offset += threadIdxx;
240  T x0 = (offset < iend) ? fin(offset) : 0;
241  T x = x0;
242  // Scan within a warp
243  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
244  T s = sycl::shift_group_right(sg, x, i);
245  if (lane >= i) { x += s; }
246  }
247 
248  // x now holds the inclusive sum within the warp. The
249  // last thread in each warp holds the inclusive sum of
250  // this warp. We will store it in shared memory.
251  if (lane == Gpu::Device::warp_size - 1) {
252  shared[warp] = x;
253  }
254 
255  gh.item->barrier(sycl::access::fence_space::local_space);
256 
257  // The first warp will do scan on the warp sums for the
258  // whole block.
259  if (warp == 0) {
260  T y = (lane < nwarps) ? shared[lane] : 0;
261  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
262  T s = sycl::shift_group_right(sg, y, i);
263  if (lane >= i) { y += s; }
264  }
265 
266  if (lane < nwarps) { shared2[lane] = y; }
267  }
268 
269  gh.item->barrier(sycl::access::fence_space::local_space);
270 
271  // shared[0:nwarps) holds the inclusive sum of warp sums.
272 
273  // Also note x still holds the inclusive sum within the
274  // warp. Given these two, we can compute the inclusive
275  // sum within this chunk.
276  T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
277  T tmp_out = sum_prev_warp + sum_prev_chunk +
278  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
279  sum_prev_chunk += shared2[nwarps-1];
280 
281  if (offset < iend) {
282  blockresult_p[offset] = tmp_out;
283  }
284  }
285 
286  // sum_prev_chunk now holds the sum of the whole block.
287  if (threadIdxx == 0) {
288  blocksum_p[blockIdxx] = sum_prev_chunk;
289  }
290  });
291 
292  amrex::launch(1, nthreads, sm, stream,
293  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
294  {
295  sycl::sub_group const& sg = gh.item->get_sub_group();
296  int lane = sg.get_local_id()[0];
297  int warp = sg.get_group_id()[0];
298  int nwarps = sg.get_group_range()[0];
299 
300  int threadIdxx = gh.item->get_local_id(0);
301  int blockDimx = gh.item->get_local_range(0);
302 
303  T* shared = (T*)(gh.local);
304  T* shared2 = shared + Gpu::Device::warp_size;
305 
306  T sum_prev_chunk = 0;
307  for (int offset = threadIdxx; offset - threadIdxx < nblocks; offset += blockDimx) {
308  T x = (offset < nblocks) ? blocksum_p[offset] : 0;
309  // Scan within a warp
310  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
311  T s = sycl::shift_group_right(sg, x, i);
312  if (lane >= i) { x += s; }
313  }
314 
315  // x now holds the inclusive sum within the warp. The
316  // last thread in each warp holds the inclusive sum of
317  // this warp. We will store it in shared memory.
318  if (lane == Gpu::Device::warp_size - 1) {
319  shared[warp] = x;
320  }
321 
322  gh.item->barrier(sycl::access::fence_space::local_space);
323 
324  // The first warp will do scan on the warp sums for the
325  // whole block.
326  if (warp == 0) {
327  T y = (lane < nwarps) ? shared[lane] : 0;
328  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
329  T s = sycl::shift_group_right(sg, y, i);
330  if (lane >= i) { y += s; }
331  }
332 
333  if (lane < nwarps) { shared2[lane] = y; }
334  }
335 
336  gh.item->barrier(sycl::access::fence_space::local_space);
337 
338  // shared[0:nwarps) holds the inclusive sum of warp sums.
339 
340  // Also note x still holds the inclusive sum within the
341  // warp. Given these two, we can compute the inclusive
342  // sum within this chunk.
343  T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
344  T tmp_out = sum_prev_warp + sum_prev_chunk + x;
345  sum_prev_chunk += shared2[nwarps-1];
346 
347  if (offset < nblocks) {
348  blocksum_p[offset] = tmp_out;
349  }
350  }
351 
352  // sum_prev_chunk now holds the total sum.
353  if (threadIdxx == 0) {
354  *totalsum_p = sum_prev_chunk;
355  }
356  });
357 
358  amrex::launch(nblocks, nthreads, 0, stream,
359  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
360  {
361  int threadIdxx = gh.item->get_local_id(0);
362  int blockIdxx = gh.item->get_group_linear_id();
363  int blockDimx = gh.item->get_local_range(0);
364 
365  // Each block processes [ibegin,iend).
366  N ibegin = nelms_per_block * blockIdxx;
367  N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
368  T prev_sum = (blockIdxx == 0) ? 0 : blocksum_p[blockIdxx-1];
369  for (N offset = ibegin + threadIdxx; offset < iend; offset += blockDimx) {
370  fout(offset, prev_sum + blockresult_p[offset]);
371  }
372  });
373 
374  T totalsum = 0;
375  if (a_ret_sum) {
376  Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
377  }
378  Gpu::streamSynchronize();
379  The_Arena()->free(dp);
380 
381  AMREX_GPU_ERROR_CHECK();
382 
383  return totalsum;
384 }
385 #endif
386 
387 template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
388  typename M=std::enable_if_t<std::is_integral<N>::value &&
389  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
390  std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
391 T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum)
392 {
393  if (n <= 0) { return 0; }
394  constexpr int nwarps_per_block = 8;
395  constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
396  constexpr int nchunks = 12;
397  constexpr int nelms_per_block = nthreads * nchunks;
398  AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
399  int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
400 
401 #ifndef AMREX_SYCL_NO_MULTIPASS_SCAN
402  if (nblocks > 1) {
403  return PrefixSum_mp<T>(n, std::forward<FIN>(fin), std::forward<FOUT>(fout), type, a_ret_sum);
404  }
405 #endif
406 
407  std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
408  auto stream = Gpu::gpuStream();
409 
410  using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
411  detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
412 
413  std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
414  std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
415  std::size_t nbytes_totalsum = Arena::align(sizeof(T));
416  auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
417  + nbytes_blockid
418  + nbytes_totalsum));
419  BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
420  unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
421  T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
422 
423  amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
424  BlockStatusT& block_status = block_status_p[i];
425  block_status.set_status('x');
426  if (i == 0) {
427  *virtual_block_id_p = 0;
428  *totalsum_p = 0;
429  }
430  });
431 
432  amrex::launch(nblocks, nthreads, sm, stream,
433  [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
434  {
435  sycl::sub_group const& sg = gh.item->get_sub_group();
436  int lane = sg.get_local_id()[0];
437  int warp = sg.get_group_id()[0];
438  int nwarps = sg.get_group_range()[0];
439 
440  int threadIdxx = gh.item->get_local_id(0);
441  int blockDimx = gh.item->get_local_range(0);
442  int gridDimx = gh.item->get_group_range(0);
443 
444  T* shared = (T*)(gh.local);
445  T* shared2 = shared + Gpu::Device::warp_size;
446 
447  // First of all, get block virtual id. We must do this to
448  // avoid deadlock because blocks may be launched in any order.
449  // Anywhere in this function, we should not use blockIdx.
450  int virtual_block_id = 0;
451  if (gridDimx > 1) {
452  int& virtual_block_id_shared = *((int*)(shared2+nwarps));
453  if (threadIdxx == 0) {
454  unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
455  virtual_block_id_shared = bid;
456  }
457  gh.item->barrier(sycl::access::fence_space::local_space);
458  virtual_block_id = virtual_block_id_shared;
459  }
460 
461  // Each block processes [ibegin,iend).
462  N ibegin = nelms_per_block * virtual_block_id;
463  N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
464  BlockStatusT& block_status = block_status_p[virtual_block_id];
465 
466  //
467  // The overall algorithm is based on "Single-pass Parallel
468  // Prefix Scan with Decoupled Look-back" by D. Merrill &
469  // M. Garland.
470  //
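 // In the look-back protocol, each BlockStatus advertises this block's
 // progress to later blocks: 'x' means no result yet, 'a' means only the
 // block-local aggregate is published, and 'p' means the inclusive prefix
 // through this block is published. A later block adds up predecessors'
 // aggregates until it reaches a 'p' entry, then stops.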
471 
472  // Each block is responsible for nchunks chunks of data,
473  // where each chunk has blockDim.x elements, one for each
474  // thread in the block.
475  T sum_prev_chunk = 0; // inclusive sum from previous chunks.
476  T tmp_out[nchunks]; // block-wide inclusive sum for chunks
477  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
478  N offset = ibegin + ichunk*blockDimx;
479  if (offset >= iend) { break; }
480 
481  offset += threadIdxx;
482  T x0 = (offset < iend) ? fin(offset) : 0;
483  if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
484  *totalsum_p += x0;
485  }
486  T x = x0;
487  // Scan within a warp
488  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
489  T s = sycl::shift_group_right(sg, x, i);
490  if (lane >= i) { x += s; }
491  }
492 
493  // x now holds the inclusive sum within the warp. The
494  // last thread in each warp holds the inclusive sum of
495  // this warp. We will store it in shared memory.
496  if (lane == Gpu::Device::warp_size - 1) {
497  shared[warp] = x;
498  }
499 
500  gh.item->barrier(sycl::access::fence_space::local_space);
501 
502  // The first warp will do scan on the warp sums for the
503  // whole block.
504  if (warp == 0) {
505  T y = (lane < nwarps) ? shared[lane] : 0;
506  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
507  T s = sycl::shift_group_right(sg, y, i);
508  if (lane >= i) { y += s; }
509  }
510 
511  if (lane < nwarps) { shared2[lane] = y; }
512  }
513 
514  gh.item->barrier(sycl::access::fence_space::local_space);
515 
516  // shared[0:nwarps) holds the inclusive sum of warp sums.
517 
518  // Also note x still holds the inclusive sum within the
519  // warp. Given these two, we can compute the inclusive
520  // sum within this chunk.
521  T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
522  tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
523  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
524  sum_prev_chunk += shared2[nwarps-1];
525  }
526 
527  // sum_prev_chunk now holds the sum of the whole block.
528  if (threadIdxx == 0 && gridDimx > 1) {
529  block_status.write((virtual_block_id == 0) ? 'p' : 'a',
530  sum_prev_chunk);
531  }
532 
533  if (virtual_block_id == 0) {
534  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
535  N offset = ibegin + ichunk*blockDimx + threadIdxx;
536  if (offset >= iend) { break; }
537  fout(offset, tmp_out[ichunk]);
538  if (offset == n-1) {
539  *totalsum_p += tmp_out[ichunk];
540  }
541  }
542  } else if (virtual_block_id > 0) {
543 
544  if (warp == 0) {
545  T exclusive_prefix = 0;
546  BlockStatusT volatile* pbs = block_status_p;
547  for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
548  {
549  int iblock = iblock0-lane;
550  detail::STVA<T> stva{'p', 0};
551  if (iblock >= 0) {
552  stva = pbs[iblock].wait();
553  }
554 
555  T x = stva.value;
556 
557  // implement our own __ballot
558  unsigned status_bf = (stva.status == 'p') ? (0x1u << lane) : 0;
559  for (int i = 1; i < Gpu::Device::warp_size; i *= 2) {
560  status_bf |= sycl::permute_group_by_xor(sg, status_bf, i);
561  }
562 
563  bool stop_lookback = status_bf & 0x1u;
564  if (stop_lookback == false) {
565  if (status_bf != 0) {
566  T y = x;
567  if (lane > 0) { x = 0; }
568  unsigned int bit_mask = 0x1u;
569  for (int i = 1; i < Gpu::Device::warp_size; ++i) {
570  bit_mask <<= 1;
571  if (i == lane) { x = y; }
572  if (status_bf & bit_mask) {
573  stop_lookback = true;
574  break;
575  }
576  }
577  }
578 
579  for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
580  x += sycl::shift_group_left(sg, x,i);
581  }
582  }
583 
584  if (lane == 0) { exclusive_prefix += x; }
585  if (stop_lookback) { break; }
586  }
587 
588  if (lane == 0) {
589  block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
590  shared[0] = exclusive_prefix;
591  }
592  }
593 
594  gh.item->barrier(sycl::access::fence_space::local_space);
595 
596  T exclusive_prefix = shared[0];
597 
598  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
599  N offset = ibegin + ichunk*blockDimx + threadIdxx;
600  if (offset >= iend) { break; }
601  T t = tmp_out[ichunk] + exclusive_prefix;
602  fout(offset, t);
603  if (offset == n-1) {
604  *totalsum_p += t;
605  }
606  }
607  }
608  });
609 
610  T totalsum = 0;
611  if (a_ret_sum) {
612  // xxxxx SYCL todo: Should test if using pinned memory and thus
613  // avoiding memcpy is faster.
614  Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
615  }
616  Gpu::streamSynchronize();
617  The_Arena()->free(dp);
618 
619  AMREX_GPU_ERROR_CHECK();
620 
621  return totalsum;
622 }
623 
624 #elif defined(AMREX_USE_HIP)
625 
626 template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
627  typename M=std::enable_if_t<std::is_integral<N>::value &&
628  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
629  std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
630 T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
631 {
632  if (n <= 0) { return 0; }
633  constexpr int nwarps_per_block = 4;
634  constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
635  constexpr int nelms_per_thread = sizeof(T) >= 8 ? 8 : 16;
636  constexpr int nelms_per_block = nthreads * nelms_per_thread;
637  int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
638  std::size_t sm = 0;
639  auto stream = Gpu::gpuStream();
640 
641  using ScanTileState = rocprim::detail::lookback_scan_state<T>;
642  using OrderedBlockId = rocprim::detail::ordered_block_id<unsigned int>;
643 
644 #if (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 6)) || \
645  (defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR == 6) && \
646  defined(HIP_VERSION_MINOR) && (HIP_VERSION_MINOR == 0))
647 
648  std::size_t nbytes_tile_state = rocprim::detail::align_size
649  (ScanTileState::get_storage_size(nblocks));
650  std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
651 
652  auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
653 
654  ScanTileState tile_state = ScanTileState::create(dp, nblocks);
655 
656 #else
657 
658  std::size_t nbytes_tile_state;
659  AMREX_HIP_SAFE_CALL(ScanTileState::get_storage_size(nblocks, stream, nbytes_tile_state));
660  nbytes_tile_state = rocprim::detail::align_size(nbytes_tile_state);
661 
662  std::size_t nbytes_block_id = OrderedBlockId::get_storage_size();
663 
664  auto dp = (char*)(The_Arena()->alloc(nbytes_tile_state+nbytes_block_id));
665 
666  ScanTileState tile_state;
667  AMREX_HIP_SAFE_CALL(ScanTileState::create(tile_state, dp, nblocks, stream));
668 
669 #endif
670 
671  auto ordered_block_id = OrderedBlockId::create
672  (reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
673 
674  // Init ScanTileState on device
675  amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
676  {
677  auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
678  auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
679  const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;
680  if (gid == 0) { scan_bid.reset(); }
681  scan_tile_state.initialize_prefix(gid, nblocks);
682  });
683 
684  T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
685 
686  amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
687  [=] AMREX_GPU_DEVICE () noexcept
688  {
689  using BlockLoad = rocprim::block_load<T, nthreads, nelms_per_thread,
690  rocprim::block_load_method::block_load_transpose>;
691  using BlockScan = rocprim::block_scan<T, nthreads,
692  rocprim::block_scan_algorithm::using_warp_scan>;
693  using BlockExchange = rocprim::block_exchange<T, nthreads, nelms_per_thread>;
694  using LookbackScanPrefixOp = rocprim::detail::lookback_scan_prefix_op
695  <T, rocprim::plus<T>, ScanTileState>;
696 
697  __shared__ struct TempStorage {
698  typename OrderedBlockId::storage_type ordered_bid;
699  union {
700  typename BlockLoad::storage_type load;
701  typename BlockExchange::storage_type exchange;
702  typename BlockScan::storage_type scan;
703  };
704  } temp_storage;
705 
706  // Lambda captured tile_state is const. We have to cast the const away.
707  auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
708  auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
709 
710  auto const virtual_block_id = scan_bid.get(threadIdx.x, temp_storage.ordered_bid);
711 
712  // Each block processes [ibegin,iend).
713  N ibegin = nelms_per_block * virtual_block_id;
714  N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
715 
716  auto input_begin = rocprim::make_transform_iterator(
717  rocprim::make_counting_iterator(N(0)),
718  [&] (N i) -> T { return fin(i+ibegin); });
719 
720  T data[nelms_per_thread];
721  if (static_cast<int>(iend-ibegin) == nelms_per_block) {
722  BlockLoad().load(input_begin, data, temp_storage.load);
723  } else {
724  // padding with 0
725  BlockLoad().load(input_begin, data, iend-ibegin, 0, temp_storage.load);
726  }
727 
728  __syncthreads();
729 
730  constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
731 
732  if (virtual_block_id == 0) {
733  T block_agg;
734  AMREX_IF_CONSTEXPR(is_exclusive) {
735  BlockScan().exclusive_scan(data, data, T{0}, block_agg, temp_storage.scan);
736  } else {
737  BlockScan().inclusive_scan(data, data, block_agg, temp_storage.scan);
738  }
739  if (threadIdx.x == 0) {
740  if (nblocks > 1) {
741  scan_tile_state.set_complete(0, block_agg);
742  } else if (nblocks == 1 && totalsum_p) {
743  *totalsum_p = block_agg;
744  }
745  }
746  } else {
747  T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
748 
749  LookbackScanPrefixOp prefix_op(virtual_block_id, rocprim::plus<T>(), scan_tile_state);
750  AMREX_IF_CONSTEXPR(is_exclusive) {
751  BlockScan().exclusive_scan(data, data, temp_storage.scan, prefix_op,
752  rocprim::plus<T>());
753  } else {
754  BlockScan().inclusive_scan(data, data, temp_storage.scan, prefix_op,
755  rocprim::plus<T>());
756  }
757  if (totalsum_p) {
758  if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
759  T tsum = data[nelms_per_thread-1];
760  AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
761  *totalsum_p = tsum;
762  }
763  }
764  }
765 
766  __syncthreads();
767 
768  BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
769 
770  for (int i = 0; i < nelms_per_thread; ++i) {
771  N offset = ibegin + i*blockDim.x + threadIdx.x;
772  if (offset < iend) { fout(offset, data[i]); }
773  }
774  });
775 
776  Gpu::streamSynchronize();
777  AMREX_GPU_ERROR_CHECK();
778 
779  The_Arena()->free(dp);
780 
781  T ret = (a_ret_sum) ? *totalsum_p : T(0);
782  if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
783 
784  return ret;
785 }
786 
787 #elif defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
788 
789 template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
790  typename M=std::enable_if_t<std::is_integral<N>::value &&
791  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
792  std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
793 T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
794 {
795  if (n <= 0) { return 0; }
796  constexpr int nwarps_per_block = 8;
797  constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size; // # of threads per block
798  constexpr int nelms_per_thread = sizeof(T) >= 8 ? 4 : 8;
799  constexpr int nelms_per_block = nthreads * nelms_per_thread;
800  int nblocks = (n + nelms_per_block - 1) / nelms_per_block;
801  std::size_t sm = 0;
802  auto stream = Gpu::gpuStream();
803 
804  using ScanTileState = cub::ScanTileState<T>;
805  std::size_t tile_state_size = 0;
806  ScanTileState::AllocationSize(nblocks, tile_state_size);
807 
808  std::size_t nbytes_tile_state = Arena::align(tile_state_size);
809  auto tile_state_p = (char*)(The_Arena()->alloc(nbytes_tile_state));
810 
811  ScanTileState tile_state;
812  tile_state.Init(nblocks, tile_state_p, tile_state_size); // Init ScanTileState on host
813 
814  if (nblocks > 1) {
815  // Init ScanTileState on device
816  amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
817  {
818  const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
819  });
820  }
821 
822  T* totalsum_p = (a_ret_sum) ? (T*)(The_Pinned_Arena()->alloc(sizeof(T))) : nullptr;
823 
824  amrex::launch_global<nthreads> <<<nblocks, nthreads, sm, stream>>> (
825  [=] AMREX_GPU_DEVICE () noexcept
826  {
827  using BlockLoad = cub::BlockLoad<T, nthreads, nelms_per_thread, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
828  using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
829  using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;
830  using TilePrefixCallbackOp = cub::TilePrefixCallbackOp<T, cub::Sum, ScanTileState>;
831 
832  __shared__ union TempStorage
833  {
834  typename BlockLoad::TempStorage load;
835  typename BlockExchange::TempStorage exchange;
836  struct ScanStorage {
837  typename BlockScan::TempStorage scan;
838  typename TilePrefixCallbackOp::TempStorage prefix;
839  } scan_storeage;
840  } temp_storage;
841 
842  // Lambda captured tile_state is const. We have to cast the const away.
843  auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
844 
845  int virtual_block_id = blockIdx.x;
846 
847  // Each block processes [ibegin,iend).
848  N ibegin = nelms_per_block * virtual_block_id;
849  N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
850 
851  auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
852  cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
853  input_begin(cub::CountingInputIterator<N>(0), input_lambda);
854 
855  T data[nelms_per_thread];
856  if (static_cast<int>(iend-ibegin) == nelms_per_block) {
857  BlockLoad(temp_storage.load).Load(input_begin, data);
858  } else {
859  BlockLoad(temp_storage.load).Load(input_begin, data, iend-ibegin, 0); // padding with 0
860  }
861 
862  __syncthreads();
863 
864  constexpr bool is_exclusive = std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value;
865 
866  if (virtual_block_id == 0) {
867  T block_agg;
868  AMREX_IF_CONSTEXPR(is_exclusive) {
869  BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, block_agg);
870  } else {
871  BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, block_agg);
872  }
873  if (threadIdx.x == 0) {
874  if (nblocks > 1) {
875  scan_tile_state.SetInclusive(0, block_agg);
876  } else if (nblocks == 1 && totalsum_p) {
877  *totalsum_p = block_agg;
878  }
879  }
880  } else {
881  T last = data[nelms_per_thread-1]; // Need this for the total sum in exclusive case
882 
883  TilePrefixCallbackOp prefix_op(scan_tile_state, temp_storage.scan_storeage.prefix,
884  cub::Sum{}, virtual_block_id);
885  AMREX_IF_CONSTEXPR(is_exclusive) {
886  BlockScan(temp_storage.scan_storeage.scan).ExclusiveSum(data, data, prefix_op);
887  } else {
888  BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
889  }
890  if (totalsum_p) {
891  if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
892  T tsum = data[nelms_per_thread-1];
893  AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
894  *totalsum_p = tsum;
895  }
896  }
897  }
898 
899  __syncthreads();
900 
901  BlockExchange(temp_storage.exchange).BlockedToStriped(data);
902 
903  for (int i = 0; i < nelms_per_thread; ++i) {
904  N offset = ibegin + i*blockDim.x + threadIdx.x;
905  if (offset < iend) { fout(offset, data[i]); }
906  }
907  });
908 
909  Gpu::streamSynchronize();
910  AMREX_GPU_ERROR_CHECK();
911 
912  The_Arena()->free(tile_state_p);
913 
914  T ret = (a_ret_sum) ? *totalsum_p : T(0);
915  if (totalsum_p) { The_Pinned_Arena()->free(totalsum_p); }
916 
917  return ret;
918 }
919 
920 #else
921 
922 template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
923  typename M=std::enable_if_t<std::is_integral<N>::value &&
924  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ||
925  std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value)> >
926 T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = retSum)
927 {
928  if (n <= 0) { return 0; }
929  constexpr int nwarps_per_block = 4;
930  constexpr int nthreads = nwarps_per_block*Gpu::Device::warp_size;
931  constexpr int nchunks = 12;
932  constexpr int nelms_per_block = nthreads * nchunks;
933  AMREX_ALWAYS_ASSERT(static_cast<Long>(n) < static_cast<Long>(std::numeric_limits<int>::max())*nelms_per_block);
934  int nblocks = (static_cast<Long>(n) + nelms_per_block - 1) / nelms_per_block;
935  std::size_t sm = sizeof(T) * (Gpu::Device::warp_size + nwarps_per_block) + sizeof(int);
936  auto stream = Gpu::gpuStream();
937 
938  using BlockStatusT = std::conditional_t<sizeof(detail::STVA<T>) <= 8,
939  detail::BlockStatus<T,true>, detail::BlockStatus<T,false> >;
940 
941  std::size_t nbytes_blockstatus = Arena::align(sizeof(BlockStatusT)*nblocks);
942  std::size_t nbytes_blockid = Arena::align(sizeof(unsigned int));
943  std::size_t nbytes_totalsum = Arena::align(sizeof(T));
944  auto dp = (char*)(The_Arena()->alloc( nbytes_blockstatus
945  + nbytes_blockid
946  + nbytes_totalsum));
947  BlockStatusT* AMREX_RESTRICT block_status_p = (BlockStatusT*)dp;
948  unsigned int* AMREX_RESTRICT virtual_block_id_p = (unsigned int*)(dp + nbytes_blockstatus);
949  T* AMREX_RESTRICT totalsum_p = (T*)(dp + nbytes_blockstatus + nbytes_blockid);
950 
951  amrex::ParallelFor(nblocks, [=] AMREX_GPU_DEVICE (int i) noexcept {
952  BlockStatusT& block_status = block_status_p[i];
953  block_status.set_status('x');
954  if (i == 0) {
955  *virtual_block_id_p = 0;
956  *totalsum_p = 0;
957  }
958  });
959 
960  amrex::launch(nblocks, nthreads, sm, stream,
961  [=] AMREX_GPU_DEVICE () noexcept
962  {
963  int lane = threadIdx.x % Gpu::Device::warp_size;
964  int warp = threadIdx.x / Gpu::Device::warp_size;
965  int nwarps = blockDim.x / Gpu::Device::warp_size;
966 
967  Gpu::SharedMemory<T> gsm;
968  T* shared = gsm.dataPtr();
969  T* shared2 = shared + Gpu::Device::warp_size;
970 
971  // First of all, get block virtual id. We must do this to
972  // avoid deadlock because CUDA may launch blocks in any order.
973  // Anywhere in this function, we should not use blockIdx.
974  int virtual_block_id = 0;
975  if (gridDim.x > 1) {
976  int& virtual_block_id_shared = *((int*)(shared2+nwarps));
977  if (threadIdx.x == 0) {
978  unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
979  virtual_block_id_shared = bid;
980  }
981  __syncthreads();
982  virtual_block_id = virtual_block_id_shared;
983  }
984 
985  // Each block processes [ibegin,iend).
986  N ibegin = nelms_per_block * virtual_block_id;
987  N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);
988  BlockStatusT& block_status = block_status_p[virtual_block_id];
989 
990  //
991  // The overall algorithm is based on "Single-pass Parallel
992  // Prefix Scan with Decoupled Look-back" by D. Merrill &
993  // M. Garland.
994  //
995 
996  // Each block is responsible for nchunks chunks of data,
997  // where each chunk has blockDim.x elements, one for each
998  // thread in the block.
999  T sum_prev_chunk = 0; // inclusive sum from previous chunks.
1000  T tmp_out[nchunks]; // block-wide inclusive sum for chunks
1001  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1002  N offset = ibegin + ichunk*blockDim.x;
1003  if (offset >= iend) { break; }
1004 
1005  offset += threadIdx.x;
1006  T x0 = (offset < iend) ? fin(offset) : 0;
1007  if (std::is_same<std::decay_t<TYPE>,Type::Exclusive>::value && offset == n-1) {
1008  *totalsum_p += x0;
1009  }
1010  T x = x0;
1011  // Scan within a warp
1012  for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) {
1013  AMREX_HIP_OR_CUDA( T s = __shfl_up(x,i);,
1014  T s = __shfl_up_sync(0xffffffff, x, i); )
1015  if (lane >= i) { x += s; }
1016  }
1017 
1018  // x now holds the inclusive sum within the warp. The
1019  // last thread in each warp holds the inclusive sum of
1020  // this warp. We will store it in shared memory.
1021  if (lane == Gpu::Device::warp_size - 1) {
1022  shared[warp] = x;
1023  }
1024 
1025  __syncthreads();
1026 
1027  // The first warp will do scan on the warp sums for the
1028  // whole block. Not all threads in the warp need to
1029  // participate.
1030 #ifdef AMREX_USE_CUDA
1031  if (warp == 0 && lane < nwarps) {
1032  T y = shared[lane];
1033  int mask = (1 << nwarps) - 1;
1034  for (int i = 1; i <= nwarps; i *= 2) {
1035  T s = __shfl_up_sync(mask, y, i, nwarps);
1036  if (lane >= i) { y += s; }
1037  }
1038  shared2[lane] = y;
1039  }
1040 #else
1041  if (warp == 0) {
1042  T y = 0;
1043  if (lane < nwarps) {
1044  y = shared[lane];
1045  }
1046  for (int i = 1; i <= nwarps; i *= 2) {
1047  T s = __shfl_up(y, i, nwarps);
1048  if (lane >= i) { y += s; }
1049  }
1050  if (lane < nwarps) {
1051  shared2[lane] = y;
1052  }
1053  }
1054 #endif
1055 
1056  __syncthreads();
1057 
1058  // shared[0:nwarps) holds the inclusive sum of warp sums.
1059 
1060  // Also note x still holds the inclusive sum within the
1061  // warp. Given these two, we can compute the inclusive
1062  // sum within this chunk.
1063  T sum_prev_warp = (warp == 0) ? 0 : shared2[warp-1];
1064  tmp_out[ichunk] = sum_prev_warp + sum_prev_chunk +
1065  (std::is_same<std::decay_t<TYPE>,Type::Inclusive>::value ? x : x-x0);
1066  sum_prev_chunk += shared2[nwarps-1];
1067  }
1068 
1069  // sum_prev_chunk now holds the sum of the whole block.
1070  if (threadIdx.x == 0 && gridDim.x > 1) {
1071  block_status.write((virtual_block_id == 0) ? 'p' : 'a',
1072  sum_prev_chunk);
1073  }
1074 
1075  if (virtual_block_id == 0) {
1076  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1077  N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
1078  if (offset >= iend) { break; }
1079  fout(offset, tmp_out[ichunk]);
1080  if (offset == n-1) {
1081  *totalsum_p += tmp_out[ichunk];
1082  }
1083  }
1084  } else if (virtual_block_id > 0) {
1085 
1086  if (warp == 0) {
1087  T exclusive_prefix = 0;
1088  BlockStatusT volatile* pbs = block_status_p;
1089  for (int iblock0 = virtual_block_id-1; iblock0 >= 0; iblock0 -= Gpu::Device::warp_size)
1090  {
1091  int iblock = iblock0-lane;
1092  detail::STVA<T> stva{'p', 0};
1093  if (iblock >= 0) {
1094  stva = pbs[iblock].wait();
1095  }
1096 
1097  T x = stva.value;
1098 
1099  AMREX_HIP_OR_CUDA( uint64_t const status_bf = __ballot(stva.status == 'p');,
1100  unsigned const status_bf = __ballot_sync(0xffffffff, stva.status == 'p'));
1101  bool stop_lookback = status_bf & 0x1u;
1102  if (stop_lookback == false) {
1103  if (status_bf != 0) {
1104  T y = x;
1105  if (lane > 0) { x = 0; }
1106  AMREX_HIP_OR_CUDA(uint64_t bit_mask = 0x1ull;,
1107  unsigned bit_mask = 0x1u);
1108  for (int i = 1; i < Gpu::Device::warp_size; ++i) {
1109  bit_mask <<= 1;
1110  if (i == lane) { x = y; }
1111  if (status_bf & bit_mask) {
1112  stop_lookback = true;
1113  break;
1114  }
1115  }
1116  }
1117 
1118  for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) {
1119  AMREX_HIP_OR_CUDA( x += __shfl_down(x,i);,
1120  x += __shfl_down_sync(0xffffffff, x, i); )
1121  }
1122  }
1123 
1124  if (lane == 0) { exclusive_prefix += x; }
1125  if (stop_lookback) { break; }
1126  }
1127 
1128  if (lane == 0) {
1129  block_status.write('p', block_status.get_aggregate() + exclusive_prefix);
1130  shared[0] = exclusive_prefix;
1131  }
1132  }
1133 
1134  __syncthreads();
1135 
1136  T exclusive_prefix = shared[0];
1137 
1138  for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1139  N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
1140  if (offset >= iend) { break; }
1141  T t = tmp_out[ichunk] + exclusive_prefix;
1142  fout(offset, t);
1143  if (offset == n-1) {
1144  *totalsum_p += t;
1145  }
1146  }
1147  }
1148  });
1149 
1150  T totalsum = 0;
1151  if (a_ret_sum) {
1152  // xxxxx CUDA < 11 todo: Should test if using pinned memory and thus
1153  // avoiding memcpy is faster.
1154  Gpu::dtoh_memcpy_async(&totalsum, totalsum_p, sizeof(T));
1155  }
1156  Gpu::streamSynchronize();
1157  The_Arena()->free(dp);
1158 
1159  AMREX_GPU_ERROR_CHECK();
1160 
1161  return totalsum;
1162 }
1163 
1164 #endif
1165 
1166 // The return value is the total sum if a_ret_sum is true.
1167 template <typename N, typename T, typename M=std::enable_if_t<std::is_integral<N>::value> >
1168 T InclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1169 {
1170 #if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1171  void* d_temp = nullptr;
1172  std::size_t temp_bytes = 0;
1173  AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1174  Gpu::gpuStream()));
1175  d_temp = The_Arena()->alloc(temp_bytes);
1176  AMREX_GPU_SAFE_CALL(cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, in, out, n,
1177  Gpu::gpuStream()));
1178  T totalsum = 0;
1179  if (a_ret_sum) {
1180  Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1181  }
1182  Gpu::streamSynchronize();
1183  The_Arena()->free(d_temp);
1184  AMREX_GPU_ERROR_CHECK();
1185  return totalsum;
1186 #elif defined(AMREX_USE_HIP)
1187  void* d_temp = nullptr;
1188  std::size_t temp_bytes = 0;
1189  AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1190  rocprim::plus<T>(), Gpu::gpuStream()));
1191  d_temp = The_Arena()->alloc(temp_bytes);
1192  AMREX_GPU_SAFE_CALL(rocprim::inclusive_scan(d_temp, temp_bytes, in, out, n,
1193  rocprim::plus<T>(), Gpu::gpuStream()));
1194  T totalsum = 0;
1195  if (a_ret_sum) {
1196  Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1197  }
1198  Gpu::streamSynchronize();
1199  The_Arena()->free(d_temp);
1200  AMREX_GPU_ERROR_CHECK();
1201  return totalsum;
1202 #elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1203  auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1204  std::inclusive_scan(policy, in, in+n, out, std::plus<T>(), T(0));
1205  T totalsum = 0;
1206  if (a_ret_sum) {
1207  Gpu::dtoh_memcpy_async(&totalsum, out+(n-1), sizeof(T));
1208  }
1209  Gpu::streamSynchronize();
1210  AMREX_GPU_ERROR_CHECK();
1211  return totalsum;
1212 #else
1213  if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1214  return PrefixSum<T>(static_cast<int>(n),
1215  [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1216  [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1217  Type::inclusive, a_ret_sum);
1218  } else {
1219  return PrefixSum<T>(n,
1220  [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1221  [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1222  Type::inclusive, a_ret_sum);
1223  }
1224 #endif
1225 }
1226 
1227 // The return value is the total sum if a_ret_sum is true.
1228 template <typename N, typename T, typename M=std::enable_if_t<std::is_integral<N>::value> >
1229 T ExclusiveSum (N n, T const* in, T * out, RetSum a_ret_sum = retSum)
1230 {
1231  if (n <= 0) { return 0; }
1232 #if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
1233  T in_last = 0;
1234  if (a_ret_sum) {
1235  Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1236  }
1237  void* d_temp = nullptr;
1238  std::size_t temp_bytes = 0;
1239  AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1240  Gpu::gpuStream()));
1241  d_temp = The_Arena()->alloc(temp_bytes);
1242  AMREX_GPU_SAFE_CALL(cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, in, out, n,
1243  Gpu::gpuStream()));
1244  T out_last = 0;
1245  if (a_ret_sum) {
1246  Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1247  }
1248  Gpu::streamSynchronize();
1249  The_Arena()->free(d_temp);
1250  AMREX_GPU_ERROR_CHECK();
1251  return in_last+out_last;
1252 #elif defined(AMREX_USE_HIP)
1253  T in_last = 0;
1254  if (a_ret_sum) {
1255  Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1256  }
1257  void* d_temp = nullptr;
1258  std::size_t temp_bytes = 0;
1259  AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1260  rocprim::plus<T>(), Gpu::gpuStream()));
1261  d_temp = The_Arena()->alloc(temp_bytes);
1262  AMREX_GPU_SAFE_CALL(rocprim::exclusive_scan(d_temp, temp_bytes, in, out, T{0}, n,
1263  rocprim::plus<T>(), Gpu::gpuStream()));
1264  T out_last = 0;
1265  if (a_ret_sum) {
1266  Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1267  }
1268  Gpu::streamSynchronize();
1269  The_Arena()->free(d_temp);
1270  AMREX_GPU_ERROR_CHECK();
1271  return in_last+out_last;
1272 #elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
1273  T in_last = 0;
1274  if (a_ret_sum) {
1275  Gpu::dtoh_memcpy_async(&in_last, in+(n-1), sizeof(T));
1276  }
1277  auto policy = oneapi::dpl::execution::make_device_policy(Gpu::Device::streamQueue());
1278  std::exclusive_scan(policy, in, in+n, out, T(0), std::plus<T>());
1279  T out_last = 0;
1280  if (a_ret_sum) {
1281  Gpu::dtoh_memcpy_async(&out_last, out+(n-1), sizeof(T));
1282  }
1283  Gpu::streamSynchronize();
1284  AMREX_GPU_ERROR_CHECK();
1285  return in_last+out_last;
1286 #else
1287  if (static_cast<Long>(n) <= static_cast<Long>(std::numeric_limits<int>::max())) {
1288  return PrefixSum<T>(static_cast<int>(n),
1289  [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1290  [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1291  Type::exclusive, a_ret_sum);
1292  } else {
1293  return PrefixSum<T>(n,
1294  [=] AMREX_GPU_DEVICE (N i) -> T { return in[i]; },
1295  [=] AMREX_GPU_DEVICE (N i, T const& x) { out[i] = x; },
1296  Type::exclusive, a_ret_sum);
1297  }
1298 #endif
1299 }
1300 
1301 #else
1302 // !defined(AMREX_USE_GPU)
1303 template <typename T, typename N, typename FIN, typename FOUT, typename TYPE,
1304  typename M=std::enable_if_t<std::is_integral_v<N> &&
1305  (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive> ||
1306  std::is_same_v<std::decay_t<TYPE>,Type::Exclusive>)> >
1307 T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum = retSum)
1308 {
1309  if (n <= 0) { return 0; }
1310  T totalsum = 0;
1311  for (N i = 0; i < n; ++i) {
1312  T x = fin(i);
1313  T y = totalsum;
1314  totalsum += x;
1315  AMREX_IF_CONSTEXPR (std::is_same_v<std::decay_t<TYPE>,Type::Inclusive>) {
1316  y += x;
1317  }
1318  fout(i, y);
1319  }
1320  return totalsum;
1321 }
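// For example, with fin returning {1, 2, 3} (hypothetical input), the
// inclusive variant passes {1, 3, 6} to fout, the exclusive variant passes
// {0, 1, 3}, and both return the total 6.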
1322 
1323 // The return value is the total sum.
1324 template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1325 T InclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1326 {
1327 #if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1328  // GCC's __cplusplus is not a reliable indication for C++17 support
1329  std::inclusive_scan(in, in+n, out);
1330 #else
1331  std::partial_sum(in, in+n, out);
1332 #endif
1333  return (n > 0) ? out[n-1] : T(0);
1334 }
1335 
1336 // The return value is the total sum.
1337 template <typename N, typename T, typename M=std::enable_if_t<std::is_integral_v<N>> >
1338 T ExclusiveSum (N n, T const* in, T * out, RetSum /*a_ret_sum*/ = retSum)
1339 {
1340  if (n <= 0) { return 0; }
1341 
1342  auto in_last = in[n-1];
1343 #if (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1344  // GCC's __cplusplus is not a reliable indication for C++17 support
1345  std::exclusive_scan(in, in+n, out, 0);
1346 #else
1347  out[0] = 0;
1348  std::partial_sum(in, in+n-1, out+1);
1349 #endif
1350  return in_last + out[n-1];
1351 }
1352 
1353 #endif
1354 
1355 }
1356 
1357 namespace Gpu
1358 {
1359  template<class InIter, class OutIter>
1360  OutIter inclusive_scan (InIter begin, InIter end, OutIter result)
1361  {
1362 #if defined(AMREX_USE_GPU)
1363  auto N = std::distance(begin, end);
1364  Scan::InclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1365  OutIter result_end = result;
1366  std::advance(result_end, N);
1367  return result_end;
1368 #elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1369  // GCC's __cplusplus is not a reliable indication for C++17 support
1370  return std::inclusive_scan(begin, end, result);
1371 #else
1372  return std::partial_sum(begin, end, result);
1373 #endif
1374  }
1375 
1376  template<class InIter, class OutIter>
1377  OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
1378  {
1379 #if defined(AMREX_USE_GPU)
1380  auto N = std::distance(begin, end);
1381  Scan::ExclusiveSum(N, &(*begin), &(*result), Scan::noRetSum);
1382  OutIter result_end = result;
1383  std::advance(result_end, N);
1384  return result_end;
1385 #elif (__cplusplus >= 201703L) && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10)
1386  // GCC's __cplusplus is not a reliable indication for C++17 support
1387  return std::exclusive_scan(begin, end, result, 0);
1388 #else
1389  if (begin == end) { return result; }
1390 
1391  typename std::iterator_traits<InIter>::value_type sum = *begin;
1392  *result++ = sum - *begin;
1393 
1394  while (++begin != end) {
1395  sum = std::move(sum) + *begin;
1396  *result++ = sum - *begin;
1397  }
1398  return result; // result already points one past the last element written
1399 #endif
1400  }
1401 
1402 }}
1403 
1404 #endif
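A minimal host-side usage sketch of the public entry points in this header, assuming an initialized AMReX session (amrex::Initialize/Finalize elsewhere); the size nvals, the device vectors, and the example() wrapper are illustrative and not part of AMReX_Scan.H:

    #include <AMReX.H>
    #include <AMReX_Gpu.H>
    #include <AMReX_GpuContainers.H>
    #include <AMReX_Scan.H>

    void example ()
    {
        int nvals = 1000;                                // illustrative size
        amrex::Gpu::DeviceVector<int> in(nvals);
        amrex::Gpu::DeviceVector<int> out(nvals);
        int* pin = in.data();
        int* pout = out.data();

        // Fill the input with ones on the device.
        amrex::ParallelFor(nvals, [=] AMREX_GPU_DEVICE (int i) { pin[i] = 1; });

        // Pointer-based form: exclusive prefix sum; retSum (the default)
        // returns the total, here equal to nvals.
        int total = amrex::Scan::ExclusiveSum(nvals, pin, pout, amrex::Scan::retSum);

        // Lambda-based form: fin supplies element i, fout stores result i.
        // noRetSum skips the device-to-host copy of the total.
        amrex::Scan::PrefixSum<int>(nvals,
            [=] AMREX_GPU_DEVICE (int i) -> int { return pin[i]; },
            [=] AMREX_GPU_DEVICE (int i, int const& x) { pout[i] = x; },
            amrex::Scan::Type::inclusive, amrex::Scan::noRetSum);

        amrex::Gpu::streamSynchronize();
        amrex::ignore_unused(total);
    }

The same calls compile in CPU-only builds, where the header falls back to std::inclusive_scan, std::exclusive_scan, or std::partial_sum.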