AMReX_GpuContainers.H
#ifndef AMREX_GPU_CONTAINERS_H_
#define AMREX_GPU_CONTAINERS_H_
#include <AMReX_Config.H>

#include <AMReX_Vector.H>
#include <AMReX_PODVector.H>
#include <AMReX_GpuAllocators.H>
#include <type_traits>

#include <cstring> // for the std::memcpy fallbacks in host-only builds
#include <numeric>
#include <iterator>

namespace amrex::Gpu {

#ifdef AMREX_USE_GPU

    /**
     * \brief A PODVector that uses the standard memory Arena. Depending on
     * the amrex.the_arena_is_managed ParmParse parameter, the memory may
     * or may not be managed.
     */
    template <class T>
    using DeviceVector = PODVector<T, ArenaAllocator<T> >;

    /**
     * \brief A PODVector that uses the non-managed device memory arena.
     */
    template <class T>
    using NonManagedVector = PODVector<T, DeviceArenaAllocator<T> >;

    /**
     * \brief A PODVector that uses the managed memory arena.
     */
    template <class T>
    using ManagedVector = PODVector<T, ManagedArenaAllocator<T> >;

    /**
     * \brief A PODVector that uses the pinned memory arena.
     */
    template <class T>
    using PinnedVector = PODVector<T, PinnedArenaAllocator<T> >;

    /**
     * \brief A PODVector that uses the async memory arena. Useful for
     * temporary vectors that are accessed on the device.
     */
    template <class T>
    using AsyncVector = PODVector<T, AsyncArenaAllocator<T> >;

    /**
     * \brief A PODVector that uses pinned host memory. Same as PinnedVector.
     */
    template <class T>
    using HostVector = PinnedVector<T>;

    /**
     * \brief Identical to ManagedVector<T>; this form is deprecated.
     */
    template <class T>
    using ManagedDeviceVector = PODVector<T, ManagedArenaAllocator<T> >;

#else
    //! When GPU support is disabled, all of these containers revert to
    //! PODVector with the standard allocator.
    template <class T>
    using DeviceVector = PODVector<T>;

    template <class T>
    using HostVector = PODVector<T>;

    template <class T>
    using NonManagedVector = PODVector<T>;

    template <class T>
    using ManagedVector = PODVector<T>;

    template <class T>
    using ManagedDeviceVector = PODVector<T>;

    template <class T>
    using PinnedVector = PODVector<T>;

    template <class T>
    using AsyncVector = PODVector<T>;
#endif
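
    /* Illustrative sketch (not part of the original header): picking a
       container for a given memory space. The size `n` is a hypothetical
       placeholder.

       \code
       int n = 256;
       amrex::Gpu::DeviceVector<amrex::Real>  dv(n); // device Arena memory
       amrex::Gpu::PinnedVector<amrex::Real>  pv(n); // page-locked host memory
       amrex::Gpu::ManagedVector<amrex::Real> mv(n); // managed/unified memory
       \endcode
    */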

    //! Tag types used to dispatch the copy routines below.
    struct HostToDevice {};
    struct DeviceToHost {};
    struct DeviceToDevice {};
    static constexpr HostToDevice hostToDevice{};
    static constexpr DeviceToHost deviceToHost{};
    static constexpr DeviceToDevice deviceToDevice{};

    /**
     * \brief A host-to-device copy routine. Note this is just a wrapper
     * around memcpy, so it assumes the storage is contiguous.
     */
    template<class InIter, class OutIter>
    void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
#ifdef AMREX_USE_GPU
        htod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
#else
        std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
#endif
    }

    /**
     * \brief A device-to-host copy routine. Note this is just a wrapper
     * around memcpy, so it assumes the storage is contiguous.
     */
    template<class InIter, class OutIter>
    void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
#ifdef AMREX_USE_GPU
        dtoh_memcpy(&(*result), &(*begin), size*sizeof(value_type));
#else
        std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
#endif
    }

    /**
     * \brief A device-to-device copy routine. Note this is just a wrapper
     * around memcpy, so it assumes the storage is contiguous.
     */
    template<class InIter, class OutIter>
    void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
#ifdef AMREX_USE_GPU
        dtod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
#else
        std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
#endif
    }
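
    /* Usage sketch (illustrative): a blocking host-device round trip with
       the copy() overloads above. Names and sizes are hypothetical.

       \code
       amrex::Gpu::HostVector<amrex::Real> h(16);
       std::iota(h.begin(), h.end(), amrex::Real(0.));
       amrex::Gpu::DeviceVector<amrex::Real> d(16);
       amrex::Gpu::copy(amrex::Gpu::hostToDevice, h.begin(), h.end(), d.begin());
       // ... launch kernels that read or write d ...
       amrex::Gpu::copy(amrex::Gpu::deviceToHost, d.begin(), d.end(), h.begin());
       // h can be read immediately; copy() blocks until the transfer is done.
       \endcode
    */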

    /**
     * \brief An asynchronous host-to-device copy routine. Note this is just
     * a wrapper around memcpy, so it assumes the storage is contiguous.
     *
     * The copy is enqueued on the current GPU stream and may return before
     * it completes; do not modify or free either buffer before the stream
     * has been synchronized (e.g., with Gpu::streamSynchronize()).
     */
    template<class InIter, class OutIter>
    void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
#ifdef AMREX_USE_GPU
        htod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
#else
        std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
#endif
    }

    /**
     * \brief An asynchronous device-to-host copy routine. Note this is just
     * a wrapper around memcpy, so it assumes the storage is contiguous.
     *
     * The copy is enqueued on the current GPU stream and may return before
     * it completes; synchronize the stream before reading the result.
     */
    template<class InIter, class OutIter>
    void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
#ifdef AMREX_USE_GPU
        dtoh_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
#else
        std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
#endif
    }

    /**
     * \brief An asynchronous device-to-device copy routine. Note this is
     * just a wrapper around memcpy, so it assumes the storage is contiguous.
     *
     * The copy is enqueued on the current GPU stream and may return before
     * it completes; synchronize the stream before reusing either buffer.
     */
    template<class InIter, class OutIter>
    void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
#ifdef AMREX_USE_GPU
        dtod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
#else
        std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
#endif
    }
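
    /* Usage sketch (illustrative): overlapping host work with an
       asynchronous copy. Unlike copy(), the buffers must stay alive and
       unmodified until the stream is synchronized. `n` is a hypothetical
       size.

       \code
       amrex::Gpu::DeviceVector<amrex::Real> d(n);
       amrex::Gpu::PinnedVector<amrex::Real> h(n);
       amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost,
                             d.begin(), d.end(), h.begin());
       // ... independent host work here ...
       amrex::Gpu::streamSynchronize(); // h is now safe to read
       \endcode
    */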

    /**
     * \brief Migrate elements of a container from device to host.
     * This is a no-op for host-only code.
     */
    template<class Iter>
    void prefetchToHost (Iter begin, Iter end) noexcept
    {
        using value_type = typename std::iterator_traits<Iter>::value_type;
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }

#ifdef AMREX_USE_GPU
        // Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
        if (Gpu::Device::devicePropMajor() >= 6) {
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
                                                      size*sizeof(value_type),
                                                      cudaCpuDeviceId,
                                                      Gpu::gpuStream()));
        }
#endif
#endif

        amrex::ignore_unused(begin, end, size);
    }

    /**
     * \brief Migrate elements of a container from host to device.
     * This is a no-op for host-only code.
     */
    template<class Iter>
    void prefetchToDevice (Iter begin, Iter end) noexcept
    {
        using value_type = typename std::iterator_traits<Iter>::value_type;
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }

#ifdef AMREX_USE_GPU
        // Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
        if (Gpu::Device::devicePropMajor() >= 6) {
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
                                                      size*sizeof(value_type),
                                                      Gpu::Device::deviceId(),
                                                      Gpu::gpuStream()));
        }
#endif
#endif

        amrex::ignore_unused(begin, end, size);
    }
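
    /* Usage sketch (illustrative): hinting the driver to migrate a managed
       vector before first touch, which can avoid page faults. `n` is a
       hypothetical size.

       \code
       amrex::Gpu::ManagedVector<amrex::Real> mv(n);
       // ... initialize mv on the host ...
       amrex::Gpu::prefetchToDevice(mv.begin(), mv.end());
       // ... launch kernels that access mv ...
       amrex::Gpu::prefetchToHost(mv.begin(), mv.end());
       \endcode
    */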

    /**
     * \brief Fill the elements in the given range using the given callable.
     *
     * The callable is invoked as f(element, i), where the element is passed
     * by reference and i is the Long index of that element in the range.
     * This function is asynchronous on GPU builds.
     */
    template <typename IT, typename F,
              typename T = typename std::iterator_traits<IT>::value_type,
              std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
                               std::is_trivially_copyable_v<T> &&
                               amrex::IsCallable<F, T&, Long>::value,
                               int> FOO = 0>
    void fillAsync (IT first, IT last, F const& f) noexcept
    {
        auto N = static_cast<Long>(std::distance(first, last));
        if (N <= 0) { return; }
        auto p = &(*first);
#ifndef AMREX_USE_GPU
        for (Long i = 0; i < N; ++i) {
            f(p[i], i);
        }
#else
        // No need to use shared memory if the type is small.
        // May not have enough shared memory if the type is too big.
        // Cannot use shared memory if the type is not trivially copyable.
        if constexpr ((sizeof(T) <= 8)
                      || (sizeof(T) > 36*8)
                      || ! std::is_trivially_copyable<T>()) {
            amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
            {
                f(p[i], i);
            });
        } else {
            static_assert(sizeof(T) % sizeof(unsigned int) == 0);
            using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
                                         unsigned long long, unsigned int>;
            constexpr Long nU = sizeof(T) / sizeof(U);
            auto pu = reinterpret_cast<U*>(p);
            int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
            int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
#ifdef AMREX_USE_SYCL
            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
            {
                Long i = handler.globalIdx();
                Long blockDimx = handler.blockDim();
                Long threadIdxx = handler.threadIdx();
                Long blockIdxx = handler.blockIdx();
                auto const shared_U = (U*)handler.sharedMemory();
                auto const shared_T = (T*)shared_U;
                if (i < N) {
                    // Construct the element in shared memory and let the
                    // callable fill it.
                    auto ga = new(shared_T+threadIdxx) T;
                    f(*ga, i);
                }
                handler.sharedBarrier();
                // Copy the block's elements from shared to global memory in
                // word-sized chunks.
                for (Long m = threadIdxx,
                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                     m < mend; m += blockDimx) {
                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                }
            });
#else
            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
            [=] AMREX_GPU_DEVICE () noexcept
            {
                Long blockDimx = blockDim.x;
                Long threadIdxx = threadIdx.x;
                Long blockIdxx = blockIdx.x;
                Long i = blockDimx*blockIdxx + threadIdxx;
                Gpu::SharedMemory<U> gsm;
                auto const shared_U = gsm.dataPtr();
                auto const shared_T = (T*)shared_U;
                if (i < N) {
                    auto ga = new(shared_T+threadIdxx) T;
                    f(*ga, i);
                }
                __syncthreads();
                for (Long m = threadIdxx,
                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                     m < mend; m += blockDimx) {
                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                }
            });
#endif
        }
#endif
    }
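
    /* Usage sketch (illustrative): initializing device elements in place
       with fillAsync. The callable receives a reference to the element and
       its Long index. `n` is a hypothetical size; synchronize before
       reading the result on the host.

       \code
       amrex::Gpu::DeviceVector<amrex::Real> dv(n);
       amrex::Gpu::fillAsync(dv.begin(), dv.end(),
           [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) noexcept
           {
               x = static_cast<amrex::Real>(2*i);
           });
       amrex::Gpu::streamSynchronize();
       \endcode
    */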

}

#endif