Block-Structured AMR Software Framework
 
AMReX_GpuContainers.H
1#ifndef AMREX_GPU_CONTAINERS_H_
2#define AMREX_GPU_CONTAINERS_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Vector.H>
6#include <AMReX_PODVector.H>
8#include <type_traits>
9
10#include <numeric>
11#include <iterator>
12
13namespace amrex::Gpu {
14
15#ifdef AMREX_USE_GPU
16
23 template <class T>
25
31 template <class T>
33
39 template <class T>
41
47 template <class T>
49
57 template <class T>
59
66 template <class T>
68
75 template <class T>
77
78#else
80 template <class T>
82
83 template <class T>
85
86 template <class T>
87 using NonManagedVector = PODVector<T>;
88
89 template <class T>
91
92 template <class T>
94
95 template <class T>
97
98 template <class T>
100#endif
101
102 struct HostToDevice {};
103 struct DeviceToHost {};
104 struct DeviceToDevice {};
105 static constexpr HostToDevice hostToDevice{};
106 static constexpr DeviceToHost deviceToHost{};
107 static constexpr DeviceToDevice deviceToDevice{};
108
127 template<class InIter, class OutIter>
128 void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
129 {
130 using value_type = typename std::iterator_traits<InIter>::value_type;
131
132 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
133 static_assert(std::is_same_v<value_type, out_value_type>);
134 static_assert(std::is_trivially_copyable<value_type>(),
135 "Can only copy trivially copyable types");
136
137 auto size = std::distance(begin, end);
138 if (size == 0) { return; }
139 htod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
140 }
141
160 template<class InIter, class OutIter>
161 void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
162 {
163 using value_type = typename std::iterator_traits<InIter>::value_type;
164
165 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
166 static_assert(std::is_same_v<value_type, out_value_type>);
167 static_assert(std::is_trivially_copyable<value_type>(),
168 "Can only copy trivially copyable types");
169
170 auto size = std::distance(begin, end);
171 if (size == 0) { return; }
172 dtoh_memcpy(&(*result), &(*begin), size*sizeof(value_type));
173 }
174
193 template<class InIter, class OutIter>
194 void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
195 {
196 using value_type = typename std::iterator_traits<InIter>::value_type;
197
198 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
199 static_assert(std::is_same_v<value_type, out_value_type>);
200 static_assert(std::is_trivially_copyable<value_type>(),
201 "Can only copy trivially copyable types");
202
203 auto size = std::distance(begin, end);
204 if (size == 0) { return; }
205 dtod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
206 }
207
227 template<class InIter, class OutIter>
228 void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
229 {
230 using value_type = typename std::iterator_traits<InIter>::value_type;
231
232 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
233 static_assert(std::is_same_v<value_type, out_value_type>);
234 static_assert(std::is_trivially_copyable<value_type>(),
235 "Can only copy trivially copyable types");
236
237 auto size = std::distance(begin, end);
238 if (size == 0) { return; }
239 htod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
240 }
241
261 template<class InIter, class OutIter>
262 void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
263 {
264 using value_type = typename std::iterator_traits<InIter>::value_type;
265
266 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
267 static_assert(std::is_same_v<value_type, out_value_type>);
268 static_assert(std::is_trivially_copyable<value_type>(),
269 "Can only copy trivially copyable types");
270
271 auto size = std::distance(begin, end);
272 if (size == 0) { return; }
273 dtoh_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
274 }
275
295 template<class InIter, class OutIter>
296 void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
297 {
298 using value_type = typename std::iterator_traits<InIter>::value_type;
299
300 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
301 static_assert(std::is_same_v<value_type, out_value_type>);
302 static_assert(std::is_trivially_copyable<value_type>(),
303 "Can only copy trivially copyable types");
304
305 auto size = std::distance(begin, end);
306 if (size == 0) { return; }
307 dtod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
308 }
309
316 template<class Iter>
317 void prefetchToHost (Iter begin, Iter end) noexcept
318 {
319 using value_type = typename std::iterator_traits<Iter>::value_type;
320 static_assert(std::is_trivially_copyable<value_type>(),
321 "Can only copy trivially copyable types");
322
323 auto size = std::distance(begin, end);
324 if (size == 0) { return; }
325
326#ifdef AMREX_USE_GPU
327 // Currently only implemented for CUDA.
328#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
329 if (Gpu::Device::devicePropMajor() >= 6) {
330#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
331 cudaMemLocation location = {};
332 location.type = cudaMemLocationTypeDevice;
333 location.id = cudaCpuDeviceId;
334 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
335 size*sizeof(value_type),
336 location, 0,
337 Gpu::gpuStream()));
338#else
339 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
340 size*sizeof(value_type),
341 cudaCpuDeviceId,
342 Gpu::gpuStream()));
343#endif
344 }
345#endif
346#endif
347
349 }
350
357 template<class Iter>
358 void prefetchToDevice (Iter begin, Iter end) noexcept
359 {
360 using value_type = typename std::iterator_traits<Iter>::value_type;
361 static_assert(std::is_trivially_copyable<value_type>(),
362 "Can only copy trivially copyable types");
363
364 auto size = std::distance(begin, end);
365 if (size == 0) { return; }
366
367#ifdef AMREX_USE_GPU
368 // Currently only implemented for CUDA.
369#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
370 if (Gpu::Device::devicePropMajor() >= 6) {
371#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
372 cudaMemLocation location = {};
373 location.type = cudaMemLocationTypeDevice;
374 location.id = Gpu::Device::deviceId();
375 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
376 size*sizeof(value_type),
377 location, 0,
378 Gpu::gpuStream()));
379#else
380 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
381 size*sizeof(value_type),
382 Gpu::Device::deviceId(),
383 Gpu::gpuStream()));
384#endif
385 }
386#endif
387#endif
388
390 }
391
407 template <typename IT, typename F,
408 typename T = typename std::iterator_traits<IT>::value_type,
409 std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
410 std::is_trivially_copyable_v<T> &&
411 amrex::IsCallable<F, T&, Long>::value,
412 int> FOO = 0>
413 void fillAsync (IT first, IT last, F const& f) noexcept
414 {
415 auto N = static_cast<Long>(std::distance(first, last));
416 if (N <= 0) { return; }
417 auto p = &(*first);
418#ifndef AMREX_USE_GPU
419 for (Long i = 0; i < N; ++i) {
420 f(p[i], i);
421 }
422#else
423 // No need to use shared memory if the type is small.
424 // May not have enough shared memory if the type is too big.
425 // Cannot use shared memory if the type is not trivially copyable.
426 if constexpr ((sizeof(T) <= 8)
427 || (sizeof(T) > 36*8)
428 || ! std::is_trivially_copyable<T>()) {
429 amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
430 {
431 f(p[i], i);
432 });
433 } else {
434 static_assert(sizeof(T) % sizeof(unsigned int) == 0);
435 using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
436 unsigned long long, unsigned int>;
437 constexpr Long nU = sizeof(T) / sizeof(U);
438 auto pu = reinterpret_cast<U*>(p);
439 constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
440 int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
441 std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
442#ifdef AMREX_USE_SYCL
443 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
444 [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
445 {
446 Long i = handler.globalIdx();
447 Long blockDimx = handler.blockDim();
448 Long threadIdxx = handler.threadIdx();
449 Long blockIdxx = handler.blockIdx();
450 auto const shared_U = (U*)handler.sharedMemory();
451 auto const shared_T = (T*)shared_U;
452 if (i < N) {
453 auto ga = new(shared_T+threadIdxx) T;
454 f(*ga, i);
455 }
456 handler.sharedBarrier();
457 for (Long m = threadIdxx,
458 mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
459 m < mend; m += blockDimx) {
460 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
461 }
462 });
463#else
464 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
465 [=] AMREX_GPU_DEVICE () noexcept
466 {
467 Long blockDimx = blockDim.x;
468 Long threadIdxx = threadIdx.x;
469 Long blockIdxx = blockIdx.x;
470 Long i = blockDimx*blockIdxx + threadIdxx;
471 Gpu::SharedMemory<U> gsm;
472 auto const shared_U = gsm.dataPtr();
473 auto const shared_T = (T*)shared_U;
474 if (i < N) {
475 auto ga = new(shared_T+threadIdxx) T;
476 f(*ga, i);
477 }
478 __syncthreads();
479 for (Long m = threadIdxx,
480 mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
481 m < mend; m += blockDimx) {
482 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
483 }
484 });
485#endif
486 }
487#endif
488 }
489
490}
491
492#endif
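For orientation, the using-declarations in the blocks above define the container aliases this header provides. A minimal sketch of how they are typically declared, assuming the standard AMReX alias names (DeviceVector, PinnedVector, ManagedVector) and an illustrative element type and size; this is usage code that presumes AMReX has been initialized:

#include <AMReX_GpuContainers.H>

void container_sketch ()
{
    // Allocated from the device arena when AMREX_USE_GPU is defined,
    // plain host memory (PODVector<T>) otherwise -- see the #else branch above.
    amrex::Gpu::DeviceVector<amrex::Real> d_data(1024);

    // Page-locked (pinned) host memory; a suitable source/target for async copies.
    amrex::Gpu::PinnedVector<amrex::Real> h_data(1024);

    // Managed (unified) memory, accessible from both host and device.
    amrex::Gpu::ManagedVector<amrex::Real> m_data(1024);
}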
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:679
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:166
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:329
void fillAsync(IT first, IT last, F const &f) noexcept
Fill the elements in the given range using the given callable.
Definition AMReX_GpuContainers.H:413
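A hedged usage sketch: the callable is invoked with a reference to each element and its index (cf. the f(p[i], i) calls in the listing). The DeviceVector alias and element type here are assumptions for illustration:

amrex::Gpu::DeviceVector<amrex::Real> v(1000);
amrex::Gpu::fillAsync(v.begin(), v.end(),
    [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) noexcept
    {
        x = amrex::Real(2) * amrex::Real(i);   // fill element i
    });
amrex::Gpu::streamSynchronize(); // fillAsync is asynchronous with respect to the host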
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage.
Definition AMReX_GpuContainers.H:128
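An illustrative blocking copy between a host and a device container; the HostVector/DeviceVector aliases are assumed, and both ranges must be contiguous and equally long:

amrex::Gpu::HostVector<int> h(256);
std::iota(h.begin(), h.end(), 0);            // fill on the host (<numeric>)
amrex::Gpu::DeviceVector<int> d(h.size());
// Blocking host-to-device copy; returns after the transfer completes.
amrex::Gpu::copy(amrex::Gpu::hostToDevice, h.begin(), h.end(), d.begin());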
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage.
Definition AMReX_GpuContainers.H:228
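An illustrative asynchronous copy back to the host, enqueued on the current GPU stream; synchronize before reading the host data (vector aliases assumed as above):

amrex::Gpu::DeviceVector<int> d(256);
amrex::Gpu::HostVector<int> h(d.size());
// Enqueued on Gpu::gpuStream(); does not block the host.
amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, d.begin(), d.end(), h.begin());
amrex::Gpu::streamSynchronize(); // make the data visible on the host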
void prefetchToHost(Iter begin, Iter end) noexcept
Migrate elements of a container from device to host. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:317
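An illustrative use with managed (unified) memory, assuming the ManagedVector alias; the prefetch is a performance hint that migrates pages ahead of host access:

amrex::Gpu::ManagedVector<amrex::Real> m(512);
amrex::Gpu::prefetchToHost(m.begin(), m.end());
amrex::Gpu::streamSynchronize();
for (auto& x : m) { x *= amrex::Real(2); }   // host-side access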
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:315
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:376
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:368
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:301
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:384
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
void prefetchToDevice(Iter begin, Iter end) noexcept
Migrate elements of a container from host to device. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:358
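The complementary direction, hinting migration back to the current device before kernels touch the data (ManagedVector alias assumed, sizes illustrative):

amrex::Gpu::ManagedVector<amrex::Real> m(512);
// ... fill m on the host ...
amrex::Gpu::prefetchToDevice(m.begin(), m.end());
// subsequent kernels reading m avoid on-demand page migration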
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:21
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2006
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2015
struct DeviceToDevice
Definition AMReX_GpuContainers.H:104
struct DeviceToHost
Definition AMReX_GpuContainers.H:103
Definition AMReX_GpuTypes.H:86
struct HostToDevice
Definition AMReX_GpuContainers.H:102
Definition AMReX_GpuMemory.H:125
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:126
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:213