Block-Structured AMR Software Framework
 
AMReX_GpuContainers.H
#ifndef AMREX_GPU_CONTAINERS_H_
#define AMREX_GPU_CONTAINERS_H_
#include <AMReX_Config.H>

#include <AMReX_Vector.H>
#include <AMReX_PODVector.H>
#include <AMReX_GpuAllocators.H>
#include <type_traits>

#include <numeric>
#include <iterator>

namespace amrex::Gpu {

#ifdef AMREX_USE_GPU

    //! A PODVector that uses the standard memory Arena. It may or may not
    //! be managed, depending on the amrex.the_arena_is_managed parameter.
    template <class T>
    using DeviceVector = PODVector<T, ArenaAllocator<T> >;

    //! A PODVector that uses the non-managed device memory arena.
    template <class T>
    using NonManagedDeviceVector = PODVector<T, DeviceArenaAllocator<T> >;

    //! A PODVector that uses the managed memory arena.
    template <class T>
    using ManagedVector = PODVector<T, ManagedArenaAllocator<T> >;

    //! A PODVector that uses the pinned memory arena.
    template <class T>
    using PinnedVector = PODVector<T, PinnedArenaAllocator<T> >;

    //! A PODVector that uses the async memory arena.
    template <class T>
    using AsyncVector = PODVector<T, AsyncArenaAllocator<T> >;

    //! A PODVector in pinned host memory. Same as PinnedVector.
    template <class T>
    using HostVector = PinnedVector<T>;

    //! Identical to ManagedVector<T>.
    template <class T>
    using ManagedDeviceVector = PODVector<T, ManagedArenaAllocator<T> >;

#else
    //! When compiled without GPU support, all of these aliases are plain
    //! PODVector<T>.
    template <class T>
    using DeviceVector = PODVector<T>;

    template <class T>
    using HostVector = PODVector<T>;

    template <class T>
    using NonManagedVector = PODVector<T>;

    template <class T>
    using ManagedVector = PODVector<T>;

    template <class T>
    using ManagedDeviceVector = PODVector<T>;

    template <class T>
    using PinnedVector = PODVector<T>;

    template <class T>
    using AsyncVector = PODVector<T>;
#endif
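
    // A minimal usage sketch of the aliases above (n is a placeholder
    // size): a pinned host-side staging buffer paired with a device-side
    // buffer of the same length.
    //
    //     amrex::Gpu::HostVector<amrex::Real> hv(n);   // pinned host memory
    //     amrex::Gpu::DeviceVector<amrex::Real> dv(n); // device arena memory
    //     // fill hv on the host, then move the data to dv with Gpu::copy
    //     // or Gpu::copyAsync (defined below).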

    //! Tag types for dispatching the copy routines below.
    struct HostToDevice {};
    struct DeviceToHost {};
    struct DeviceToDevice {};
    static constexpr HostToDevice hostToDevice{};
    static constexpr DeviceToHost deviceToHost{};
    static constexpr DeviceToDevice deviceToDevice{};

    //! A host-to-device copy routine. This is a wrapper around memcpy, so
    //! it assumes contiguous storage.
    template<class InIter, class OutIter>
    void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
        htod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
    }

    //! A device-to-host copy routine. This is a wrapper around memcpy, so
    //! it assumes contiguous storage.
    template<class InIter, class OutIter>
    void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
        dtoh_memcpy(&(*result), &(*begin), size*sizeof(value_type));
    }

    //! A device-to-device copy routine. This is a wrapper around memcpy,
    //! so it assumes contiguous storage.
    template<class InIter, class OutIter>
    void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
        dtod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
    }
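
    // A minimal usage sketch of the synchronous copy routines (hv and dv
    // are the placeholder vectors from the sketch above): the tag argument
    // selects the direction, and both ranges must be contiguous.
    //
    //     amrex::Gpu::copy(amrex::Gpu::hostToDevice,
    //                      hv.begin(), hv.end(), dv.begin());
    //     // ... kernels read/write dv ...
    //     amrex::Gpu::copy(amrex::Gpu::deviceToHost,
    //                      dv.begin(), dv.end(), hv.begin());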

    //! An asynchronous host-to-device copy routine. This is a wrapper
    //! around memcpy on the current GPU stream, so it assumes contiguous
    //! storage; synchronize the stream before using the result.
    template<class InIter, class OutIter>
    void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
        htod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
    }

    //! An asynchronous device-to-host copy routine. This is a wrapper
    //! around memcpy on the current GPU stream, so it assumes contiguous
    //! storage; synchronize the stream before using the result.
    template<class InIter, class OutIter>
    void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
        dtoh_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
    }

    //! An asynchronous device-to-device copy routine. This is a wrapper
    //! around memcpy on the current GPU stream, so it assumes contiguous
    //! storage.
    template<class InIter, class OutIter>
    void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    {
        using value_type = typename std::iterator_traits<InIter>::value_type;
        using out_value_type = typename std::iterator_traits<OutIter>::value_type;
        static_assert(std::is_same_v<value_type, out_value_type>);
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }
        dtod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
    }
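
    // A minimal usage sketch of the asynchronous variants (hv and dv are
    // the placeholder vectors from above): the calls may return before the
    // copy completes, so synchronize the stream before touching the
    // destination on the host.
    //
    //     amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost,
    //                           dv.begin(), dv.end(), hv.begin());
    //     amrex::Gpu::streamSynchronize(); // hv is now safe to read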

    //! Migrate elements of a container from device to host. This is a
    //! no-op for host-only code.
    template<class Iter>
    void prefetchToHost (Iter begin, Iter end) noexcept
    {
        using value_type = typename std::iterator_traits<Iter>::value_type;
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }

#ifdef AMREX_USE_GPU
        // Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
        if (Gpu::Device::devicePropMajor() >= 6) {
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
            // CUDA 13 replaced the device-id argument of
            // cudaMemPrefetchAsync with a cudaMemLocation plus flags.
            cudaMemLocation location = {};
            location.type = cudaMemLocationTypeDevice;
            location.id = cudaCpuDeviceId;
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
                                                      size*sizeof(value_type),
                                                      location, 0,
                                                      Gpu::gpuStream()));
#else
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
                                                      size*sizeof(value_type),
                                                      cudaCpuDeviceId,
                                                      Gpu::gpuStream()));
#endif
        }
#endif
#endif

        Gpu::streamSynchronize();
    }

    //! Migrate elements of a container from host to device. This is a
    //! no-op for host-only code.
    template<class Iter>
    void prefetchToDevice (Iter begin, Iter end) noexcept
    {
        using value_type = typename std::iterator_traits<Iter>::value_type;
        static_assert(std::is_trivially_copyable<value_type>(),
                      "Can only copy trivially copyable types");

        auto size = std::distance(begin, end);
        if (size == 0) { return; }

#ifdef AMREX_USE_GPU
        // Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
        if (Gpu::Device::devicePropMajor() >= 6) {
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
            cudaMemLocation location = {};
            location.type = cudaMemLocationTypeDevice;
            location.id = Gpu::Device::deviceId();
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
                                                      size*sizeof(value_type),
                                                      location, 0,
                                                      Gpu::gpuStream()));
#else
            AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
                                                      size*sizeof(value_type),
                                                      Gpu::Device::deviceId(),
                                                      Gpu::gpuStream()));
#endif
        }
#endif
#endif

        Gpu::streamSynchronize();
    }
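
    // A minimal usage sketch of the prefetch routines, assuming a managed
    // vector mv (a placeholder): prefetching migrates the pages ahead of
    // access instead of relying on on-demand page faulting.
    //
    //     amrex::Gpu::ManagedVector<amrex::Real> mv(n);
    //     amrex::Gpu::prefetchToDevice(mv.begin(), mv.end());
    //     // ... device kernels touch mv ...
    //     amrex::Gpu::prefetchToHost(mv.begin(), mv.end());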

    //! Fill the elements in the given range using the given callable,
    //! which is invoked as f(element, index).
    template <typename IT, typename F,
              typename T = typename std::iterator_traits<IT>::value_type,
              std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
                               std::is_trivially_copyable_v<T> &&
                               amrex::IsCallable<F, T&, Long>::value,
                               int> FOO = 0>
    void fillAsync (IT first, IT last, F const& f) noexcept
    {
        auto N = static_cast<Long>(std::distance(first, last));
        if (N <= 0) { return; }
        auto p = &(*first);
#ifndef AMREX_USE_GPU
        for (Long i = 0; i < N; ++i) {
            f(p[i], i);
        }
#else
        // No need to use shared memory if the type is small.
        // May not have enough shared memory if the type is too big.
        // Cannot use shared memory if the type is not trivially copyable.
        if constexpr ((sizeof(T) <= 8)
                      || (sizeof(T) > 36*8)
                      || ! std::is_trivially_copyable<T>()) {
            amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
            {
                f(p[i], i);
            });
        } else {
            static_assert(sizeof(T) % sizeof(unsigned int) == 0);
            using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
                                         unsigned long long, unsigned int>;
            constexpr Long nU = sizeof(T) / sizeof(U);
            auto pu = reinterpret_cast<U*>(p);
            constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
            int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
#ifdef AMREX_USE_SYCL
            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
            {
                Long i = handler.globalIdx();
                Long blockDimx = handler.blockDim();
                Long threadIdxx = handler.threadIdx();
                Long blockIdxx = handler.blockIdx();
                auto const shared_U = (U*)handler.sharedMemory();
                auto const shared_T = (T*)shared_U;
                if (i < N) {
                    // Construct the element in shared memory, then stream it
                    // out to global memory in word-sized chunks below.
                    auto ga = new(shared_T+threadIdxx) T;
                    f(*ga, i);
                }
                handler.sharedBarrier();
                for (Long m = threadIdxx,
                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                     m < mend; m += blockDimx) {
                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                }
            });
#else
            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
            [=] AMREX_GPU_DEVICE () noexcept
            {
                Long blockDimx = blockDim.x;
                Long threadIdxx = threadIdx.x;
                Long blockIdxx = blockIdx.x;
                Long i = blockDimx*blockIdxx + threadIdxx;
                Gpu::SharedMemory<U> gsm;
                auto const shared_U = gsm.dataPtr();
                auto const shared_T = (T*)shared_U;
                if (i < N) {
                    auto ga = new(shared_T+threadIdxx) T;
                    f(*ga, i);
                }
                __syncthreads();
                for (Long m = threadIdxx,
                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                     m < mend; m += blockDimx) {
                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                }
            });
#endif
        }
#endif
    }
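
    // A minimal usage sketch of fillAsync (dv is a placeholder
    // DeviceVector<amrex::Real>): the callable receives a reference to the
    // element and its index. The fill may be asynchronous with respect to
    // the host, so synchronize before using the data elsewhere.
    //
    //     amrex::Gpu::fillAsync(dv.begin(), dv.end(),
    //         [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) {
    //             x = amrex::Real(i);
    //         });
    //     amrex::Gpu::streamSynchronize();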

}

#endif