Block-Structured AMR Software Framework
 
AMReX_GpuContainers.H
1#ifndef AMREX_GPU_CONTAINERS_H_
2#define AMREX_GPU_CONTAINERS_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Vector.H>
6#include <AMReX_PODVector.H>
8#include <type_traits>
9
10#include <numeric>
11#include <iterator>
12
13namespace amrex::Gpu {
14
15#ifdef AMREX_USE_GPU
16
22 template <class T>
24
29 template <class T>
31
36 template <class T>
38
43 template <class T>
45
52 template <class T>
54
60 template <class T>
62
68 template <class T>
70
71#else
73 template <class T>
75
76 template <class T>
78
79 template <class T>
80 using NonManagedVector = PODVector<T>;
81
82 template <class T>
84
85 template <class T>
87
88 template <class T>
90
91 template <class T>
93#endif
94
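The lines elided from the block above (source lines 16-93) hold documentation comments and the using-declarations for the Gpu vector aliases; the surviving line 80 shows that, in a host-only build, they all reduce to a plain PODVector<T>, while in a GPU build each alias typically selects a different memory arena (device, managed, pinned, and so on). A minimal sketch of how two commonly used aliases, Gpu::PinnedVector and Gpu::DeviceVector, are typically declared; the element type, names, and sizes here are illustrative, not part of the header:

#include <AMReX_REAL.H>
#include <AMReX_GpuContainers.H>

void make_buffers ()
{
    // Page-locked host memory, convenient as a staging buffer for device transfers.
    amrex::Gpu::PinnedVector<amrex::Real> host_buf(256);

    // Arena ("device") memory: on the GPU when AMREX_USE_GPU is defined,
    // otherwise ordinary host memory backed by PODVector<T>.
    amrex::Gpu::DeviceVector<amrex::Real> device_buf(256);
}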
95 struct HostToDevice {};
96 struct DeviceToHost {};
97 struct DeviceToDevice {};
98 static constexpr HostToDevice hostToDevice{};
99 static constexpr DeviceToHost deviceToHost{};
100 static constexpr DeviceToDevice deviceToDevice{};
101
120 template<class InIter, class OutIter>
121 void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
122 {
123 using value_type = typename std::iterator_traits<InIter>::value_type;
124
125 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
126 static_assert(std::is_same_v<value_type, out_value_type>);
127 static_assert(std::is_trivially_copyable<value_type>(),
128 "Can only copy trivially copyable types");
129
130 auto size = std::distance(begin, end);
131 if (size == 0) { return; }
132#ifdef AMREX_USE_GPU
133 htod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
134#else
135 std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
136#endif
137 }
138
157 template<class InIter, class OutIter>
158 void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
159 {
160 using value_type = typename std::iterator_traits<InIter>::value_type;
161
162 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
163 static_assert(std::is_same_v<value_type, out_value_type>);
164 static_assert(std::is_trivially_copyable<value_type>(),
165 "Can only copy trivially copyable types");
166
167 auto size = std::distance(begin, end);
168 if (size == 0) { return; }
169#ifdef AMREX_USE_GPU
170 dtoh_memcpy(&(*result), &(*begin), size*sizeof(value_type));
171#else
172 std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
173#endif
174 }
175
194 template<class InIter, class OutIter>
195 void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
196 {
197 using value_type = typename std::iterator_traits<InIter>::value_type;
198
199 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
200 static_assert(std::is_same_v<value_type, out_value_type>);
201 static_assert(std::is_trivially_copyable<value_type>(),
202 "Can only copy trivially copyable types");
203
204 auto size = std::distance(begin, end);
205 if (size == 0) { return; }
206#ifdef AMREX_USE_GPU
207 dtod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
208#else
209 std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
210#endif
211 }
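Each of the three copy overloads above is a thin, blocking wrapper around the corresponding memcpy routine (htod_memcpy, dtoh_memcpy, dtod_memcpy), selected by the tag argument, and requires contiguous iterators over the same trivially copyable value type. A usage sketch under those assumptions; the vectors and sizes are illustrative:

#include <AMReX_REAL.H>
#include <AMReX_GpuContainers.H>

void blocking_copies ()
{
    amrex::Gpu::PinnedVector<amrex::Real> h(100);   // host-side buffer
    amrex::Gpu::DeviceVector<amrex::Real> d(100);   // device-side buffer

    for (std::size_t i = 0; i < h.size(); ++i) { h[i] = amrex::Real(i); }

    // Host -> device; the non-async memcpy routine is used, so the data is
    // in place when the call returns.
    amrex::Gpu::copy(amrex::Gpu::hostToDevice, h.begin(), h.end(), d.begin());

    // ... kernels that modify d ...

    // Device -> host; h holds the device data when the call returns.
    amrex::Gpu::copy(amrex::Gpu::deviceToHost, d.begin(), d.end(), h.begin());
}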
212
232 template<class InIter, class OutIter>
233 void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
234 {
235 using value_type = typename std::iterator_traits<InIter>::value_type;
236
237 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
238 static_assert(std::is_same_v<value_type, out_value_type>);
239 static_assert(std::is_trivially_copyable<value_type>(),
240 "Can only copy trivially copyable types");
241
242 auto size = std::distance(begin, end);
243 if (size == 0) { return; }
244#ifdef AMREX_USE_GPU
245 htod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
246#else
247 std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
248#endif
249 }
250
270 template<class InIter, class OutIter>
271 void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
272 {
273 using value_type = typename std::iterator_traits<InIter>::value_type;
274
275 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
276 static_assert(std::is_same_v<value_type, out_value_type>);
277 static_assert(std::is_trivially_copyable<value_type>(),
278 "Can only copy trivially copyable types");
279
280 auto size = std::distance(begin, end);
281 if (size == 0) { return; }
282#ifdef AMREX_USE_GPU
283 dtoh_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
284#else
285 std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
286#endif
287 }
288
308 template<class InIter, class OutIter>
309 void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
310 {
311 using value_type = typename std::iterator_traits<InIter>::value_type;
312
313 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
314 static_assert(std::is_same_v<value_type, out_value_type>);
315 static_assert(std::is_trivially_copyable<value_type>(),
316 "Can only copy trivially copyable types");
317
318 auto size = std::distance(begin, end);
319 if (size == 0) { return; }
320#ifdef AMREX_USE_GPU
321 dtod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
322#else
323 std::memcpy(&(*result), &(*begin), size*sizeof(value_type));
324#endif
325 }
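The copyAsync overloads differ from copy only in that they call the *_memcpy_async routines, which enqueue the transfer on the current GPU stream and may return before it has finished; the buffers must therefore stay alive, and the stream must be synchronized, before the destination is read. A sketch of the usual pattern, with illustrative names:

#include <AMReX_REAL.H>
#include <AMReX_GpuContainers.H>

void async_copy_then_wait (amrex::Gpu::DeviceVector<amrex::Real> const& d)
{
    amrex::Gpu::PinnedVector<amrex::Real> h(d.size());

    // Enqueue the device-to-host transfer on Gpu::gpuStream().
    amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, d.begin(), d.end(), h.begin());

    // The transfer may still be in flight here; wait for the stream
    // before touching h on the host.
    amrex::Gpu::streamSynchronize();
}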
326
333 template<class Iter>
334 void prefetchToHost (Iter begin, Iter end) noexcept
335 {
336 using value_type = typename std::iterator_traits<Iter>::value_type;
337 static_assert(std::is_trivially_copyable<value_type>(),
338 "Can only copy trivially copyable types");
339
340 auto size = std::distance(begin, end);
341 if (size == 0) { return; }
342
343#ifdef AMREX_USE_GPU
344 // Currently only implemented for CUDA.
345#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
346 if (Gpu::Device::devicePropMajor() >= 6) {
347 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
348 size*sizeof(value_type),
349 cudaCpuDeviceId,
350 Gpu::gpuStream()));
351 }
352#endif
353#endif
354
355 Gpu::streamSynchronize();
356 }
357
364 template<class Iter>
365 void prefetchToDevice (Iter begin, Iter end) noexcept
366 {
367 using value_type = typename std::iterator_traits<Iter>::value_type;
368 static_assert(std::is_trivially_copyable<value_type>(),
369 "Can only copy trivially copyable types");
370
371 auto size = std::distance(begin, end);
372 if (size == 0) { return; }
373
374#ifdef AMREX_USE_GPU
375 // Currently only implemented for CUDA.
376#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
377 if (Gpu::Device::devicePropMajor() >= 6) {
378 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
379 size*sizeof(value_type),
380 Gpu::Device::deviceId(),
381 Gpu::gpuStream()));
382 }
383#endif
384#endif
385
386 Gpu::streamSynchronize();
387 }
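prefetchToHost and prefetchToDevice migrate the pages of managed (unified) memory between host and device; as the guards above show, the actual prefetch is currently issued only for CUDA builds on devices with compute capability 6 or higher, and the calls fall through otherwise. A hedged sketch using Gpu::ManagedVector, one of the aliases elided near the top of the header; the loop body is illustrative:

#include <AMReX_REAL.H>
#include <AMReX_GpuContainers.H>

void prefetch_example ()
{
    amrex::Gpu::ManagedVector<amrex::Real> mv(1000);

    // Touch the data on the host first ...
    for (auto& x : mv) { x = amrex::Real(1.0); }

    // ... then migrate the pages to the device before launching kernels,
    // and bring them back before reading on the host again.
    amrex::Gpu::prefetchToDevice(mv.begin(), mv.end());
    // ... device kernels operating on mv.data() ...
    amrex::Gpu::prefetchToHost(mv.begin(), mv.end());
}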
388
404 template <typename IT, typename F,
405 typename T = typename std::iterator_traits<IT>::value_type,
406 std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
407 std::is_trivially_copyable_v<T> &&
408 amrex::IsCallable<F, T&, Long>::value,
409 int> FOO = 0>
410 void fillAsync (IT first, IT last, F const& f) noexcept
411 {
412 auto N = static_cast<Long>(std::distance(first, last));
413 if (N <= 0) { return; }
414 auto p = &(*first);
415#ifndef AMREX_USE_GPU
416 for (Long i = 0; i < N; ++i) {
417 f(p[i], i);
418 }
419#else
420 // No need to use shared memory if the type is small.
421 // May not have enough shared memory if the type is too big.
422 // Cannot use shared memory if the type is not trivially copyable.
423 if constexpr ((sizeof(T) <= 8)
424 || (sizeof(T) > 36*8)
425 || ! std::is_trivially_copyable<T>()) {
426 amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
427 {
428 f(p[i], i);
429 });
430 } else {
431 static_assert(sizeof(T) % sizeof(unsigned int) == 0);
432 using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
433 unsigned long long, unsigned int>;
434 constexpr Long nU = sizeof(T) / sizeof(U);
435 auto pu = reinterpret_cast<U*>(p);
436 int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
437 int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
438 std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
439#ifdef AMREX_USE_SYCL
440 amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
441 [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
442 {
443 Long i = handler.globalIdx();
444 Long blockDimx = handler.blockDim();
445 Long threadIdxx = handler.threadIdx();
446 Long blockIdxx = handler.blockIdx();
447 auto const shared_U = (U*)handler.sharedMemory();
448 auto const shared_T = (T*)shared_U;
449 if (i < N) {
450 auto ga = new(shared_T+threadIdxx) T;
451 f(*ga, i);
452 }
453 handler.sharedBarrier();
454 for (Long m = threadIdxx,
455 mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
456 m < mend; m += blockDimx) {
457 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
458 }
459 });
460#else
461 amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
462 [=] AMREX_GPU_DEVICE () noexcept
463 {
464 Long blockDimx = blockDim.x;
465 Long threadIdxx = threadIdx.x;
466 Long blockIdxx = blockIdx.x;
467 Long i = blockDimx*blockIdxx + threadIdxx;
468 Gpu::SharedMemory<U> gsm;
469 auto const shared_U = gsm.dataPtr();
470 auto const shared_T = (T*)shared_U;
471 if (i < N) {
472 auto ga = new(shared_T+threadIdxx) T;
473 f(*ga, i);
474 }
475 __syncthreads();
476 for (Long m = threadIdxx,
477 mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
478 m < mend; m += blockDimx) {
479 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
480 }
481 });
482#endif
483 }
484#endif
485 }
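fillAsync constructs each element in the range by invoking f(element, index). On the host it is a plain loop; on the GPU, types that are very small, very large, or not trivially copyable fall back to amrex::ParallelFor, while the remaining types are staged in shared memory and written back in word-sized chunks, as the kernels above show. A usage sketch; the element struct, names, and values are illustrative, not part of the header:

#include <AMReX_REAL.H>
#include <AMReX_GpuContainers.H>

struct MyParticle { amrex::Real x, y, z; };  // trivially copyable, well under the 36*8 byte limit

void fill_particles ()
{
    amrex::Gpu::DeviceVector<MyParticle> pv(1000);

    // Construct every element on the device; i is the element index.
    amrex::Gpu::fillAsync(pv.begin(), pv.end(),
        [=] AMREX_GPU_DEVICE (MyParticle& p, amrex::Long i)
        {
            p.x = amrex::Real(0.01) * i;
            p.y = amrex::Real(0.0);
            p.z = amrex::Real(0.0);
        });

    // fillAsync launches on the current GPU stream; synchronize if the host
    // must wait for the fill to complete.
    amrex::Gpu::streamSynchronize();
}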
486
487}
488
489#endif
Symbols referenced in this listing, with their definition locations:

#define AMREX_CUDA_SAFE_CALL(call): AMReX_GpuError.H:73
#define AMREX_GPU_DEVICE: AMReX_GpuQualifiers.H:18
static int deviceId() noexcept: AMReX_GpuDevice.cpp:608
static int devicePropMajor() noexcept: AMReX_GpuDevice.H:142
PODVector: AMReX_PODVector.H:262
AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept: AMReX_GpuDevice.H:279
void fillAsync(IT first, IT last, F const &f) noexcept: fill the elements in the given range using the given callable; AMReX_GpuContainers.H:410
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept: a host-to-device copy routine; a wrapper around memcpy, so it assumes contiguous storage; AMReX_GpuContainers.H:121
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept: a host-to-device copy routine; a wrapper around memcpy, so it assumes contiguous storage; AMReX_GpuContainers.H:233
void prefetchToHost(Iter begin, Iter end) noexcept: migrate elements of a container from device to host; a no-op for host-only code; AMReX_GpuContainers.H:334
static constexpr DeviceToDevice deviceToDevice: AMReX_GpuContainers.H:100
static constexpr DeviceToHost deviceToHost: AMReX_GpuContainers.H:99
static constexpr HostToDevice hostToDevice: AMReX_GpuContainers.H:98
void streamSynchronize() noexcept: AMReX_GpuDevice.H:237
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept: AMReX_GpuDevice.H:265
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept: AMReX_GpuDevice.H:301
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept: AMReX_GpuDevice.H:293
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept: AMReX_GpuDevice.H:251
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept: AMReX_GpuDevice.H:309
gpuStream_t gpuStream() noexcept: AMReX_GpuDevice.H:218
void prefetchToDevice(Iter begin, Iter end) noexcept: migrate elements of a container from host to device; a no-op for host-only code; AMReX_GpuContainers.H:365
std::enable_if_t<std::is_integral_v<T>> ParallelFor(TypeList<CTOs...> ctos, std::array<int, sizeof...(CTOs)> const &runtime_options, T N, F &&f): AMReX_CTOParallelForImpl.H:191
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr const T &min(const T &a, const T &b) noexcept: AMReX_Algorithm.H:21
void launch(T const &n, L &&f) noexcept: AMReX_GpuLaunchFunctsC.H:120
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 end(BoxND<dim> const &box) noexcept: AMReX_Box.H:1890
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 begin(BoxND<dim> const &box) noexcept: AMReX_Box.H:1881
struct DeviceToDevice: AMReX_GpuContainers.H:97
struct DeviceToHost: AMReX_GpuContainers.H:96
Gpu::Handler: AMReX_GpuTypes.H:86
struct HostToDevice: AMReX_GpuContainers.H:95
Gpu::SharedMemory: AMReX_GpuMemory.H:125
AMREX_GPU_DEVICE T *dataPtr() noexcept: AMReX_GpuMemory.H:126
Test if a given type T is callable with arguments of type Args...: AMReX_TypeTraits.H:201