1#ifndef AMREX_GPU_CONTAINERS_H_
2#define AMREX_GPU_CONTAINERS_H_
3#include <AMReX_Config.H>
127 template<
class InIter,
class OutIter>
130 using value_type =
typename std::iterator_traits<InIter>::value_type;
132 using out_value_type =
typename std::iterator_traits<OutIter>::value_type;
133 static_assert(std::is_same_v<value_type, out_value_type>);
134 static_assert(std::is_trivially_copyable<value_type>(),
135 "Can only copy trivially copyable types");
137 auto size = std::distance(
begin,
end);
138 if (size == 0) {
return; }
160 template<
class InIter,
class OutIter>
163 using value_type =
typename std::iterator_traits<InIter>::value_type;
165 using out_value_type =
typename std::iterator_traits<OutIter>::value_type;
166 static_assert(std::is_same_v<value_type, out_value_type>);
167 static_assert(std::is_trivially_copyable<value_type>(),
168 "Can only copy trivially copyable types");
170 auto size = std::distance(
begin,
end);
171 if (size == 0) {
return; }
193 template<
class InIter,
class OutIter>
196 using value_type =
typename std::iterator_traits<InIter>::value_type;
198 using out_value_type =
typename std::iterator_traits<OutIter>::value_type;
199 static_assert(std::is_same_v<value_type, out_value_type>);
200 static_assert(std::is_trivially_copyable<value_type>(),
201 "Can only copy trivially copyable types");
203 auto size = std::distance(
begin,
end);
204 if (size == 0) {
return; }
227 template<
class InIter,
class OutIter>
230 using value_type =
typename std::iterator_traits<InIter>::value_type;
232 using out_value_type =
typename std::iterator_traits<OutIter>::value_type;
233 static_assert(std::is_same_v<value_type, out_value_type>);
234 static_assert(std::is_trivially_copyable<value_type>(),
235 "Can only copy trivially copyable types");
237 auto size = std::distance(
begin,
end);
238 if (size == 0) {
return; }
261 template<
class InIter,
class OutIter>
264 using value_type =
typename std::iterator_traits<InIter>::value_type;
266 using out_value_type =
typename std::iterator_traits<OutIter>::value_type;
267 static_assert(std::is_same_v<value_type, out_value_type>);
268 static_assert(std::is_trivially_copyable<value_type>(),
269 "Can only copy trivially copyable types");
271 auto size = std::distance(
begin,
end);
272 if (size == 0) {
return; }
295 template<
class InIter,
class OutIter>
298 using value_type =
typename std::iterator_traits<InIter>::value_type;
300 using out_value_type =
typename std::iterator_traits<OutIter>::value_type;
301 static_assert(std::is_same_v<value_type, out_value_type>);
302 static_assert(std::is_trivially_copyable<value_type>(),
303 "Can only copy trivially copyable types");
305 auto size = std::distance(
begin,
end);
306 if (size == 0) {
return; }
324 using value_type =
typename std::iterator_traits<Iter>::value_type;
325 static_assert(std::is_trivially_copyable<value_type>(),
326 "Can only copy trivially copyable types");
328 auto size = std::distance(
begin,
end);
329 if (size == 0) {
return; }
333#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
335#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
336 cudaMemLocation location = {};
337 location.type = cudaMemLocationTypeDevice;
338 location.id = cudaCpuDeviceId;
340 size*
sizeof(value_type),
345 size*
sizeof(value_type),
370 using value_type =
typename std::iterator_traits<Iter>::value_type;
371 static_assert(std::is_trivially_copyable<value_type>(),
372 "Can only copy trivially copyable types");
374 auto size = std::distance(
begin,
end);
375 if (size == 0) {
return; }
379#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
381#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
382 cudaMemLocation location = {};
383 location.type = cudaMemLocationTypeDevice;
386 size*
sizeof(value_type),
391 size*
sizeof(value_type),
417 template <
typename IT,
typename F,
418 typename T =
typename std::iterator_traits<IT>::value_type,
419 std::enable_if_t<(
sizeof(T) <= 36*8) &&
420 std::is_trivially_copyable_v<T> &&
425 auto N =
static_cast<Long>(std::distance(first, last));
426 if (N <= 0) {
return; }
429 for (
Long i = 0; i < N; ++i) {
436 if constexpr ((
sizeof(T) <= 8)
437 || (
sizeof(T) > 36*8)
438 || ! std::is_trivially_copyable<T>()) {
444 static_assert(
sizeof(T) %
sizeof(
unsigned int) == 0);
445 using U = std::conditional_t<
sizeof(T) %
sizeof(
unsigned long long) == 0,
446 unsigned long long,
unsigned int>;
447 constexpr Long nU =
sizeof(T) /
sizeof(U);
448 auto pu =
reinterpret_cast<U*
>(p);
449 constexpr int nthreads_per_block = (
sizeof(T) <= 64) ? 256 : 128;
450 int nblocks =
static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
451 std::size_t shared_mem_bytes = nthreads_per_block *
sizeof(T);
453 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes,
Gpu::gpuStream(),
456 Long i = handler.globalIdx();
457 Long blockDimx = handler.blockDim();
458 Long threadIdxx = handler.threadIdx();
459 Long blockIdxx = handler.blockIdx();
460 auto const shared_U = (U*)handler.sharedMemory();
461 auto const shared_T = (T*)shared_U;
463 auto ga =
new(shared_T+threadIdxx) T;
466 handler.sharedBarrier();
467 for (
Long m = threadIdxx,
468 mend = nU *
amrex::min(blockDimx, N-blockDimx*blockIdxx);
469 m < mend; m += blockDimx) {
470 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
474 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes,
Gpu::gpuStream(),
477 Long blockDimx = blockDim.x;
478 Long threadIdxx = threadIdx.x;
479 Long blockIdxx = blockIdx.x;
480 Long i = blockDimx*blockIdxx + threadIdxx;
482 auto const shared_U = gsm.
dataPtr();
483 auto const shared_T = (T*)shared_U;
485 auto ga =
new(shared_T+threadIdxx) T;
489 for (
Long m = threadIdxx,
490 mend = nU *
amrex::min(blockDimx, N-blockDimx*blockIdxx);
491 m < mend; m += blockDimx) {
492 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:691
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:203
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:55
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:449
void fillAsync(IT first, IT last, F const &f) noexcept
Fill the elements in the given range using the given callable.
Definition AMReX_GpuContainers.H:423
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:128
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
void prefetchToHost(Iter begin, Iter end) noexcept
Migrate elements of a container from device to host. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:322
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:435
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:496
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:488
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:421
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:504
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
void prefetchToDevice(Iter begin, Iter end) noexcept
Migrate elements of a container from host to device. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:368
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2006
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2015
Definition AMReX_GpuContainers.H:104
Definition AMReX_GpuContainers.H:103
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuContainers.H:102
Definition AMReX_GpuMemory.H:125
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:126
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:213