Block-Structured AMR Software Framework
 
AMReX_GpuContainers.H
1#ifndef AMREX_GPU_CONTAINERS_H_
2#define AMREX_GPU_CONTAINERS_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Vector.H>
6#include <AMReX_PODVector.H>
8#include <type_traits>
9
10#include <numeric>
11#include <iterator>
12
13namespace amrex::Gpu {
14
15#ifdef AMREX_USE_GPU
16
23 template <class T>
25
31 template <class T>
33
39 template <class T>
41
47 template <class T>
49
57 template <class T>
59
66 template <class T>
68
75 template <class T>
77
78#else
80 template <class T>
82
83 template <class T>
85
86 template <class T>
87 using NonManagedVector = PODVector<T>;
88
89 template <class T>
91
92 template <class T>
94
95 template <class T>
97
98 template <class T>
100#endif
101
102 struct HostToDevice {};
103 struct DeviceToHost {};
104 struct DeviceToDevice {};
105 static constexpr HostToDevice hostToDevice{};
106 static constexpr DeviceToHost deviceToHost{};
107 static constexpr DeviceToDevice deviceToDevice{};
108
127 template<class InIter, class OutIter>
128 void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
129 {
130 using value_type = typename std::iterator_traits<InIter>::value_type;
131
132 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
133 static_assert(std::is_same_v<value_type, out_value_type>);
134 static_assert(std::is_trivially_copyable<value_type>(),
135 "Can only copy trivially copyable types");
136
137 auto size = std::distance(begin, end);
138 if (size == 0) { return; }
139 htod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
140 }
141
160 template<class InIter, class OutIter>
161 void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
162 {
163 using value_type = typename std::iterator_traits<InIter>::value_type;
164
165 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
166 static_assert(std::is_same_v<value_type, out_value_type>);
167 static_assert(std::is_trivially_copyable<value_type>(),
168 "Can only copy trivially copyable types");
169
170 auto size = std::distance(begin, end);
171 if (size == 0) { return; }
172 dtoh_memcpy(&(*result), &(*begin), size*sizeof(value_type));
173 }
174
193 template<class InIter, class OutIter>
194 void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
195 {
196 using value_type = typename std::iterator_traits<InIter>::value_type;
197
198 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
199 static_assert(std::is_same_v<value_type, out_value_type>);
200 static_assert(std::is_trivially_copyable<value_type>(),
201 "Can only copy trivially copyable types");
202
203 auto size = std::distance(begin, end);
204 if (size == 0) { return; }
205 dtod_memcpy(&(*result), &(*begin), size*sizeof(value_type));
206 }
207
227 template<class InIter, class OutIter>
228 void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
229 {
230 using value_type = typename std::iterator_traits<InIter>::value_type;
231
232 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
233 static_assert(std::is_same_v<value_type, out_value_type>);
234 static_assert(std::is_trivially_copyable<value_type>(),
235 "Can only copy trivially copyable types");
236
237 auto size = std::distance(begin, end);
238 if (size == 0) { return; }
239 htod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
240 }
241
261 template<class InIter, class OutIter>
262 void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
263 {
264 using value_type = typename std::iterator_traits<InIter>::value_type;
265
266 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
267 static_assert(std::is_same_v<value_type, out_value_type>);
268 static_assert(std::is_trivially_copyable<value_type>(),
269 "Can only copy trivially copyable types");
270
271 auto size = std::distance(begin, end);
272 if (size == 0) { return; }
273 dtoh_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
274 }
275
295 template<class InIter, class OutIter>
296 void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
297 {
298 using value_type = typename std::iterator_traits<InIter>::value_type;
299
300 using out_value_type = typename std::iterator_traits<OutIter>::value_type;
301 static_assert(std::is_same_v<value_type, out_value_type>);
302 static_assert(std::is_trivially_copyable<value_type>(),
303 "Can only copy trivially copyable types");
304
305 auto size = std::distance(begin, end);
306 if (size == 0) { return; }
307 dtod_memcpy_async(&(*result), &(*begin), size*sizeof(value_type));
308 }
309
316 template<class Iter>
317 void prefetchToHost (Iter begin, Iter end) noexcept
318 {
319 using value_type = typename std::iterator_traits<Iter>::value_type;
320 static_assert(std::is_trivially_copyable<value_type>(),
321 "Can only copy trivially copyable types");
322
323 auto size = std::distance(begin, end);
324 if (size == 0) { return; }
325
326#ifdef AMREX_USE_GPU
327 // Currently only implemented for CUDA.
328#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
329 if (Gpu::Device::devicePropMajor() >= 6) {
330#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
331 cudaMemLocation location = {};
332 location.type = cudaMemLocationTypeDevice;
333 location.id = cudaCpuDeviceId;
334 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
335 size*sizeof(value_type),
336 location, 0,
337 Gpu::gpuStream()));
338#else
339 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
340 size*sizeof(value_type),
341 cudaCpuDeviceId,
342 Gpu::gpuStream()));
343#endif
344 }
345#endif
346#endif
347
349 }
350
357 template<class Iter>
358 void prefetchToDevice (Iter begin, Iter end) noexcept
359 {
360 using value_type = typename std::iterator_traits<Iter>::value_type;
361 static_assert(std::is_trivially_copyable<value_type>(),
362 "Can only copy trivially copyable types");
363
364 auto size = std::distance(begin, end);
365 if (size == 0) { return; }
366
367#ifdef AMREX_USE_GPU
368 // Currently only implemented for CUDA.
369#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
370 if (Gpu::Device::devicePropMajor() >= 6) {
371#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
372 cudaMemLocation location = {};
373 location.type = cudaMemLocationTypeDevice;
374 location.id = Gpu::Device::deviceId();
375 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
376 size*sizeof(value_type),
377 location, 0,
378 Gpu::gpuStream()));
379#else
380 AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
381 size*sizeof(value_type),
382 Gpu::Device::deviceId(),
383 Gpu::gpuStream()));
384#endif
385 }
386#endif
387#endif
388
390 }
391
407 template <typename IT, typename F,
408 typename T = typename std::iterator_traits<IT>::value_type,
409 std::enable_if_t<(sizeof(T) <= 36*8) && // so there is enough shared memory
410 std::is_trivially_copyable_v<T> &&
411 amrex::IsCallable<F, T&, Long>::value,
412 int> FOO = 0>
413 void fillAsync (IT first, IT last, F const& f) noexcept
414 {
415 auto N = static_cast<Long>(std::distance(first, last));
416 if (N <= 0) { return; }
417 auto p = &(*first);
418#ifndef AMREX_USE_GPU
419 for (Long i = 0; i < N; ++i) {
420 f(p[i], i);
421 }
422#else
423 // No need to use shared memory if the type is small.
424 // May not have enough shared memory if the type is too big.
425 // Cannot use shared memory if the type is not trivially copyable.
426 if constexpr ((sizeof(T) <= 8)
427 || (sizeof(T) > 36*8)
428 || ! std::is_trivially_copyable<T>()) {
429 amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (Long i) noexcept
430 {
431 f(p[i], i);
432 });
433 } else {
434 static_assert(sizeof(T) % sizeof(unsigned int) == 0);
435 using U = std::conditional_t<sizeof(T) % sizeof(unsigned long long) == 0,
436 unsigned long long, unsigned int>;
437 constexpr Long nU = sizeof(T) / sizeof(U);
438 auto pu = reinterpret_cast<U*>(p);
439 constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
440 int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
441 std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
442#ifdef AMREX_USE_SYCL
443 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
444 [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
445 {
446 Long i = handler.globalIdx();
447 Long blockDimx = handler.blockDim();
448 Long threadIdxx = handler.threadIdx();
449 Long blockIdxx = handler.blockIdx();
450 auto const shared_U = (U*)handler.sharedMemory();
451 auto const shared_T = (T*)shared_U;
452 if (i < N) {
453 auto ga = new(shared_T+threadIdxx) T;
454 f(*ga, i);
455 }
456 handler.sharedBarrier();
457 for (Long m = threadIdxx,
458 mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
459 m < mend; m += blockDimx) {
460 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
461 }
462 });
463#else
464 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
465 [=] AMREX_GPU_DEVICE () noexcept
466 {
467 Long blockDimx = blockDim.x;
468 Long threadIdxx = threadIdx.x;
469 Long blockIdxx = blockIdx.x;
470 Long i = blockDimx*blockIdxx + threadIdxx;
471 Gpu::SharedMemory<U> gsm;
472 auto const shared_U = gsm.dataPtr();
473 auto const shared_T = (T*)shared_U;
474 if (i < N) {
475 auto ga = new(shared_T+threadIdxx) T;
476 f(*ga, i);
477 }
478 __syncthreads();
479 for (Long m = threadIdxx,
480 mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
481 m < mend; m += blockDimx) {
482 pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
483 }
484 });
485#endif
486 }
487#endif
488 }
489
490}
491
492#endif
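For orientation, the using-declarations in the blocks above define the container aliases this header provides. A minimal sketch of how they are typically declared, assuming the standard AMReX alias names (DeviceVector, PinnedVector, ManagedVector) and an illustrative element type and size; this is usage code that presumes AMReX has been initialized:

#include <AMReX_GpuContainers.H>

void container_sketch ()
{
    // Allocated from the device arena when AMREX_USE_GPU is defined,
    // plain host memory (PODVector<T>) otherwise -- see the #else branch above.
    amrex::Gpu::DeviceVector<amrex::Real> d_data(1024);

    // Page-locked (pinned) host memory; a suitable source/target for async copies.
    amrex::Gpu::PinnedVector<amrex::Real> h_data(1024);

    // Managed (unified) memory, accessible from both host and device.
    amrex::Gpu::ManagedVector<amrex::Real> m_data(1024);
}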
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:679
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:166
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:329
void fillAsync(IT first, IT last, F const &f) noexcept
Fill the elements in the given range using the given callable.
Definition AMReX_GpuContainers.H:413
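A hedged usage sketch: the callable is invoked with a reference to each element and its index (cf. the f(p[i], i) calls in the listing). The DeviceVector alias and element type here are assumptions for illustration:

amrex::Gpu::DeviceVector<amrex::Real> v(1000);
amrex::Gpu::fillAsync(v.begin(), v.end(),
    [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) noexcept
    {
        x = amrex::Real(2) * amrex::Real(i);   // fill element i
    });
amrex::Gpu::streamSynchronize(); // fillAsync is asynchronous with respect to the host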
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage.
Definition AMReX_GpuContainers.H:128
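An illustrative blocking copy between a host and a device container; the HostVector/DeviceVector aliases are assumed, and both ranges must be contiguous and equally long:

amrex::Gpu::HostVector<int> h(256);
std::iota(h.begin(), h.end(), 0);            // fill on the host (<numeric>)
amrex::Gpu::DeviceVector<int> d(h.size());
// Blocking host-to-device copy; returns after the transfer completes.
amrex::Gpu::copy(amrex::Gpu::hostToDevice, h.begin(), h.end(), d.begin());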
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage.
Definition AMReX_GpuContainers.H:228
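An illustrative asynchronous copy back to the host, enqueued on the current GPU stream; synchronize before reading the host data (vector aliases assumed as above):

amrex::Gpu::DeviceVector<int> d(256);
amrex::Gpu::HostVector<int> h(d.size());
// Enqueued on Gpu::gpuStream(); does not block the host.
amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, d.begin(), d.end(), h.begin());
amrex::Gpu::streamSynchronize(); // make the data visible on the host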
void prefetchToHost(Iter begin, Iter end) noexcept
Migrate elements of a container from device to host. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:317
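An illustrative use with managed (unified) memory, assuming the ManagedVector alias; the prefetch is a performance hint that migrates pages ahead of host access:

amrex::Gpu::ManagedVector<amrex::Real> m(512);
amrex::Gpu::prefetchToHost(m.begin(), m.end());
amrex::Gpu::streamSynchronize();
for (auto& x : m) { x *= amrex::Real(2); }   // host-side access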
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:315
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:376
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:368
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:301
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:384
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
void prefetchToDevice(Iter begin, Iter end) noexcept
Migrate elements of a container from host to device. This is a no-op for host-only code.
Definition AMReX_GpuContainers.H:358
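The complementary direction, hinting migration back to the current device before kernels touch the data (ManagedVector alias assumed, sizes illustrative):

amrex::Gpu::ManagedVector<amrex::Real> m(512);
// ... fill m on the host ...
amrex::Gpu::prefetchToDevice(m.begin(), m.end());
// subsequent kernels reading m avoid on-demand page migration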
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:21
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2006
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2015
struct DeviceToDevice
Definition AMReX_GpuContainers.H:104
struct DeviceToHost
Definition AMReX_GpuContainers.H:103
Definition AMReX_GpuTypes.H:86
struct HostToDevice
Definition AMReX_GpuContainers.H:102
Definition AMReX_GpuMemory.H:125
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:126
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:213