1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
21#define AMREX_GPU_MAX_STREAMS 8
27#elif defined(AMREX_USE_CUDA)
29#elif defined(AMREX_USE_SYCL)
33 std::size_t totalGlobalMem;
34 std::size_t sharedMemPerBlock;
35 int multiProcessorCount;
36 int maxThreadsPerMultiProcessor;
37 int maxThreadsPerBlock;
43 int concurrentManagedAccess;
73 static void Initialize (
bool minimal,
int a_device_id);
76#if defined(AMREX_USE_GPU)
85 static sycl::queue& streamQueue () noexcept {
return *(
gpuStream().queue); }
86 static sycl::queue& streamQueue (
int i)
noexcept {
return *(
gpu_stream_pool[i].get().queue); }
128#if defined(__CUDACC__)
130 static void startGraphRecording(
bool first_iter,
void* h_ptr,
void* d_ptr,
size_t sz);
131 static cudaGraphExec_t stopGraphRecording(
bool last_iter);
134 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
137 static void executeGraph(
const cudaGraphExec_t &graphExec,
bool synch =
true);
148 dim3& numBlocks, dim3& numThreads)
noexcept;
149 static void c_threads_and_blocks (
const int* lo,
const int* hi, dim3& numBlocks, dim3& numThreads)
noexcept;
169#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
170 return std::string(
"AMD");
171#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
173 return std::string(
"NVIDIA");
174#elif defined(AMREX_USE_SYCL)
177 return std::string(
"Unknown");
189#if defined(AMREX_USE_HIP)
191#elif defined(AMREX_USE_SYCL)
200 static Long maxMemAllocSize () noexcept {
return device_prop.maxMemAllocSize; }
201 static sycl::context& syclContext () {
return *sycl_context; }
202 static sycl::device& syclDevice () {
return *sycl_device; }
229 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
230 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
239#if defined(AMREX_USE_GPU)
291 if (sz == 0) {
return; }
293 auto& q = Device::streamQueue();
294 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
297 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice,
gpuStream()));,
305 if (sz == 0) {
return; }
307 auto& q = Device::streamQueue();
308 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
311 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost,
gpuStream()));,
319 if (sz == 0) {
return; }
321 auto& q = Device::streamQueue();
322 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
325 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice,
gpuStream()));,
335 if (sz == 0) {
return; }
336 std::memcpy(p_d, p_h, sz);
342 if (sz == 0) {
return; }
343 std::memcpy(p_h, p_d, sz);
347dtod_memcpy_async (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
349 if (sz == 0) {
return; }
350 std::memcpy(p_d_dst, p_d_src, sz);
356htod_memcpy (
void* p_d,
const void* p_h,
const std::size_t sz)
noexcept
358 if (sz == 0) {
return; }
364dtoh_memcpy (
void* p_h,
const void* p_d,
const std::size_t sz)
noexcept
366 if (sz == 0) {
return; }
372dtod_memcpy (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
374 if (sz == 0) {
return; }
379#ifdef AMREX_USE_HYPRE
380void hypreSynchronize ();
390#if defined(AMREX_USE_CUDA)
392 cudaMemcpyHostToDevice,
394#elif defined(AMREX_USE_HIP)
395 AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes,
offset,
396 hipMemcpyHostToDevice,
398#elif defined(AMREX_USE_SYCL)
399 Device::streamQueue().memcpy(dg, src, nbytes,
offset);
401 auto* p = (
char*)(&dg);
402 std::memcpy(p+
offset, src, nbytes);
413#if defined(AMREX_USE_CUDA)
415 cudaMemcpyDeviceToHost,
417#elif defined(AMREX_USE_HIP)
418 AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes,
offset,
419 hipMemcpyDeviceToHost,
421#elif defined(AMREX_USE_SYCL)
422 Device::streamQueue().memcpy(dst, dg, nbytes,
offset);
424 auto const* p = (
char const*)(&dg);
425 std::memcpy(dst, p+
offset, nbytes);
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
A virtual base class for objects that manage their own dynamic memory allocation.
Definition AMReX_Arena.H:105
Definition AMReX_GpuDevice.H:69
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:724
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:690
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:683
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:90
static void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.cpp:768
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:926
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:983
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:187
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:703
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:716
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:197
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:156
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:158
static int num_devices_used
Definition AMReX_GpuDevice.H:211
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:164
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:154
static void Finalize()
Definition AMReX_GpuDevice.cpp:443
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:733
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:894
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:159
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:82
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:960
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:750
static int num_device_partners
Definition AMReX_GpuDevice.H:212
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:77
static int verbose
Definition AMReX_GpuDevice.H:213
static dim3 numBlocksOverride
Definition AMReX_GpuDevice.H:218
static dim3 numThreadsMin
Definition AMReX_GpuDevice.H:217
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:167
static Vector< int > gpu_stream_index
Definition AMReX_GpuDevice.H:223
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1155
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:968
static constexpr int warp_size
Definition AMReX_GpuDevice.H:194
friend StreamManager
Definition AMReX_GpuDevice.H:233
static dim3 numThreadsOverride
Definition AMReX_GpuDevice.H:218
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:153
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1122
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:758
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1144
static Vector< StreamManager > gpu_stream_pool
Definition AMReX_GpuDevice.H:220
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:157
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:672
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:155
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:678
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:95
static void Initialize(bool minimal, int a_device_id)
Definition AMReX_GpuDevice.cpp:206
static int memory_pools_supported
Definition AMReX_GpuDevice.H:225
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:975
static unsigned int max_blocks_per_launch
Definition AMReX_GpuDevice.H:226
static int device_id
Definition AMReX_GpuDevice.H:210
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:152
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1041
static void initialize_gpu(bool minimal)
Definition AMReX_GpuDevice.cpp:474
static int max_gpu_streams
Definition AMReX_GpuDevice.H:214
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:163
static gpuDeviceProp_t device_prop
Definition AMReX_GpuDevice.H:224
Definition AMReX_GpuDevice.H:57
Vector< std::pair< Arena *, void * > > m_free_wait_list
Definition AMReX_GpuDevice.H:60
gpuStream_t & get()
Definition AMReX_GpuDevice.cpp:146
std::mutex m_mutex
Definition AMReX_GpuDevice.H:59
void sync()
Definition AMReX_GpuDevice.cpp:151
void free_async(Arena *arena, void *mem)
Definition AMReX_GpuDevice.cpp:184
gpuStream_t m_stream
Definition AMReX_GpuDevice.H:58
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:317
void synchronize() noexcept
Definition AMReX_GpuDevice.H:254
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:281
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:260
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:303
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:266
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:151
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:409
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:248
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:364
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:356
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:289
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:372
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:386
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:241
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:28
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83