1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
21#define AMREX_GPU_MAX_STREAMS 8
27#elif defined(AMREX_USE_CUDA)
29#elif defined(AMREX_USE_SYCL)
33 std::size_t totalGlobalMem;
34 std::size_t sharedMemPerBlock;
35 int multiProcessorCount;
36 int maxThreadsPerMultiProcessor;
37 int maxThreadsPerBlock;
43 int concurrentManagedAccess;
74 static void Initialize (
bool minimal,
int a_device_id);
77#if defined(AMREX_USE_GPU)
86 static sycl::queue& streamQueue () noexcept {
return *(
gpuStream().queue); }
87 static sycl::queue& streamQueue (
int i)
noexcept {
return *(gpu_stream_pool[i].get().queue); }
131#if defined(__CUDACC__)
133 static void startGraphRecording(
bool first_iter,
void* h_ptr,
void* d_ptr,
size_t sz);
134 static cudaGraphExec_t stopGraphRecording(
bool last_iter);
137 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
140 static void executeGraph(
const cudaGraphExec_t &graphExec,
bool synch =
true);
151 dim3& numBlocks, dim3& numThreads)
noexcept;
152 static void c_threads_and_blocks (
const int* lo,
const int* hi, dim3& numBlocks, dim3& numThreads)
noexcept;
155 static std::size_t
totalGlobalMem () noexcept {
return device_prop.totalGlobalMem; }
162 static std::string
deviceName () noexcept {
return std::string(device_prop.name); }
172#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
173 return std::string(
"AMD");
174#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
176 return std::string(
"NVIDIA");
177#elif defined(AMREX_USE_SYCL)
178 return device_prop.vendor;
180 return std::string(
"Unknown");
192#if defined(AMREX_USE_HIP)
194#elif defined(AMREX_USE_SYCL)
203 static Long maxMemAllocSize () noexcept {
return device_prop.maxMemAllocSize; }
204 static sycl::context& syclContext () {
return *sycl_context; }
205 static sycl::device& syclDevice () {
return *sycl_device; }
211 static void initialize_gpu (
bool minimal);
221 static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;
223 static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool;
232 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
233 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
236 friend StreamManager;
242#if defined(AMREX_USE_GPU)
303 if (sz == 0) {
return; }
305 auto& q = Device::streamQueue();
306 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
309 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice,
gpuStream()));,
317 if (sz == 0) {
return; }
319 auto& q = Device::streamQueue();
320 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
323 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost,
gpuStream()));,
331 if (sz == 0) {
return; }
333 auto& q = Device::streamQueue();
334 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
337 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice,
gpuStream()));,
347 if (sz == 0) {
return; }
348 std::memcpy(p_d, p_h, sz);
354 if (sz == 0) {
return; }
355 std::memcpy(p_h, p_d, sz);
359dtod_memcpy_async (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
361 if (sz == 0) {
return; }
362 std::memcpy(p_d_dst, p_d_src, sz);
368htod_memcpy (
void* p_d,
const void* p_h,
const std::size_t sz)
noexcept
370 if (sz == 0) {
return; }
376dtoh_memcpy (
void* p_h,
const void* p_d,
const std::size_t sz)
noexcept
378 if (sz == 0) {
return; }
384dtod_memcpy (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
386 if (sz == 0) {
return; }
391#ifdef AMREX_USE_HYPRE
392void hypreSynchronize ();
402#if defined(AMREX_USE_CUDA)
404 cudaMemcpyHostToDevice,
406#elif defined(AMREX_USE_HIP)
407 AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes,
offset,
408 hipMemcpyHostToDevice,
410#elif defined(AMREX_USE_SYCL)
411 Device::streamQueue().memcpy(dg, src, nbytes,
offset);
413 auto* p = (
char*)(&dg);
414 std::memcpy(p+
offset, src, nbytes);
425#if defined(AMREX_USE_CUDA)
427 cudaMemcpyDeviceToHost,
429#elif defined(AMREX_USE_HIP)
430 AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes,
offset,
431 hipMemcpyDeviceToHost,
433#elif defined(AMREX_USE_SYCL)
434 Device::streamQueue().memcpy(dst, dg, nbytes,
offset);
436 auto const* p = (
char const*)(&dg);
437 std::memcpy(dst, p+
offset, nbytes);
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
A virtual base class for objects that manage their own dynamic memory allocation.
Definition AMReX_Arena.H:105
Definition AMReX_GpuDevice.H:70
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:731
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:697
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:690
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:91
static void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.cpp:775
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:950
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1007
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:190
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:710
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:723
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:200
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:159
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:161
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:167
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:157
static void Finalize()
Definition AMReX_GpuDevice.cpp:450
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:740
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:918
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:162
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:83
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:984
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:757
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:78
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:170
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1179
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:992
static constexpr int warp_size
Definition AMReX_GpuDevice.H:197
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:156
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1146
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:765
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1168
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:160
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:679
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:158
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:685
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:96
static void Initialize(bool minimal, int a_device_id)
Definition AMReX_GpuDevice.cpp:213
static bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.cpp:785
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:999
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:155
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1065
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:166
Definition AMReX_GpuDevice.H:57
gpuStream_t & get()
Definition AMReX_GpuDevice.cpp:146
std::size_t wait_list_size()
Definition AMReX_GpuDevice.cpp:204
void sync()
Definition AMReX_GpuDevice.cpp:151
void free_async(Arena *arena, void *mem)
Definition AMReX_GpuDevice.cpp:184
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:329
void synchronize() noexcept
Definition AMReX_GpuDevice.H:257
bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.H:293
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:284
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:315
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:269
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:151
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:421
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:251
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:376
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:368
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:301
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:384
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:398
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:28
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83