1 #ifndef AMREX_GPU_DEVICE_H_
2 #define AMREX_GPU_DEVICE_H_
3 #include <AMReX_Config.H>
20 #define AMREX_GPU_MAX_STREAMS 8
26 #elif defined(AMREX_USE_CUDA)
28 #elif defined(AMREX_USE_SYCL)
32 std::size_t totalGlobalMem;
33 std::size_t sharedMemPerBlock;
34 int multiProcessorCount;
35 int maxThreadsPerMultiProcessor;
36 int maxThreadsPerBlock;
42 int concurrentManagedAccess;
59 #if defined(AMREX_USE_GPU)
67 static sycl::queue& streamQueue (
int i) noexcept {
return *(
gpu_stream_pool[i].queue); }
107 #if defined(__CUDACC__)
109 static void startGraphRecording(
bool first_iter,
void* h_ptr,
void* d_ptr,
size_t sz);
110 static cudaGraphExec_t stopGraphRecording(
bool last_iter);
113 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
116 static void executeGraph(
const cudaGraphExec_t &graphExec,
bool synch =
true);
127 dim3& numBlocks, dim3& numThreads) noexcept;
128 static void c_threads_and_blocks (
const int* lo,
const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
141 #ifdef AMREX_USE_CUDA
148 #if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
149 return std::string(
"AMD");
150 #elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
152 return std::string(
"NVIDIA");
153 #elif defined(AMREX_USE_SYCL)
156 return std::string(
"Unknown");
168 #if defined(AMREX_USE_HIP)
170 #elif defined(AMREX_USE_SYCL)
178 #ifdef AMREX_USE_SYCL
179 static Long maxMemAllocSize () noexcept {
return device_prop.maxMemAllocSize; }
180 static sycl::context& syclContext () {
return *sycl_context; }
181 static sycl::device& syclDevice () {
return *sycl_device; }
207 #ifdef AMREX_USE_SYCL
208 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
209 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
216 #if defined(AMREX_USE_GPU)
253 if (sz == 0) {
return; }
254 #ifdef AMREX_USE_SYCL
255 auto& q = Device::streamQueue();
256 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
259 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice,
gpuStream()));,
267 if (sz == 0) {
return; }
268 #ifdef AMREX_USE_SYCL
269 auto& q = Device::streamQueue();
270 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
273 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost,
gpuStream()));,
281 if (sz == 0) {
return; }
282 #ifdef AMREX_USE_SYCL
283 auto& q = Device::streamQueue();
284 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
287 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice,
gpuStream()));,
293 htod_memcpy (
void* p_d,
const void* p_h,
const std::size_t sz) noexcept
295 if (sz == 0) {
return; }
301 dtoh_memcpy (
void* p_h,
const void* p_d,
const std::size_t sz) noexcept
303 if (sz == 0) {
return; }
309 dtod_memcpy (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz) noexcept
311 if (sz == 0) {
return; }
318 #ifdef AMREX_USE_HYPRE
319 void hypreSynchronize ();
324 template <
typename T>
329 #if defined(AMREX_USE_CUDA)
331 cudaMemcpyHostToDevice,
333 #elif defined(AMREX_USE_HIP)
334 AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes,
offset,
335 hipMemcpyHostToDevice,
337 #elif defined(AMREX_USE_SYCL)
338 Device::streamQueue().memcpy(dg, src, nbytes,
offset);
340 auto* p = (
char*)(&dg);
347 template <
typename T>
352 #if defined(AMREX_USE_CUDA)
354 cudaMemcpyDeviceToHost,
356 #elif defined(AMREX_USE_HIP)
357 AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes,
offset,
358 hipMemcpyDeviceToHost,
360 #elif defined(AMREX_USE_SYCL)
361 Device::streamQueue().memcpy(dst, dg, nbytes,
offset);
363 auto const* p = (
char const*)(&dg);
#define AMREX_EXPORT
Definition: AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition: AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition: AMReX_GpuError.H:73
Array4< int const > offset
Definition: AMReX_HypreMLABecLap.cpp:1089
Definition: AMReX_GpuDevice.H:52
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition: AMReX_GpuDevice.cpp:615
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition: AMReX_GpuDevice.cpp:586
static int numDevicePartners() noexcept
Definition: AMReX_GpuDevice.cpp:579
static int numGpuStreams() noexcept
Definition: AMReX_GpuDevice.H:71
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition: AMReX_GpuDevice.cpp:811
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:859
static int memoryPoolsSupported() noexcept
Definition: AMReX_GpuDevice.H:166
static void setStreamIndex(int idx) noexcept
Definition: AMReX_GpuDevice.cpp:594
static gpuStream_t resetStream() noexcept
Definition: AMReX_GpuDevice.cpp:607
static AMREX_EXPORT Vector< gpuStream_t > gpu_stream_pool
Definition: AMReX_GpuDevice.H:199
static unsigned int maxBlocksPerLaunch() noexcept
Definition: AMReX_GpuDevice.H:176
static AMREX_EXPORT Vector< gpuStream_t > gpu_stream
Definition: AMReX_GpuDevice.H:202
static int maxThreadsPerBlock() noexcept
Definition: AMReX_GpuDevice.H:135
static int maxBlocksPerGrid(int dir) noexcept
Definition: AMReX_GpuDevice.H:137
static AMREX_EXPORT int num_devices_used
Definition: AMReX_GpuDevice.H:190
static int devicePropMinor() noexcept
Definition: AMReX_GpuDevice.H:143
static int numMultiProcessors() noexcept
Definition: AMReX_GpuDevice.H:133
static void Finalize()
Definition: AMReX_GpuDevice.cpp:347
static void synchronize() noexcept
Definition: AMReX_GpuDevice.cpp:624
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition: AMReX_GpuDevice.cpp:788
static std::string deviceName() noexcept
Definition: AMReX_GpuDevice.H:138
static cudaStream_t cudaStream() noexcept
Definition: AMReX_GpuDevice.H:63
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition: AMReX_GpuDevice.cpp:836
static void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.cpp:641
static AMREX_EXPORT int num_device_partners
Definition: AMReX_GpuDevice.H:191
static gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:60
static AMREX_EXPORT int verbose
Definition: AMReX_GpuDevice.H:192
static AMREX_EXPORT dim3 numBlocksOverride
Definition: AMReX_GpuDevice.H:197
static AMREX_EXPORT dim3 numThreadsMin
Definition: AMReX_GpuDevice.H:196
static std::string deviceVendor() noexcept
Definition: AMReX_GpuDevice.H:146
static void profilerStop()
Definition: AMReX_GpuDevice.cpp:1031
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:844
static constexpr AMREX_EXPORT int warp_size
Definition: AMReX_GpuDevice.H:173
static AMREX_EXPORT dim3 numThreadsOverride
Definition: AMReX_GpuDevice.H:197
static std::size_t sharedMemPerBlock() noexcept
Definition: AMReX_GpuDevice.H:132
static std::size_t freeMemAvailable()
Definition: AMReX_GpuDevice.cpp:998
static void streamSynchronizeAll() noexcept
Definition: AMReX_GpuDevice.cpp:657
static void profilerStart()
Definition: AMReX_GpuDevice.cpp:1020
static int maxThreadsPerBlock(int dir) noexcept
Definition: AMReX_GpuDevice.H:136
static int deviceId() noexcept
Definition: AMReX_GpuDevice.cpp:568
static int maxThreadsPerMultiProcessor() noexcept
Definition: AMReX_GpuDevice.H:134
static int numDevicesUsed() noexcept
Definition: AMReX_GpuDevice.cpp:574
static void resetStreamIndex() noexcept
Definition: AMReX_GpuDevice.H:76
static AMREX_EXPORT int memory_pools_supported
Definition: AMReX_GpuDevice.H:204
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:851
static void initialize_gpu()
Definition: AMReX_GpuDevice.cpp:377
static AMREX_EXPORT unsigned int max_blocks_per_launch
Definition: AMReX_GpuDevice.H:205
static AMREX_EXPORT int device_id
Definition: AMReX_GpuDevice.H:189
static std::size_t totalGlobalMem() noexcept
Definition: AMReX_GpuDevice.H:131
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:917
static void Initialize()
Definition: AMReX_GpuDevice.cpp:143
static AMREX_EXPORT int max_gpu_streams
Definition: AMReX_GpuDevice.H:193
static int devicePropMajor() noexcept
Definition: AMReX_GpuDevice.H:142
static AMREX_EXPORT gpuDeviceProp_t device_prop
Definition: AMReX_GpuDevice.H:203
Definition: AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:279
void synchronize() noexcept
Definition: AMReX_GpuDevice.H:231
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:265
void streamSynchronizeAll() noexcept
Definition: AMReX_GpuDevice.H:243
bool inSingleStreamRegion() noexcept
Definition: AMReX_GpuControl.H:145
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition: AMReX_GpuDevice.H:348
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void * memcpy(void *dest, const void *src, std::size_t count)
Definition: AMReX_GpuUtility.H:214
int numGpuStreams() noexcept
Definition: AMReX_GpuDevice.H:225
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:301
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:293
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:251
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:309
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition: AMReX_GpuDevice.H:325
gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:218
constexpr int get_thread_num()
Definition: AMReX_OpenMP.H:37
Definition: AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition: AMReX_GpuDevice.H:27
cudaStream_t gpuStream_t
Definition: AMReX_GpuControl.H:77