1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
20#define AMREX_GPU_MAX_STREAMS 8
26#elif defined(AMREX_USE_CUDA)
28#elif defined(AMREX_USE_SYCL)
32 std::size_t totalGlobalMem;
33 std::size_t sharedMemPerBlock;
34 int multiProcessorCount;
35 int maxThreadsPerMultiProcessor;
36 int maxThreadsPerBlock;
42 int concurrentManagedAccess;
59#if defined(AMREX_USE_GPU)
67 static sycl::queue& streamQueue (
int i)
noexcept {
return *(
gpu_stream_pool[i].queue); }
107#if defined(__CUDACC__)
109 static void startGraphRecording(
bool first_iter,
void* h_ptr,
void* d_ptr,
size_t sz);
110 static cudaGraphExec_t stopGraphRecording(
bool last_iter);
113 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
116 static void executeGraph(
const cudaGraphExec_t &graphExec,
bool synch =
true);
127 dim3& numBlocks, dim3& numThreads)
noexcept;
128 static void c_threads_and_blocks (
const int* lo,
const int* hi, dim3& numBlocks, dim3& numThreads)
noexcept;
148#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
149 return std::string(
"AMD");
150#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
152 return std::string(
"NVIDIA");
153#elif defined(AMREX_USE_SYCL)
156 return std::string(
"Unknown");
168#if defined(AMREX_USE_HIP)
170#elif defined(AMREX_USE_SYCL)
179 static Long maxMemAllocSize () noexcept {
return device_prop.maxMemAllocSize; }
180 static sycl::context& syclContext () {
return *sycl_context; }
181 static sycl::device& syclDevice () {
return *sycl_device; }
208 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
209 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
216#if defined(AMREX_USE_GPU)
253 if (sz == 0) {
return; }
255 auto& q = Device::streamQueue();
256 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
259 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice,
gpuStream()));,
267 if (sz == 0) {
return; }
269 auto& q = Device::streamQueue();
270 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
273 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost,
gpuStream()));,
281 if (sz == 0) {
return; }
283 auto& q = Device::streamQueue();
284 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
287 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice,
gpuStream()));,
293htod_memcpy (
void* p_d,
const void* p_h,
const std::size_t sz)
noexcept
295 if (sz == 0) {
return; }
301dtoh_memcpy (
void* p_h,
const void* p_d,
const std::size_t sz)
noexcept
303 if (sz == 0) {
return; }
309dtod_memcpy (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
311 if (sz == 0) {
return; }
318#ifdef AMREX_USE_HYPRE
319void hypreSynchronize ();
329#if defined(AMREX_USE_CUDA)
331 cudaMemcpyHostToDevice,
333#elif defined(AMREX_USE_HIP)
334 AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes,
offset,
335 hipMemcpyHostToDevice,
337#elif defined(AMREX_USE_SYCL)
338 Device::streamQueue().memcpy(dg, src, nbytes,
offset);
340 auto* p = (
char*)(&dg);
341 std::memcpy(p+
offset, src, nbytes);
352#if defined(AMREX_USE_CUDA)
354 cudaMemcpyDeviceToHost,
356#elif defined(AMREX_USE_HIP)
357 AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes,
offset,
358 hipMemcpyDeviceToHost,
360#elif defined(AMREX_USE_SYCL)
361 Device::streamQueue().memcpy(dst, dg, nbytes,
offset);
363 auto const* p = (
char const*)(&dg);
364 std::memcpy(dst, p+
offset, nbytes);
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
Definition AMReX_GpuDevice.H:52
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:655
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:626
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:619
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:71
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:851
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:899
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:166
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:634
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:647
static AMREX_EXPORT Vector< gpuStream_t > gpu_stream_pool
Definition AMReX_GpuDevice.H:199
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:176
static AMREX_EXPORT Vector< gpuStream_t > gpu_stream
Definition AMReX_GpuDevice.H:202
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:135
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:137
static AMREX_EXPORT int num_devices_used
Definition AMReX_GpuDevice.H:190
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:143
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:133
static void Finalize()
Definition AMReX_GpuDevice.cpp:383
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:664
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:828
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:138
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:63
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:876
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:681
static AMREX_EXPORT int num_device_partners
Definition AMReX_GpuDevice.H:191
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:60
static AMREX_EXPORT int verbose
Definition AMReX_GpuDevice.H:192
static AMREX_EXPORT dim3 numBlocksOverride
Definition AMReX_GpuDevice.H:197
static AMREX_EXPORT dim3 numThreadsMin
Definition AMReX_GpuDevice.H:196
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:146
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1071
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:884
static AMREX_EXPORT constexpr int warp_size
Definition AMReX_GpuDevice.H:173
static AMREX_EXPORT dim3 numThreadsOverride
Definition AMReX_GpuDevice.H:197
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:132
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1038
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:697
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1060
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:136
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:608
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:134
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:614
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:76
static AMREX_EXPORT int memory_pools_supported
Definition AMReX_GpuDevice.H:204
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:891
static AMREX_EXPORT unsigned int max_blocks_per_launch
Definition AMReX_GpuDevice.H:205
static AMREX_EXPORT int device_id
Definition AMReX_GpuDevice.H:189
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:131
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:957
static void initialize_gpu(bool minimal)
Definition AMReX_GpuDevice.cpp:413
static void Initialize(bool minimal)
Definition AMReX_GpuDevice.cpp:148
static AMREX_EXPORT int max_gpu_streams
Definition AMReX_GpuDevice.H:193
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:142
static AMREX_EXPORT gpuDeviceProp_t device_prop
Definition AMReX_GpuDevice.H:203
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:27
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:279
void synchronize() noexcept
Definition AMReX_GpuDevice.H:231
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:237
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:265
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:243
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:145
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:348
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:225
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:301
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:293
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:251
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:309
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:325
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:218
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:27
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:77