Block-Structured AMR Software Framework
AMReX_GpuDevice.H
Go to the documentation of this file.
1 #ifndef AMREX_GPU_DEVICE_H_
2 #define AMREX_GPU_DEVICE_H_
3 #include <AMReX_Config.H>
4 
5 #include <AMReX.H>
6 #include <AMReX_Extension.H>
7 #include <AMReX_Utility.H>
8 #include <AMReX_GpuTypes.H>
9 #include <AMReX_GpuError.H>
10 #include <AMReX_GpuControl.H>
11 #include <AMReX_OpenMP.H>
12 #include <AMReX_Vector.H>
13 
14 #include <algorithm>
15 #include <array>
16 #include <cstdlib>
17 #include <cstring>
18 #include <memory>
19 
20 #define AMREX_GPU_MAX_STREAMS 8
21 
22 #ifdef AMREX_USE_GPU
23 namespace amrex {
24 #ifdef AMREX_USE_HIP
25 using gpuDeviceProp_t = hipDeviceProp_t;
26 #elif defined(AMREX_USE_CUDA)
27 using gpuDeviceProp_t = cudaDeviceProp;
28 #elif defined(AMREX_USE_SYCL)
29  struct gpuDeviceProp_t {
30  std::string name;
31  std::string vendor; // SYCL only (inferred for CUDA and HIP)
32  std::size_t totalGlobalMem;
33  std::size_t sharedMemPerBlock;
34  int multiProcessorCount;
35  int maxThreadsPerMultiProcessor;
36  int maxThreadsPerBlock;
37  int maxThreadsDim[3];
38  int maxGridSize[3];
39  int warpSize;
40  Long maxMemAllocSize; // SYCL only
41  int managedMemory;
42  int concurrentManagedAccess;
43  int maxParameterSize;
44  };
45 #endif
46 }
47 #endif
48 
49 namespace amrex::Gpu {
50 
51 class Device
52 {
53 
54 public:
55 
56  static void Initialize ();
57  static void Finalize ();
58 
59 #if defined(AMREX_USE_GPU)
60  static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
61 #ifdef AMREX_USE_CUDA
63  static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
64 #endif
65 #ifdef AMREX_USE_SYCL
66  static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); }
67  static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); }
68 #endif
69 #endif
70 
71  static int numGpuStreams () noexcept {
73  }
74 
75  static void setStreamIndex (int idx) noexcept;
76  static void resetStreamIndex () noexcept { setStreamIndex(0); }
77 
78 #ifdef AMREX_USE_GPU
79  static int streamIndex (gpuStream_t s = gpuStream()) noexcept;
80 
81  static gpuStream_t setStream (gpuStream_t s) noexcept;
82  static gpuStream_t resetStream () noexcept;
83 #endif
84 
85  static int deviceId () noexcept;
86  static int numDevicesUsed () noexcept; // Total number of device used
87  static int numDevicePartners () noexcept; // Number of partners sharing my device
88 
93  static void synchronize () noexcept;
94 
99  static void streamSynchronize () noexcept;
100 
105  static void streamSynchronizeAll () noexcept;
106 
107 #if defined(__CUDACC__)
109  static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
110  static cudaGraphExec_t stopGraphRecording(bool last_iter);
111 
113  static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
114 
116  static void executeGraph(const cudaGraphExec_t &graphExec, bool synch = true);
117 
118 #endif
119 
120  static void mem_advise_set_preferred (void* p, std::size_t sz, int device);
121  static void mem_advise_set_readonly (void* p, std::size_t sz);
122 
123 #ifdef AMREX_USE_GPU
124  static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
125  static void n_threads_and_blocks (const Long N, dim3& numBlocks, dim3& numThreads) noexcept;
126  static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
127  dim3& numBlocks, dim3& numThreads) noexcept;
128  static void c_threads_and_blocks (const int* lo, const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
129  static void grid_stride_threads_and_blocks (dim3& numBlocks, dim3& numThreads) noexcept;
130 
131  static std::size_t totalGlobalMem () noexcept { return device_prop.totalGlobalMem; }
132  static std::size_t sharedMemPerBlock () noexcept { return device_prop.sharedMemPerBlock; }
133  static int numMultiProcessors () noexcept { return device_prop.multiProcessorCount; }
134  static int maxThreadsPerMultiProcessor () noexcept { return device_prop.maxThreadsPerMultiProcessor; }
135  static int maxThreadsPerBlock () noexcept { return device_prop.maxThreadsPerBlock; }
136  static int maxThreadsPerBlock (int dir) noexcept { return device_prop.maxThreadsDim[dir]; }
137  static int maxBlocksPerGrid (int dir) noexcept { return device_prop.maxGridSize[dir]; }
138  static std::string deviceName () noexcept { return std::string(device_prop.name); }
139 #endif
140 
141 #ifdef AMREX_USE_CUDA
142  static int devicePropMajor () noexcept { return device_prop.major; }
143  static int devicePropMinor () noexcept { return device_prop.minor; }
144 #endif
145 
146  static std::string deviceVendor() noexcept
147  {
148 #if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
149  return std::string("AMD");
150 #elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
151  // Using HIP on NVIDIA GPUs isn't currently supported by AMReX
152  return std::string("NVIDIA");
153 #elif defined(AMREX_USE_SYCL)
154  return device_prop.vendor;
155 #else
156  return std::string("Unknown");
157 #endif
158  }
159 
160  static std::size_t freeMemAvailable ();
161  static void profilerStart ();
162  static void profilerStop ();
163 
164 #ifdef AMREX_USE_GPU
165 
166  static int memoryPoolsSupported () noexcept { return memory_pools_supported; }
167 
168 #if defined(AMREX_USE_HIP)
169  static AMREX_EXPORT constexpr int warp_size = AMREX_AMDGCN_WAVEFRONT_SIZE;
170 #elif defined(AMREX_USE_SYCL)
171  static AMREX_EXPORT constexpr int warp_size = AMREX_SYCL_SUB_GROUP_SIZE;
172 #else
173  static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA(64,32);
174 #endif
175 
176  static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; }
177 
178 #ifdef AMREX_USE_SYCL
179  static Long maxMemAllocSize () noexcept { return device_prop.maxMemAllocSize; }
180  static sycl::context& syclContext () { return *sycl_context; }
181  static sycl::device& syclDevice () { return *sycl_device; }
182 #endif
183 #endif
184 
185 private:
186 
187  static void initialize_gpu ();
188 
192  static AMREX_EXPORT int verbose;
194 
195 #ifdef AMREX_USE_GPU
198 
199  static AMREX_EXPORT Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_stream
200  // The non-owning gpu_stream is used to store the current stream that will be used.
201  // gpu_stream is a vector so that it's thread safe to write to it.
202  static AMREX_EXPORT Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads
205  static AMREX_EXPORT unsigned int max_blocks_per_launch;
206 
207 #ifdef AMREX_USE_SYCL
208  static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
209  static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
210 #endif
211 #endif
212 };
213 
214 // Put these in amrex::Gpu
215 
216 #if defined(AMREX_USE_GPU)
217 inline gpuStream_t
218 gpuStream () noexcept
219 {
220  return Device::gpuStream();
221 }
222 #endif
223 
224 inline int
225 numGpuStreams () noexcept
226 {
227  return Device::numGpuStreams();
228 }
229 
230 inline void
231 synchronize () noexcept
232 {
234 }
235 
236 inline void
237 streamSynchronize () noexcept
238 {
240 }
241 
242 inline void
244 {
246 }
247 
248 #ifdef AMREX_USE_GPU
249 
250 inline void
251 htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
252 {
253  if (sz == 0) { return; }
254 #ifdef AMREX_USE_SYCL
255  auto& q = Device::streamQueue();
256  q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
257 #else
259  AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpuStream()));,
260  AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpuStream())); )
261 #endif
262 }
263 
264 inline void
265 dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
266 {
267  if (sz == 0) { return; }
268 #ifdef AMREX_USE_SYCL
269  auto& q = Device::streamQueue();
270  q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
271 #else
273  AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost, gpuStream()));,
274  AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_h, p_d, sz, cudaMemcpyDeviceToHost, gpuStream())); )
275 #endif
276 }
277 
278 inline void
279 dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
280 {
281  if (sz == 0) { return; }
282 #ifdef AMREX_USE_SYCL
283  auto& q = Device::streamQueue();
284  q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
285 #else
287  AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice, gpuStream()));,
288  AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice, gpuStream())); )
289 #endif
290 }
291 
292 inline void
293 htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept
294 {
295  if (sz == 0) { return; }
296  htod_memcpy_async(p_d, p_h, sz);
298 }
299 
300 inline void
301 dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
302 {
303  if (sz == 0) { return; }
304  dtoh_memcpy_async(p_h, p_d, sz);
306 }
307 
308 inline void
309 dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
310 {
311  if (sz == 0) { return; }
312  dtod_memcpy_async(p_d_dst, p_d_src, sz);
314 }
315 
316 #endif
317 
318 #ifdef AMREX_USE_HYPRE
319 void hypreSynchronize ();
320 #endif
321 
324 template <typename T>
325 void memcpy_from_host_to_device_global_async (T& dg, const void* src,
326  std::size_t nbytes,
327  std::size_t offset = 0)
328 {
329 #if defined(AMREX_USE_CUDA)
330  AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset,
331  cudaMemcpyHostToDevice,
332  Device::gpuStream()));
333 #elif defined(AMREX_USE_HIP)
334  AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset,
335  hipMemcpyHostToDevice,
336  Device::gpuStream()));
337 #elif defined(AMREX_USE_SYCL)
338  Device::streamQueue().memcpy(dg, src, nbytes, offset);
339 #else
340  auto* p = (char*)(&dg);
341  std::memcpy(p+offset, src, nbytes);
342 #endif
343 }
344 
347 template <typename T>
348 void memcpy_from_device_global_to_host_async (void* dst, T const& dg,
349  std::size_t nbytes,
350  std::size_t offset = 0)
351 {
352 #if defined(AMREX_USE_CUDA)
353  AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
354  cudaMemcpyDeviceToHost,
355  Device::gpuStream()));
356 #elif defined(AMREX_USE_HIP)
357  AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
358  hipMemcpyDeviceToHost,
359  Device::gpuStream()));
360 #elif defined(AMREX_USE_SYCL)
361  Device::streamQueue().memcpy(dst, dg, nbytes, offset);
362 #else
363  auto const* p = (char const*)(&dg);
364  std::memcpy(dst, p+offset, nbytes);
365 #endif
366 }
367 
368 }
369 
370 #endif
#define AMREX_EXPORT
Definition: AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition: AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition: AMReX_GpuError.H:73
Array4< int const > offset
Definition: AMReX_HypreMLABecLap.cpp:1089
Definition: AMReX_GpuDevice.H:52
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition: AMReX_GpuDevice.cpp:615
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition: AMReX_GpuDevice.cpp:586
static int numDevicePartners() noexcept
Definition: AMReX_GpuDevice.cpp:579
static int numGpuStreams() noexcept
Definition: AMReX_GpuDevice.H:71
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition: AMReX_GpuDevice.cpp:811
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:859
static int memoryPoolsSupported() noexcept
Definition: AMReX_GpuDevice.H:166
static void setStreamIndex(int idx) noexcept
Definition: AMReX_GpuDevice.cpp:594
static gpuStream_t resetStream() noexcept
Definition: AMReX_GpuDevice.cpp:607
static AMREX_EXPORT Vector< gpuStream_t > gpu_stream_pool
Definition: AMReX_GpuDevice.H:199
static unsigned int maxBlocksPerLaunch() noexcept
Definition: AMReX_GpuDevice.H:176
static AMREX_EXPORT Vector< gpuStream_t > gpu_stream
Definition: AMReX_GpuDevice.H:202
static int maxThreadsPerBlock() noexcept
Definition: AMReX_GpuDevice.H:135
static int maxBlocksPerGrid(int dir) noexcept
Definition: AMReX_GpuDevice.H:137
static AMREX_EXPORT int num_devices_used
Definition: AMReX_GpuDevice.H:190
static int devicePropMinor() noexcept
Definition: AMReX_GpuDevice.H:143
static int numMultiProcessors() noexcept
Definition: AMReX_GpuDevice.H:133
static void Finalize()
Definition: AMReX_GpuDevice.cpp:347
static void synchronize() noexcept
Definition: AMReX_GpuDevice.cpp:624
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition: AMReX_GpuDevice.cpp:788
static std::string deviceName() noexcept
Definition: AMReX_GpuDevice.H:138
static cudaStream_t cudaStream() noexcept
Definition: AMReX_GpuDevice.H:63
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition: AMReX_GpuDevice.cpp:836
static void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.cpp:641
static AMREX_EXPORT int num_device_partners
Definition: AMReX_GpuDevice.H:191
static gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:60
static AMREX_EXPORT int verbose
Definition: AMReX_GpuDevice.H:192
static AMREX_EXPORT dim3 numBlocksOverride
Definition: AMReX_GpuDevice.H:197
static AMREX_EXPORT dim3 numThreadsMin
Definition: AMReX_GpuDevice.H:196
static std::string deviceVendor() noexcept
Definition: AMReX_GpuDevice.H:146
static void profilerStop()
Definition: AMReX_GpuDevice.cpp:1031
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:844
static constexpr AMREX_EXPORT int warp_size
Definition: AMReX_GpuDevice.H:173
static AMREX_EXPORT dim3 numThreadsOverride
Definition: AMReX_GpuDevice.H:197
static std::size_t sharedMemPerBlock() noexcept
Definition: AMReX_GpuDevice.H:132
static std::size_t freeMemAvailable()
Definition: AMReX_GpuDevice.cpp:998
static void streamSynchronizeAll() noexcept
Definition: AMReX_GpuDevice.cpp:657
static void profilerStart()
Definition: AMReX_GpuDevice.cpp:1020
static int maxThreadsPerBlock(int dir) noexcept
Definition: AMReX_GpuDevice.H:136
static int deviceId() noexcept
Definition: AMReX_GpuDevice.cpp:568
static int maxThreadsPerMultiProcessor() noexcept
Definition: AMReX_GpuDevice.H:134
static int numDevicesUsed() noexcept
Definition: AMReX_GpuDevice.cpp:574
static void resetStreamIndex() noexcept
Definition: AMReX_GpuDevice.H:76
static AMREX_EXPORT int memory_pools_supported
Definition: AMReX_GpuDevice.H:204
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:851
static void initialize_gpu()
Definition: AMReX_GpuDevice.cpp:377
static AMREX_EXPORT unsigned int max_blocks_per_launch
Definition: AMReX_GpuDevice.H:205
static AMREX_EXPORT int device_id
Definition: AMReX_GpuDevice.H:189
static std::size_t totalGlobalMem() noexcept
Definition: AMReX_GpuDevice.H:131
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition: AMReX_GpuDevice.cpp:917
static void Initialize()
Definition: AMReX_GpuDevice.cpp:143
static AMREX_EXPORT int max_gpu_streams
Definition: AMReX_GpuDevice.H:193
static int devicePropMajor() noexcept
Definition: AMReX_GpuDevice.H:142
static AMREX_EXPORT gpuDeviceProp_t device_prop
Definition: AMReX_GpuDevice.H:203
Definition: AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:279
void synchronize() noexcept
Definition: AMReX_GpuDevice.H:231
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:265
void streamSynchronizeAll() noexcept
Definition: AMReX_GpuDevice.H:243
bool inSingleStreamRegion() noexcept
Definition: AMReX_GpuControl.H:145
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition: AMReX_GpuDevice.H:348
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void * memcpy(void *dest, const void *src, std::size_t count)
Definition: AMReX_GpuUtility.H:214
int numGpuStreams() noexcept
Definition: AMReX_GpuDevice.H:225
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:301
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:293
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:251
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:309
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition: AMReX_GpuDevice.H:325
gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:218
constexpr int get_thread_num()
Definition: AMReX_OpenMP.H:37
Definition: AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition: AMReX_GpuDevice.H:27
cudaStream_t gpuStream_t
Definition: AMReX_GpuControl.H:77