Block-Structured AMR Software Framework
 
Loading...
Searching...
No Matches
AMReX_GpuDevice.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX.H>
6#include <AMReX_Extension.H>
7#include <AMReX_Utility.H>
8#include <AMReX_GpuTypes.H>
9#include <AMReX_GpuError.H>
10#include <AMReX_GpuControl.H>
11#include <AMReX_OpenMP.H>
12#include <AMReX_Vector.H>
13
14#include <algorithm>
15#include <array>
16#include <cstdlib>
17#include <cstring>
18#include <memory>
19#include <mutex>
20
21#define AMREX_GPU_MAX_STREAMS 8
22
23#ifdef AMREX_USE_GPU
24namespace amrex {
25#ifdef AMREX_USE_HIP
26using gpuDeviceProp_t = hipDeviceProp_t;
27#elif defined(AMREX_USE_CUDA)
28using gpuDeviceProp_t = cudaDeviceProp;
29#elif defined(AMREX_USE_SYCL)
// SYCL backend only: AMReX's own device-properties struct. The field names
// mirror those of cudaDeviceProp / hipDeviceProp_t, which the CUDA and HIP
// backends use directly via the aliases above.
30 struct gpuDeviceProp_t {
31 std::string name;
32 std::string vendor; // SYCL only (inferred for CUDA and HIP)
33 std::size_t totalGlobalMem;
34 std::size_t sharedMemPerBlock;
35 int multiProcessorCount;
36 int maxThreadsPerMultiProcessor;
37 int maxThreadsPerBlock;
38 int maxThreadsDim[3]; // per-dimension limit on threads in a block
39 int maxGridSize[3]; // per-dimension limit on blocks in a grid
40 int warpSize;
41 Long maxMemAllocSize; // SYCL only
42 int managedMemory; // int rather than bool, presumably to mirror the CUDA field -- confirm
43 int concurrentManagedAccess;
44 int maxParameterSize;
45 };
46#endif
47}
48#endif
49
50namespace amrex {
51 class Arena;
52}
53
54namespace amrex::Gpu {
55
56#ifdef AMREX_USE_GPU
// NOTE(review): tail of class StreamManager -- the class header and leading
// data members (m_stream, m_free_wait_list; original lines 56-60) were lost
// in extraction. See the cross-reference entries at the bottom of this file;
// confirm against the upstream header before editing.
59 std::mutex m_mutex; // guards this manager's mutable state (exact scope in the .cpp)
61public:
62 [[nodiscard]] gpuStream_t& get ();
63 void sync ();
64 void free_async (Arena* arena, void* mem);
65};
66#endif
67
// Static interface to the GPU device and its stream pool: initialization and
// teardown, stream selection, host/device synchronization, kernel launch
// geometry helpers, and accessors for cached device properties. Every member
// visible here is static.
//
// NOTE(review): this block passed through a lossy extraction -- a few inline
// bodies and the private static data-member declarations are missing below
// (marked where noticed); confirm against the upstream AMReX header.
68class Device
69{
70
71public:
72
 // Set up the device/stream state for this process; Finalize releases it.
73 static void Initialize (bool minimal, int a_device_id);
74 static void Finalize ();
75
76#if defined(AMREX_USE_GPU)
 // The stream currently selected for the calling thread.
 // NOTE(review): body lost in extraction (original line 78).
77 static gpuStream_t gpuStream () noexcept {
79 }
80#ifdef AMREX_USE_CUDA
 // Same stream, typed for direct use with CUDA runtime calls.
82 static cudaStream_t cudaStream () noexcept { return gpuStream(); }
83#endif
84#ifdef AMREX_USE_SYCL
 // SYCL queue backing the current stream, or stream i of the pool.
85 static sycl::queue& streamQueue () noexcept { return *(gpuStream().queue); }
86 static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].get().queue); }
87#endif
88#endif
89
 // NOTE(review): body lost in extraction (original line 91).
90 static int numGpuStreams () noexcept {
92 }
93
 // Select / reset the stream index used by the calling thread.
94 static void setStreamIndex (int idx) noexcept;
95 static void resetStreamIndex () noexcept { setStreamIndex(0); }
96
97#ifdef AMREX_USE_GPU
 // Pool index of stream s (defaults to the current stream).
98 static int streamIndex (gpuStream_t s = gpuStream()) noexcept;
99
 // Replace / restore the current stream. The returned gpuStream_t is
 // presumably the previously active stream -- confirm in the .cpp.
100 static gpuStream_t setStream (gpuStream_t s) noexcept;
101 static gpuStream_t resetStream () noexcept;
102#endif
103
104 static int deviceId () noexcept;
105 static int numDevicesUsed () noexcept; // Total number of devices used
106 static int numDevicePartners () noexcept; // Number of partners sharing my device
107
 // Synchronization: whole device / current stream / all streams in the pool.
112 static void synchronize () noexcept;
113
118 static void streamSynchronize () noexcept;
119
124 static void streamSynchronizeAll () noexcept;
125
 // Asynchronous free of arena-owned memory (semantics in the .cpp).
126 static void freeAsync (Arena* arena, void* mem) noexcept;
127
128#if defined(__CUDACC__)
 // CUDA-graph recording/instantiation helpers (CUDA compiler only).
130 static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
131 static cudaGraphExec_t stopGraphRecording(bool last_iter);
132
134 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
135
137 static void executeGraph(const cudaGraphExec_t &graphExec, bool synch = true);
138
139#endif
140
 // Memory-advice hints (backend behavior in the .cpp).
141 static void mem_advise_set_preferred (void* p, std::size_t sz, int device);
142 static void mem_advise_set_readonly (void* p, std::size_t sz);
143
144#ifdef AMREX_USE_GPU
 // Launch-geometry helpers: derive numBlocks/numThreads for N elements or
 // for an index box given by lo/hi.
145 static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
146 static void n_threads_and_blocks (const Long N, dim3& numBlocks, dim3& numThreads) noexcept;
147 static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
148 dim3& numBlocks, dim3& numThreads) noexcept;
149 static void c_threads_and_blocks (const int* lo, const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
150 static void grid_stride_threads_and_blocks (dim3& numBlocks, dim3& numThreads) noexcept;
151
 // Accessors for the cached device_prop record.
152 static std::size_t totalGlobalMem () noexcept { return device_prop.totalGlobalMem; }
153 static std::size_t sharedMemPerBlock () noexcept { return device_prop.sharedMemPerBlock; }
154 static int numMultiProcessors () noexcept { return device_prop.multiProcessorCount; }
155 static int maxThreadsPerMultiProcessor () noexcept { return device_prop.maxThreadsPerMultiProcessor; }
156 static int maxThreadsPerBlock () noexcept { return device_prop.maxThreadsPerBlock; }
157 static int maxThreadsPerBlock (int dir) noexcept { return device_prop.maxThreadsDim[dir]; }
158 static int maxBlocksPerGrid (int dir) noexcept { return device_prop.maxGridSize[dir]; }
159 static std::string deviceName () noexcept { return std::string(device_prop.name); }
160#endif
161
162#ifdef AMREX_USE_CUDA
 // CUDA compute-capability major/minor of the selected device.
163 static int devicePropMajor () noexcept { return device_prop.major; }
164 static int devicePropMinor () noexcept { return device_prop.minor; }
165#endif
166
 // Vendor string: fixed per backend, except SYCL which reports the queried
 // device vendor.
167 static std::string deviceVendor() noexcept
168 {
169#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
170 return std::string("AMD");
171#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
172 // Using HIP on NVIDIA GPUs isn't currently supported by AMReX
173 return std::string("NVIDIA");
174#elif defined(AMREX_USE_SYCL)
175 return device_prop.vendor;
176#else
177 return std::string("Unknown");
178#endif
179 }
180
181 static std::size_t freeMemAvailable ();
182 static void profilerStart ();
183 static void profilerStop ();
184
185#ifdef AMREX_USE_GPU
186
187 static int memoryPoolsSupported () noexcept { return memory_pools_supported; }
188
 // Compile-time warp / wavefront / sub-group width for the active backend.
189#if defined(AMREX_USE_HIP)
190 static AMREX_EXPORT constexpr int warp_size = AMREX_AMDGCN_WAVEFRONT_SIZE;
191#elif defined(AMREX_USE_SYCL)
192 static AMREX_EXPORT constexpr int warp_size = AMREX_SYCL_SUB_GROUP_SIZE;
193#else
194 static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA(64,32);
195#endif
196
197 static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; }
198
199#ifdef AMREX_USE_SYCL
200 static Long maxMemAllocSize () noexcept { return device_prop.maxMemAllocSize; }
201 static sycl::context& syclContext () { return *sycl_context; }
202 static sycl::device& syclDevice () { return *sycl_device; }
203#endif
204#endif
205
206private:
207
208 static void initialize_gpu (bool minimal);
209
 // NOTE(review): the private static data members declared here in the
 // upstream header (device_id, num_devices_used, num_device_partners,
 // verbose, max_gpu_streams, numThreadsMin, numBlocksOverride,
 // numThreadsOverride, device_prop, memory_pools_supported,
 // max_blocks_per_launch, friend StreamManager, ...) were dropped by the
 // extraction; see the cross-reference list at the bottom of this file.
215
216#ifdef AMREX_USE_GPU
219
220 static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool; // The size of this is max_gpu_stream
221 // The non-owning gpu_stream_index is used to store the current stream index that will be used.
222 // gpu_stream_index is a vector so that it's thread safe to write to it.
223 static AMREX_EXPORT Vector<int> gpu_stream_index; // The size of this is omp_max_threads
227
228#ifdef AMREX_USE_SYCL
229 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
230 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
231#endif
232
234#endif
235};
236
237// Put these in amrex::Gpu
238
239#if defined(AMREX_USE_GPU)
240inline gpuStream_t
241gpuStream () noexcept
242{
243 return Device::gpuStream();
244}
245#endif
246
247inline int
248numGpuStreams () noexcept
249{
250 return Device::numGpuStreams();
251}
252
253inline void
254synchronize () noexcept
255{
257}
258
259inline void
261{
263}
264
265inline void
270
280inline void
281freeAsync (Arena* arena, void* mem) noexcept
282{
283 Device::freeAsync(arena, mem);
284}
285
286#ifdef AMREX_USE_GPU
287
288inline void
289htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
290{
291 if (sz == 0) { return; }
292#ifdef AMREX_USE_SYCL
293 auto& q = Device::streamQueue();
294 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
295#else
297 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpuStream()));,
298 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpuStream())); )
299#endif
300}
301
302inline void
303dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
304{
305 if (sz == 0) { return; }
306#ifdef AMREX_USE_SYCL
307 auto& q = Device::streamQueue();
308 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
309#else
311 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost, gpuStream()));,
312 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_h, p_d, sz, cudaMemcpyDeviceToHost, gpuStream())); )
313#endif
314}
315
316inline void
317dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
318{
319 if (sz == 0) { return; }
320#ifdef AMREX_USE_SYCL
321 auto& q = Device::streamQueue();
322 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
323#else
325 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice, gpuStream()));,
326 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice, gpuStream())); )
327#endif
328}
329
330#else // AMREX_USE_GPU
331
//! CPU build (AMREX_USE_GPU undefined): the "asynchronous" host-to-device
//! copy degenerates to a plain, blocking std::memcpy. No-op when sz == 0.
inline void htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_d, p_h, sz);
    }
}
338
//! CPU build (AMREX_USE_GPU undefined): the "asynchronous" device-to-host
//! copy degenerates to a plain, blocking std::memcpy. No-op when sz == 0.
inline void dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_h, p_d, sz);
    }
}
345
//! CPU build (AMREX_USE_GPU undefined): the "asynchronous" device-to-device
//! copy degenerates to a plain, blocking std::memcpy. No-op when sz == 0.
inline void dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_d_dst, p_d_src, sz);
    }
}
352
353#endif // AMREX_USE_GPU
354
355inline void
356htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept
357{
358 if (sz == 0) { return; }
359 htod_memcpy_async(p_d, p_h, sz);
361}
362
363inline void
364dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
365{
366 if (sz == 0) { return; }
367 dtoh_memcpy_async(p_h, p_d, sz);
369}
370
371inline void
372dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
373{
374 if (sz == 0) { return; }
375 dtod_memcpy_async(p_d_dst, p_d_src, sz);
377}
378
379#ifdef AMREX_USE_HYPRE
380void hypreSynchronize ();
381#endif
382
//! Asynchronously copy nbytes from host memory `src` into the device-global
//! variable `dg`, starting at byte `offset`, on the current stream/queue.
//! On CPU-only builds this is a plain memcpy into the (host-resident) global.
//! NOTE(review): the closing `Device::gpuStream()));` argument lines of the
//! CUDA and HIP calls (original lines 393 and 397) were dropped by the
//! extraction and are restored here.
template <typename T>
void memcpy_from_host_to_device_global_async (T& dg, const void* src,
                                              std::size_t nbytes,
                                              std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
    AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset,
                                                 cudaMemcpyHostToDevice,
                                                 Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
    AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset,
                                               hipMemcpyHostToDevice,
                                               Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
    Device::streamQueue().memcpy(dg, src, nbytes, offset);
#else
    auto* p = (char*)(&dg);
    std::memcpy(p+offset, src, nbytes);
#endif
}
405
//! Asynchronously copy nbytes from the device-global variable `dg`, starting
//! at byte `offset`, into host memory `dst`, on the current stream/queue.
//! On CPU-only builds this is a plain memcpy from the (host-resident) global.
//! NOTE(review): the closing `Device::gpuStream()));` argument lines of the
//! CUDA and HIP calls (original lines 416 and 420) were dropped by the
//! extraction and are restored here.
template <typename T>
void memcpy_from_device_global_to_host_async (void* dst, T const& dg,
                                              std::size_t nbytes,
                                              std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
    AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
                                                   cudaMemcpyDeviceToHost,
                                                   Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
    AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
                                                 hipMemcpyDeviceToHost,
                                                 Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
    Device::streamQueue().memcpy(dst, dg, nbytes, offset);
#else
    auto const* p = (char const*)(&dg);
    std::memcpy(dst, p+offset, nbytes);
#endif
}
428
429}
430
431#endif
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
A virtual base class for objects that manage their own dynamic memory allocation.
Definition AMReX_Arena.H:105
Definition AMReX_GpuDevice.H:69
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:724
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:690
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:683
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:90
static void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.cpp:768
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:926
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:983
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:187
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:703
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:716
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:197
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:156
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:158
static int num_devices_used
Definition AMReX_GpuDevice.H:211
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:164
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:154
static void Finalize()
Definition AMReX_GpuDevice.cpp:443
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:733
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:894
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:159
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:82
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:960
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:750
static int num_device_partners
Definition AMReX_GpuDevice.H:212
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:77
static int verbose
Definition AMReX_GpuDevice.H:213
static dim3 numBlocksOverride
Definition AMReX_GpuDevice.H:218
static dim3 numThreadsMin
Definition AMReX_GpuDevice.H:217
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:167
static Vector< int > gpu_stream_index
Definition AMReX_GpuDevice.H:223
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1155
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:968
static constexpr int warp_size
Definition AMReX_GpuDevice.H:194
friend StreamManager
Definition AMReX_GpuDevice.H:233
static dim3 numThreadsOverride
Definition AMReX_GpuDevice.H:218
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:153
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1122
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:758
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1144
static Vector< StreamManager > gpu_stream_pool
Definition AMReX_GpuDevice.H:220
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:157
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:672
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:155
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:678
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:95
static void Initialize(bool minimal, int a_device_id)
Definition AMReX_GpuDevice.cpp:206
static int memory_pools_supported
Definition AMReX_GpuDevice.H:225
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:975
static unsigned int max_blocks_per_launch
Definition AMReX_GpuDevice.H:226
static int device_id
Definition AMReX_GpuDevice.H:210
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:152
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1041
static void initialize_gpu(bool minimal)
Definition AMReX_GpuDevice.cpp:474
static int max_gpu_streams
Definition AMReX_GpuDevice.H:214
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:163
static gpuDeviceProp_t device_prop
Definition AMReX_GpuDevice.H:224
Definition AMReX_GpuDevice.H:57
Vector< std::pair< Arena *, void * > > m_free_wait_list
Definition AMReX_GpuDevice.H:60
gpuStream_t & get()
Definition AMReX_GpuDevice.cpp:146
std::mutex m_mutex
Definition AMReX_GpuDevice.H:59
void sync()
Definition AMReX_GpuDevice.cpp:151
void free_async(Arena *arena, void *mem)
Definition AMReX_GpuDevice.cpp:184
gpuStream_t m_stream
Definition AMReX_GpuDevice.H:58
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:317
void synchronize() noexcept
Definition AMReX_GpuDevice.H:254
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:281
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:260
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:303
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:266
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:151
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:409
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:248
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:364
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:356
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:289
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:372
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:386
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:241
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:28
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83