Block-Structured AMR Software Framework
 
Loading...
Searching...
No Matches
AMReX_GpuDevice.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX.H>
6#include <AMReX_Extension.H>
7#include <AMReX_Utility.H>
8#include <AMReX_GpuTypes.H>
9#include <AMReX_GpuError.H>
10#include <AMReX_GpuControl.H>
11#include <AMReX_OpenMP.H>
12#include <AMReX_Vector.H>
13
14#include <algorithm>
15#include <array>
16#include <cstdlib>
17#include <cstring>
18#include <memory>
19#include <mutex>
20
21#define AMREX_GPU_MAX_STREAMS 8
22
23#ifdef AMREX_USE_GPU
24namespace amrex {
25#ifdef AMREX_USE_HIP
26using gpuDeviceProp_t = hipDeviceProp_t;
27#elif defined(AMREX_USE_CUDA)
28using gpuDeviceProp_t = cudaDeviceProp;
29#elif defined(AMREX_USE_SYCL)
30 struct gpuDeviceProp_t {
31 std::string name;
32 std::string vendor; // SYCL only (inferred for CUDA and HIP)
33 std::size_t totalGlobalMem;
34 std::size_t sharedMemPerBlock;
35 int multiProcessorCount;
36 int maxThreadsPerMultiProcessor;
37 int maxThreadsPerBlock;
38 int maxThreadsDim[3];
39 int maxGridSize[3];
40 int warpSize;
41 Long maxMemAllocSize; // SYCL only
42 int managedMemory;
43 int concurrentManagedAccess;
44 int maxParameterSize;
45 };
46#endif
47}
48#endif
49
50namespace amrex {
51 class Arena;
52}
53
54namespace amrex::Gpu {
55
#ifdef AMREX_USE_GPU
//! Owns one GPU stream plus a list of deferred deallocations to be released
//! once the stream has been synchronized.
//! NOTE(review): the `class StreamManager {` opener was lost in extraction;
//! reconstructed from the member index (definitions at AMReX_GpuDevice.cpp:146-204).
class StreamManager {
    gpuStream_t m_stream;                              // the managed stream
    std::mutex m_mutex;                                // guards m_free_wait_list
    Vector<std::pair<Arena*, void*>> m_free_wait_list; // (arena, ptr) pairs queued by free_async
public:
    [[nodiscard]] gpuStream_t& get ();
    void sync ();
    void free_async (Arena* arena, void* mem);
    std::size_t wait_list_size ();
};
#endif
68
69class Device
70{
71
72public:
73
74 static void Initialize (bool minimal, int a_device_id);
75 static void Finalize ();
76
77#if defined(AMREX_USE_GPU)
78 static gpuStream_t gpuStream () noexcept {
79 return gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
80 }
81#ifdef AMREX_USE_CUDA
83 static cudaStream_t cudaStream () noexcept { return gpuStream(); }
84#endif
85#ifdef AMREX_USE_SYCL
86 static sycl::queue& streamQueue () noexcept { return *(gpuStream().queue); }
87 static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].get().queue); }
88#endif
89#endif
90
91 static int numGpuStreams () noexcept {
92 return inSingleStreamRegion() ? 1 : max_gpu_streams;
93 }
94
95 static void setStreamIndex (int idx) noexcept;
96 static void resetStreamIndex () noexcept { setStreamIndex(0); }
97
98#ifdef AMREX_USE_GPU
99 static int streamIndex (gpuStream_t s = gpuStream()) noexcept;
100
101 static gpuStream_t setStream (gpuStream_t s) noexcept;
102 static gpuStream_t resetStream () noexcept;
103#endif
104
105 static int deviceId () noexcept;
106 static int numDevicesUsed () noexcept; // Total number of device used
107 static int numDevicePartners () noexcept; // Number of partners sharing my device
108
113 static void synchronize () noexcept;
114
119 static void streamSynchronize () noexcept;
120
125 static void streamSynchronizeAll () noexcept;
126
127 static void freeAsync (Arena* arena, void* mem) noexcept;
128
129 static bool clearFreeAsyncBuffer () noexcept;
130
131#if defined(__CUDACC__)
133 static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
134 static cudaGraphExec_t stopGraphRecording(bool last_iter);
135
137 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
138
140 static void executeGraph(const cudaGraphExec_t &graphExec, bool synch = true);
141
142#endif
143
144 static void mem_advise_set_preferred (void* p, std::size_t sz, int device);
145 static void mem_advise_set_readonly (void* p, std::size_t sz);
146
147#ifdef AMREX_USE_GPU
148 static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
149 static void n_threads_and_blocks (const Long N, dim3& numBlocks, dim3& numThreads) noexcept;
150 static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
151 dim3& numBlocks, dim3& numThreads) noexcept;
152 static void c_threads_and_blocks (const int* lo, const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
153 static void grid_stride_threads_and_blocks (dim3& numBlocks, dim3& numThreads) noexcept;
154
155 static std::size_t totalGlobalMem () noexcept { return device_prop.totalGlobalMem; }
156 static std::size_t sharedMemPerBlock () noexcept { return device_prop.sharedMemPerBlock; }
157 static int numMultiProcessors () noexcept { return device_prop.multiProcessorCount; }
158 static int maxThreadsPerMultiProcessor () noexcept { return device_prop.maxThreadsPerMultiProcessor; }
159 static int maxThreadsPerBlock () noexcept { return device_prop.maxThreadsPerBlock; }
160 static int maxThreadsPerBlock (int dir) noexcept { return device_prop.maxThreadsDim[dir]; }
161 static int maxBlocksPerGrid (int dir) noexcept { return device_prop.maxGridSize[dir]; }
162 static std::string deviceName () noexcept { return std::string(device_prop.name); }
163#endif
164
165#ifdef AMREX_USE_CUDA
166 static int devicePropMajor () noexcept { return device_prop.major; }
167 static int devicePropMinor () noexcept { return device_prop.minor; }
168#endif
169
170 static std::string deviceVendor() noexcept
171 {
172#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
173 return std::string("AMD");
174#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
175 // Using HIP on NVIDIA GPUs isn't currently supported by AMReX
176 return std::string("NVIDIA");
177#elif defined(AMREX_USE_SYCL)
178 return device_prop.vendor;
179#else
180 return std::string("Unknown");
181#endif
182 }
183
184 static std::size_t freeMemAvailable ();
185 static void profilerStart ();
186 static void profilerStop ();
187
188#ifdef AMREX_USE_GPU
189
190 static int memoryPoolsSupported () noexcept { return memory_pools_supported; }
191
192#if defined(AMREX_USE_HIP)
193 static AMREX_EXPORT constexpr int warp_size = AMREX_AMDGCN_WAVEFRONT_SIZE;
194#elif defined(AMREX_USE_SYCL)
195 static AMREX_EXPORT constexpr int warp_size = AMREX_SYCL_SUB_GROUP_SIZE;
196#else
197 static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA(64,32);
198#endif
199
200 static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; }
201
202#ifdef AMREX_USE_SYCL
203 static Long maxMemAllocSize () noexcept { return device_prop.maxMemAllocSize; }
204 static sycl::context& syclContext () { return *sycl_context; }
205 static sycl::device& syclDevice () { return *sycl_device; }
206#endif
207#endif
208
209private:
210
211 static void initialize_gpu (bool minimal);
212
213 static AMREX_EXPORT int device_id;
214 static AMREX_EXPORT int num_devices_used;
215 static AMREX_EXPORT int num_device_partners;
216 static AMREX_EXPORT int verbose;
217 static AMREX_EXPORT int max_gpu_streams;
218
219#ifdef AMREX_USE_GPU
220 static AMREX_EXPORT dim3 numThreadsMin;
221 static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;
222
223 static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool; // The size of this is max_gpu_stream
224 // The non-owning gpu_stream_index is used to store the current stream index that will be used.
225 // gpu_stream_index is a vector so that it's thread safe to write to it.
226 static AMREX_EXPORT Vector<int> gpu_stream_index; // The size of this is omp_max_threads
227 static AMREX_EXPORT gpuDeviceProp_t device_prop;
228 static AMREX_EXPORT int memory_pools_supported;
229 static AMREX_EXPORT unsigned int max_blocks_per_launch;
230
231#ifdef AMREX_USE_SYCL
232 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
233 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
234#endif
235
236 friend StreamManager;
237#endif
238};
239
240// Put these in amrex::Gpu
241
#if defined(AMREX_USE_GPU)
//! Free-function convenience wrapper for Device::gpuStream().
inline gpuStream_t gpuStream () noexcept { return Device::gpuStream(); }
#endif
249
250inline int
251numGpuStreams () noexcept
252{
253 return Device::numGpuStreams();
254}
255
256inline void
257synchronize () noexcept
258{
260}
261
262inline void
264{
266}
267
268inline void
273
283inline void
284freeAsync (Arena* arena, void* mem) noexcept
285{
286 Device::freeAsync(arena, mem);
287}
288
292inline bool
294{
296}
297
298#ifdef AMREX_USE_GPU
299
300inline void
301htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
302{
303 if (sz == 0) { return; }
304#ifdef AMREX_USE_SYCL
305 auto& q = Device::streamQueue();
306 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
307#else
309 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpuStream()));,
310 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpuStream())); )
311#endif
312}
313
314inline void
315dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
316{
317 if (sz == 0) { return; }
318#ifdef AMREX_USE_SYCL
319 auto& q = Device::streamQueue();
320 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
321#else
323 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost, gpuStream()));,
324 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_h, p_d, sz, cudaMemcpyDeviceToHost, gpuStream())); )
325#endif
326}
327
328inline void
329dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
330{
331 if (sz == 0) { return; }
332#ifdef AMREX_USE_SYCL
333 auto& q = Device::streamQueue();
334 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
335#else
337 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice, gpuStream()));,
338 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice, gpuStream())); )
339#endif
340}
341
342#else // AMREX_USE_GPU
343
//! Host-only build: "device" memory is ordinary host memory, so the
//! asynchronous host-to-device copy degenerates to a plain memcpy.
inline void
htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_d, p_h, sz);
    }
}
350
//! Host-only build: the asynchronous device-to-host copy degenerates to a
//! plain memcpy.
inline void
dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_h, p_d, sz);
    }
}
357
//! Host-only build: the asynchronous device-to-device copy degenerates to a
//! plain memcpy.
inline void
dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_d_dst, p_d_src, sz);
    }
}
364
365#endif // AMREX_USE_GPU
366
367inline void
368htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept
369{
370 if (sz == 0) { return; }
371 htod_memcpy_async(p_d, p_h, sz);
373}
374
375inline void
376dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
377{
378 if (sz == 0) { return; }
379 dtoh_memcpy_async(p_h, p_d, sz);
381}
382
383inline void
384dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
385{
386 if (sz == 0) { return; }
387 dtod_memcpy_async(p_d_dst, p_d_src, sz);
389}
390
391#ifdef AMREX_USE_HYPRE
392void hypreSynchronize ();
393#endif
394
//! Copy nbytes from host memory src into the device-global variable dg,
//! starting at byte offset, asynchronously on the current stream.
//! Host-only builds fall back to a synchronous std::memcpy.
//! (The closing `Device::gpuStream()));` argument lines 405/409 were lost in
//! extraction and have been restored.)
template <typename T>
void memcpy_from_host_to_device_global_async (T& dg, const void* src,
                                              std::size_t nbytes,
                                              std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
    AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset,
                                                 cudaMemcpyHostToDevice,
                                                 Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
    AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset,
                                               hipMemcpyHostToDevice,
                                               Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
    Device::streamQueue().memcpy(dg, src, nbytes, offset);
#else
    auto* p = (char*)(&dg);
    std::memcpy(p+offset, src, nbytes);
#endif
}
417
//! Copy nbytes from the device-global variable dg, starting at byte offset,
//! into host memory dst, asynchronously on the current stream.
//! Host-only builds fall back to a synchronous std::memcpy.
//! (The closing `Device::gpuStream()));` argument lines 428/432 were lost in
//! extraction and have been restored.)
template <typename T>
void memcpy_from_device_global_to_host_async (void* dst, T const& dg,
                                              std::size_t nbytes,
                                              std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
    AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
                                                   cudaMemcpyDeviceToHost,
                                                   Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
    AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
                                                 hipMemcpyDeviceToHost,
                                                 Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
    Device::streamQueue().memcpy(dst, dg, nbytes, offset);
#else
    auto const* p = (char const*)(&dg);
    std::memcpy(dst, p+offset, nbytes);
#endif
}
440
441}
442
443#endif
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
A virtual base class for objects that manage their own dynamic memory allocation.
Definition AMReX_Arena.H:105
Definition AMReX_GpuDevice.H:70
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:731
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:697
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:690
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:91
static void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.cpp:775
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:950
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1007
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:190
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:710
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:723
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:200
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:159
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:161
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:167
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:157
static void Finalize()
Definition AMReX_GpuDevice.cpp:450
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:740
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:918
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:162
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:83
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:984
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:757
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:78
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:170
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1179
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:992
static constexpr int warp_size
Definition AMReX_GpuDevice.H:197
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:156
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1146
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:765
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1168
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:160
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:679
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:158
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:685
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:96
static void Initialize(bool minimal, int a_device_id)
Definition AMReX_GpuDevice.cpp:213
static bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.cpp:785
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:999
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:155
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1065
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:166
Definition AMReX_GpuDevice.H:57
gpuStream_t & get()
Definition AMReX_GpuDevice.cpp:146
std::size_t wait_list_size()
Definition AMReX_GpuDevice.cpp:204
void sync()
Definition AMReX_GpuDevice.cpp:151
void free_async(Arena *arena, void *mem)
Definition AMReX_GpuDevice.cpp:184
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:52
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:329
void synchronize() noexcept
Definition AMReX_GpuDevice.H:257
bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.H:293
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:284
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:315
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:269
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:151
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:421
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:251
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:376
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:368
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:301
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:384
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:398
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:28
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83