1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
21#define AMREX_GPU_MAX_STREAMS 8
27#elif defined(AMREX_USE_CUDA)
29#elif defined(AMREX_USE_SYCL)
33 std::size_t totalGlobalMem;
34 std::size_t sharedMemPerBlock;
35 int multiProcessorCount;
36 int maxThreadsPerMultiProcessor;
37 int maxThreadsPerBlock;
43 int concurrentManagedAccess;
82 static void Initialize (
bool minimal,
int a_device_id);
85#if defined(AMREX_USE_GPU)
87 if (!external_stream_stack.empty()) {
88 AMREX_ASSERT(external_stream_stack.back().manager !=
nullptr);
89 return external_stream_stack.back().manager->getStream();
92 return gpu_stream_pool[gpu_stream_index[tid]].getStream();
100 static sycl::queue& streamQueue () noexcept {
return *(
gpuStream().queue); }
101 static sycl::queue& streamQueue (
int i)
noexcept {
return *(gpu_stream_pool[i].getStream().queue); }
168#if defined(__CUDACC__)
170 static void startGraphRecording(
bool first_iter,
void* h_ptr,
void* d_ptr,
size_t sz);
171 static cudaGraphExec_t stopGraphRecording(
bool last_iter);
174 static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
177 static void executeGraph(
const cudaGraphExec_t &graphExec,
bool synch =
true);
188 dim3& numBlocks, dim3& numThreads)
noexcept;
189 static void c_threads_and_blocks (
const int* lo,
const int* hi, dim3& numBlocks, dim3& numThreads)
noexcept;
192 static std::size_t
totalGlobalMem () noexcept {
return device_prop.totalGlobalMem; }
199 static std::string
deviceName () noexcept {
return std::string(device_prop.name); }
209#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
210 return std::string(
"AMD");
211#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
213 return std::string(
"NVIDIA");
214#elif defined(AMREX_USE_SYCL)
215 return device_prop.vendor;
217 return std::string(
"Unknown");
231#if defined(AMREX_USE_HIP)
233#elif defined(AMREX_USE_SYCL)
242 static Long maxMemAllocSize () noexcept {
return device_prop.maxMemAllocSize; }
243 static sycl::context& syclContext () {
return *sycl_context; }
244 static sycl::device& syclDevice () {
return *sycl_device; }
250 static void initialize_gpu (
bool minimal);
260 static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;
262 static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool;
270 struct ExternalStream
272 std::unique_ptr<StreamManager> manager;
273 int saved_stream_index = 0;
275 static AMREX_EXPORT Vector<ExternalStream> external_stream_stack;
278 static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
279 static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
282 friend StreamManager;
289#if defined(AMREX_USE_GPU)
398 : m_sync_on_exit(sync_on_exit)
400 Device::setExternalStream(stream);
410 Device::resetExternalStream(m_sync_on_exit);
423 if (sz == 0) {
return; }
425 auto& q = Device::streamQueue();
426 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
429 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice,
gpuStream()));,
437 if (sz == 0) {
return; }
439 auto& q = Device::streamQueue();
440 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
443 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost,
gpuStream()));,
451 if (sz == 0) {
return; }
453 auto& q = Device::streamQueue();
454 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
457 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice,
gpuStream()));,
467 if (sz == 0) {
return; }
468 std::memcpy(p_d, p_h, sz);
474 if (sz == 0) {
return; }
475 std::memcpy(p_h, p_d, sz);
479dtod_memcpy_async (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
481 if (sz == 0) {
return; }
482 std::memcpy(p_d_dst, p_d_src, sz);
488htod_memcpy (
void* p_d,
const void* p_h,
const std::size_t sz)
noexcept
490 if (sz == 0) {
return; }
496dtoh_memcpy (
void* p_h,
const void* p_d,
const std::size_t sz)
noexcept
498 if (sz == 0) {
return; }
504dtod_memcpy (
void* p_d_dst,
const void* p_d_src,
const std::size_t sz)
noexcept
506 if (sz == 0) {
return; }
511#ifdef AMREX_USE_HYPRE
512void hypreSynchronize ();
522#if defined(AMREX_USE_CUDA)
524 cudaMemcpyHostToDevice,
526#elif defined(AMREX_USE_HIP)
527 AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes,
offset,
528 hipMemcpyHostToDevice,
530#elif defined(AMREX_USE_SYCL)
531 Device::streamQueue().memcpy(dg, src, nbytes,
offset);
533 auto* p = (
char*)(&dg);
534 std::memcpy(p+
offset, src, nbytes);
545#if defined(AMREX_USE_CUDA)
547 cudaMemcpyDeviceToHost,
549#elif defined(AMREX_USE_HIP)
550 AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes,
offset,
551 hipMemcpyDeviceToHost,
553#elif defined(AMREX_USE_SYCL)
554 Device::streamQueue().memcpy(dst, dg, nbytes,
offset);
556 auto const* p = (
char const*)(&dg);
557 std::memcpy(dst, p+
offset, nbytes);
576 if (! inNoSyncRegionPreviously()) {
577 Gpu::streamSynchronizeActive();
584struct [[nodiscard]] SyncAtExitOnly {};
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
A virtual base class for objects that manage their own dynamic memory allocation.
Definition AMReX_Arena.H:127
Definition AMReX_GpuDevice.H:78
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:749
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:709
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:702
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:105
static void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.cpp:935
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:1148
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1205
static int managedMemorySupported() noexcept
Definition AMReX_GpuDevice.H:227
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:229
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:728
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:741
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:239
static void streamSynchronizeActive() noexcept
Definition AMReX_GpuDevice.cpp:908
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:196
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:198
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:204
static void streamOrderedFreeAsync(Arena *arena, void *mem, gpuStream_t stream) noexcept
Definition AMReX_GpuDevice.cpp:952
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:194
static void Finalize()
Definition AMReX_GpuDevice.cpp:462
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:843
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:1116
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:199
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:97
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:1182
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:855
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:86
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:207
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1377
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1190
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:193
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1344
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:920
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1366
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:197
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:691
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:195
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:697
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:114
static void setExternalStream(gpuStream_t s)
Definition AMReX_GpuDevice.cpp:769
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:836
static void Initialize(bool minimal, int a_device_id)
Definition AMReX_GpuDevice.cpp:225
static bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.cpp:976
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1197
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:192
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1263
static void resetExternalStream(ExternalStreamSync sync_stream=ExternalStreamSync::Yes) noexcept
Definition AMReX_GpuDevice.cpp:813
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:203
Definition AMReX_GpuDevice.H:64
std::size_t wait_list_size()
Definition AMReX_GpuDevice.cpp:216
void sync()
Definition AMReX_GpuDevice.cpp:159
void free_async(Arena *arena, void *mem)
Definition AMReX_GpuDevice.cpp:196
gpuStream_t & getStream()
Definition AMReX_GpuDevice.cpp:149
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:55
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:449
void synchronize() noexcept
Definition AMReX_GpuDevice.H:304
void streamSynchronizeActive() noexcept
Definition AMReX_GpuDevice.H:324
bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.H:354
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:345
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:435
ExternalStreamSync
Definition AMReX_GpuDevice.H:57
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:330
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:151
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:541
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:298
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:496
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:488
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:421
void setExternalGpuStream(gpuStream_t stream)
Provide a user-supplied GPU stream.
Definition AMReX_GpuDevice.H:369
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:504
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:518
bool isStreamActive(gpuStream_t s)
Check if the given stream is active.
Definition AMReX_GpuDevice.cpp:1400
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
void resetExternalGpuStream(ExternalStreamSync sync_stream=ExternalStreamSync::Yes) noexcept
Pop the current user-supplied GPU stream.
Definition AMReX_GpuDevice.H:385
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:28
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
Definition AMReX_GpuDevice.H:395
ExternalGpuStreamRegion(gpuStream_t stream, ExternalStreamSync sync_on_exit=ExternalStreamSync::Yes) noexcept
Definition AMReX_GpuDevice.H:396
ExternalGpuStreamRegion(ExternalGpuStreamRegion &&rhs) noexcept=delete
ExternalGpuStreamRegion(ExternalGpuStreamRegion const &)=delete
~ExternalGpuStreamRegion()
Definition AMReX_GpuDevice.H:408
Definition AMReX_GpuControl.H:184
Definition AMReX_GpuDevice.H:574
~SyncAtExitOnly()
Definition AMReX_GpuDevice.H:575