Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuDevice.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_DEVICE_H_
2#define AMREX_GPU_DEVICE_H_
3#include <AMReX_Config.H>
4
5#include <AMReX.H>
6#include <AMReX_Extension.H>
7#include <AMReX_Utility.H>
8#include <AMReX_GpuTypes.H>
9#include <AMReX_GpuError.H>
10#include <AMReX_GpuControl.H>
11#include <AMReX_OpenMP.H>
12#include <AMReX_Vector.H>
13
14#include <algorithm>
15#include <array>
16#include <cstdlib>
17#include <cstring>
18#include <memory>
19#include <mutex>
20
21#define AMREX_GPU_MAX_STREAMS 8
22
23#ifdef AMREX_USE_GPU
24namespace amrex {
25#ifdef AMREX_USE_HIP
26using gpuDeviceProp_t = hipDeviceProp_t;
27#elif defined(AMREX_USE_CUDA)
28using gpuDeviceProp_t = cudaDeviceProp;
29#elif defined(AMREX_USE_SYCL)
    //! SYCL stand-in for cudaDeviceProp / hipDeviceProp_t: the subset of
    //! device properties AMReX reads, with matching field names so
    //! Device's accessors work unchanged across backends.
    struct gpuDeviceProp_t {
        std::string name;                 // device name string
        std::string vendor; // SYCL only (inferred for CUDA and HIP)
        std::size_t totalGlobalMem;       // global memory size in bytes
        std::size_t sharedMemPerBlock;    // local/shared memory per work-group
        int multiProcessorCount;
        int maxThreadsPerMultiProcessor;
        int maxThreadsPerBlock;
        int maxThreadsDim[3];             // per-dimension work-group limits
        int maxGridSize[3];               // per-dimension grid limits
        int warpSize;                     // sub-group size
        Long maxMemAllocSize; // SYCL only
        int managedMemory;                // nonzero if USM/managed memory is supported
        int concurrentManagedAccess;
        int maxParameterSize;
    };
46#endif
47}
48#endif
49
50namespace amrex {
51 class Arena;
52}
53
54namespace amrex::Gpu {
55
56#ifdef AMREX_USE_GPU
//! Whether to synchronize a user-supplied (external) stream when it is
//! popped via Device::resetExternalStream / resetExternalGpuStream.
//! NOTE: the `enum class ExternalStreamSync {` header line was lost in the
//! page extraction (internal line 57); restored here.
enum class ExternalStreamSync {
    Yes,
    No
};
61#endif
62
63#ifdef AMREX_USE_GPU
65 gpuStream_t m_stream;
66 std::mutex m_mutex;
67 Vector<std::pair<Arena*, void*>> m_free_wait_list;
68public:
69 [[nodiscard]] gpuStream_t& getStream ();
70 [[nodiscard]] gpuStream_t const& getStream () const;
71 void sync ();
72 void free_async (Arena* arena, void* mem);
73 std::size_t wait_list_size ();
74};
75#endif
76
/**
 * Static interface to the GPU device: initialization/finalization, the
 * per-thread stream pool, external (user-supplied) streams,
 * synchronization, kernel launch geometry, and cached device properties.
 */
class Device
{

public:

    static void Initialize (bool minimal, int a_device_id);
    static void Finalize ();

#if defined(AMREX_USE_GPU)
    //! The stream to use right now: the innermost pushed external stream
    //! if any, otherwise this OpenMP thread's stream from the pool.
    static gpuStream_t gpuStream () noexcept {
        if (!external_stream_stack.empty()) {
            AMREX_ASSERT(external_stream_stack.back().manager != nullptr);
            return external_stream_stack.back().manager->getStream();
        } else {
            int tid = OpenMP::get_thread_num();
            return gpu_stream_pool[gpu_stream_index[tid]].getStream();
        }
    }
#ifdef AMREX_USE_CUDA
    //! Current stream typed as a native CUDA stream.
    static cudaStream_t cudaStream () noexcept { return gpuStream(); }
#endif
#ifdef AMREX_USE_SYCL
    //! SYCL queue of the current stream / of pool stream i.
    static sycl::queue& streamQueue () noexcept { return *(gpuStream().queue); }
    static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].getStream().queue); }
#endif
#endif

    //! 1 inside a single-stream or external-stream region, otherwise the
    //! size of the stream pool.
    static int numGpuStreams () noexcept {
#ifdef AMREX_USE_GPU
        return (inSingleStreamRegion() || usingExternalStream()) ? 1 : max_gpu_streams;
#else
        return 1;
#endif
    }

    static void setStreamIndex (int idx) noexcept;
    static void resetStreamIndex () noexcept { setStreamIndex(0); }

#ifdef AMREX_USE_GPU
    //! Pool index of stream s (defaults to the current stream).
    static int streamIndex (gpuStream_t s = gpuStream()) noexcept;

    //! Replace / restore the current stream; setStream returns the previous one.
    static gpuStream_t setStream (gpuStream_t s) noexcept;
    static gpuStream_t resetStream () noexcept;

    //! Push a user-supplied GPU stream (not noexcept: may allocate).
    static void setExternalStream (gpuStream_t s);
    //! Pop the user-supplied stream, optionally synchronizing it first.
    static void resetExternalStream (ExternalStreamSync sync_stream = ExternalStreamSync::Yes) noexcept;
    static bool usingExternalStream () noexcept;
#endif

    static int deviceId () noexcept;
    static int numDevicesUsed () noexcept; // Total number of device used
    static int numDevicePartners () noexcept; // Number of partners sharing my device

    //! Block the host until all previously issued device work has finished.
    static void synchronize () noexcept;

    //! Block the host until the current (or given) stream has finished.
    static void streamSynchronize () noexcept;
#ifdef AMREX_USE_GPU
    static void streamSynchronize (gpuStream_t s) noexcept;
#endif

    //! Synchronize the currently active stream(s).
    //! NOTE(review): exact scope lives in AMReX_GpuDevice.cpp — confirm there.
    static void streamSynchronizeActive () noexcept;

    //! Synchronize every stream in the pool.
    static void streamSynchronizeAll () noexcept;

    static void freeAsync (Arena* arena, void* mem) noexcept;
#ifdef AMREX_USE_GPU
    static void streamOrderedFreeAsync (Arena* arena, void* mem, gpuStream_t stream) noexcept;
#endif

    static bool clearFreeAsyncBuffer () noexcept;

#if defined(__CUDACC__)
    //! Begin / end capture of launched kernels into a CUDA graph.
    static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
    static cudaGraphExec_t stopGraphRecording(bool last_iter);

    //! Instantiate a captured graph into an executable graph.
    static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);

    //! Launch an instantiated graph, optionally synchronizing afterwards.
    static void executeGraph(const cudaGraphExec_t &graphExec, bool synch = true);

#endif

    static void mem_advise_set_preferred (void* p, std::size_t sz, int device);
    static void mem_advise_set_readonly (void* p, std::size_t sz);

#ifdef AMREX_USE_GPU
    // Launch-geometry helpers (implemented in AMReX_GpuDevice.cpp).
    static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
    static void n_threads_and_blocks (const Long N, dim3& numBlocks, dim3& numThreads) noexcept;
    static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
                                            dim3& numBlocks, dim3& numThreads) noexcept;
    static void c_threads_and_blocks (const int* lo, const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
    static void grid_stride_threads_and_blocks (dim3& numBlocks, dim3& numThreads) noexcept;

    // Accessors for the cached device properties (device_prop).
    static std::size_t totalGlobalMem () noexcept { return device_prop.totalGlobalMem; }
    static std::size_t sharedMemPerBlock () noexcept { return device_prop.sharedMemPerBlock; }
    static int numMultiProcessors () noexcept { return device_prop.multiProcessorCount; }
    static int maxThreadsPerMultiProcessor () noexcept { return device_prop.maxThreadsPerMultiProcessor; }
    static int maxThreadsPerBlock () noexcept { return device_prop.maxThreadsPerBlock; }
    static int maxThreadsPerBlock (int dir) noexcept { return device_prop.maxThreadsDim[dir]; }
    static int maxBlocksPerGrid (int dir) noexcept { return device_prop.maxGridSize[dir]; }
    static std::string deviceName () noexcept { return std::string(device_prop.name); }
#endif

#ifdef AMREX_USE_CUDA
    static int devicePropMajor () noexcept { return device_prop.major; }
    static int devicePropMinor () noexcept { return device_prop.minor; }
#endif

    static std::string deviceVendor() noexcept
    {
#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
        return std::string("AMD");
#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
        // Using HIP on NVIDIA GPUs isn't currently supported by AMReX
        return std::string("NVIDIA");
#elif defined(AMREX_USE_SYCL)
        return device_prop.vendor;
#else
        return std::string("Unknown");
#endif
    }

    static std::size_t freeMemAvailable ();
    static void profilerStart ();
    static void profilerStop ();

#ifdef AMREX_USE_GPU

    static int managedMemorySupported () noexcept { return device_prop.managedMemory; }

    static int memoryPoolsSupported () noexcept { return memory_pools_supported; }

#if defined(AMREX_USE_HIP)
    static AMREX_EXPORT constexpr int warp_size = AMREX_AMDGCN_WAVEFRONT_SIZE;
#elif defined(AMREX_USE_SYCL)
    static AMREX_EXPORT constexpr int warp_size = AMREX_SYCL_SUB_GROUP_SIZE;
#else
    static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA(64,32);
#endif

    static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; }

#ifdef AMREX_USE_SYCL
    static Long maxMemAllocSize () noexcept { return device_prop.maxMemAllocSize; }
    static sycl::context& syclContext () { return *sycl_context; }
    static sycl::device& syclDevice () { return *sycl_device; }
#endif
#endif

private:

    static void initialize_gpu (bool minimal);

    static AMREX_EXPORT int device_id;
    static AMREX_EXPORT int num_devices_used;
    static AMREX_EXPORT int num_device_partners;
    static AMREX_EXPORT int verbose;
    static AMREX_EXPORT int max_gpu_streams;

#ifdef AMREX_USE_GPU
    static AMREX_EXPORT dim3 numThreadsMin;
    static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;

    static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool; // The size of this is max_gpu_stream
    // The non-owning gpu_stream_index is used to store the current stream index that will be used.
    // gpu_stream_index is a vector so that it's thread safe to write to it.
    static AMREX_EXPORT Vector<int> gpu_stream_index; // The size of this is omp_max_threads
    static AMREX_EXPORT gpuDeviceProp_t device_prop;
    static AMREX_EXPORT int memory_pools_supported;
    static AMREX_EXPORT unsigned int max_blocks_per_launch;

    //! A pushed external stream plus the pool index to restore when it is popped.
    struct ExternalStream
    {
        std::unique_ptr<StreamManager> manager;
        int saved_stream_index = 0;
    };
    static AMREX_EXPORT Vector<ExternalStream> external_stream_stack;

#ifdef AMREX_USE_SYCL
    static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
    static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
#endif

    friend StreamManager;

#endif
};
286
287// Put these in amrex::Gpu
288
289#if defined(AMREX_USE_GPU)
//! Free-function convenience wrapper: the current stream (Device::gpuStream()).
inline gpuStream_t
gpuStream () noexcept
{
    return Device::gpuStream();
}
295#endif
296
//! Free-function convenience wrapper for Device::numGpuStreams().
inline int
numGpuStreams () noexcept
{
    return Device::numGpuStreams();
}
302
303inline void
304synchronize () noexcept
305{
307}
308
309inline void
311{
313}
314
315#ifdef AMREX_USE_GPU
316inline void
321#endif
322
323inline void
328
329inline void
334
//! Free-function convenience wrapper for Device::freeAsync(arena, mem).
inline void
freeAsync (Arena* arena, void* mem) noexcept
{
    Device::freeAsync(arena, mem);
}
349
353inline bool
355{
357}
358
359#ifdef AMREX_USE_GPU
360
368inline void
373
384inline void
389
394struct [[nodiscard]] ExternalGpuStreamRegion
395{
397 ExternalStreamSync sync_on_exit = ExternalStreamSync::Yes) noexcept
398 : m_sync_on_exit(sync_on_exit)
399 {
400 Device::setExternalStream(stream);
401 }
402
404 ExternalGpuStreamRegion& operator= (ExternalGpuStreamRegion&& rhs) noexcept = delete;
407
409 {
410 Device::resetExternalStream(m_sync_on_exit);
411 }
412
413private:
414 ExternalStreamSync m_sync_on_exit = ExternalStreamSync::Yes;
415};
416#endif
417
418#ifdef AMREX_USE_GPU
419
420inline void
421htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
422{
423 if (sz == 0) { return; }
424#ifdef AMREX_USE_SYCL
425 auto& q = Device::streamQueue();
426 q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
427#else
429 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpuStream()));,
430 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpuStream())); )
431#endif
432}
433
434inline void
435dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
436{
437 if (sz == 0) { return; }
438#ifdef AMREX_USE_SYCL
439 auto& q = Device::streamQueue();
440 q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
441#else
443 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost, gpuStream()));,
444 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_h, p_d, sz, cudaMemcpyDeviceToHost, gpuStream())); )
445#endif
446}
447
448inline void
449dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
450{
451 if (sz == 0) { return; }
452#ifdef AMREX_USE_SYCL
453 auto& q = Device::streamQueue();
454 q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
455#else
457 AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice, gpuStream()));,
458 AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice, gpuStream())); )
459#endif
460}
461
462#else // AMREX_USE_GPU
463
//! CPU build: the "asynchronous" host-to-device copy degenerates to a
//! plain memcpy; zero-length requests do nothing.
inline void
htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_d, p_h, sz);
    }
}
470
//! CPU build: the "asynchronous" device-to-host copy degenerates to a
//! plain memcpy; zero-length requests do nothing.
inline void
dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_h, p_d, sz);
    }
}
477
//! CPU build: the "asynchronous" device-to-device copy degenerates to a
//! plain memcpy; zero-length requests do nothing.
inline void
dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
{
    if (sz != 0) {
        std::memcpy(p_d_dst, p_d_src, sz);
    }
}
484
485#endif // AMREX_USE_GPU
486
487inline void
488htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept
489{
490 if (sz == 0) { return; }
491 htod_memcpy_async(p_d, p_h, sz);
493}
494
495inline void
496dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
497{
498 if (sz == 0) { return; }
499 dtoh_memcpy_async(p_h, p_d, sz);
501}
502
503inline void
504dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
505{
506 if (sz == 0) { return; }
507 dtod_memcpy_async(p_d_dst, p_d_src, sz);
509}
510
511#ifdef AMREX_USE_HYPRE
512void hypreSynchronize ();
513#endif
514
//! Asynchronously copy nbytes from host memory into a device-global
//! variable, starting at byte `offset` of the destination. In CPU builds
//! this is a plain memcpy into the object itself.
//! \param dg     destination device-global symbol (plain object on CPU)
//! \param src    host source pointer
//! \param nbytes number of bytes to copy
//! \param offset byte offset into the destination
//! NOTE: the closing `Device::gpuStream()));` lines of the CUDA/HIP calls
//! (internal 525 and 529) were lost in the page extraction; restored.
template <typename T>
void memcpy_from_host_to_device_global_async (T& dg, const void* src,
                                              std::size_t nbytes,
                                              std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
    AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset,
                                                 cudaMemcpyHostToDevice,
                                                 Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
    AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset,
                                               hipMemcpyHostToDevice,
                                               Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
    Device::streamQueue().memcpy(dg, src, nbytes, offset);
#else
    auto* p = (char*)(&dg);
    std::memcpy(p+offset, src, nbytes);
#endif
}
537
//! Asynchronously copy nbytes from a device-global variable (starting at
//! byte `offset`) into host memory. In CPU builds this is a plain memcpy
//! out of the object itself.
//! \param dst    host destination pointer
//! \param dg     source device-global symbol (plain object on CPU)
//! \param nbytes number of bytes to copy
//! \param offset byte offset into the source
//! NOTE: the closing `Device::gpuStream()));` lines of the CUDA/HIP calls
//! (internal 548 and 552) were lost in the page extraction; restored.
template <typename T>
void memcpy_from_device_global_to_host_async (void* dst, T const& dg,
                                              std::size_t nbytes,
                                              std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
    AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
                                                   cudaMemcpyDeviceToHost,
                                                   Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
    AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
                                                 hipMemcpyDeviceToHost,
                                                 Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
    Device::streamQueue().memcpy(dst, dg, nbytes, offset);
#else
    auto const* p = (char const*)(&dg);
    std::memcpy(dst, p+offset, nbytes);
#endif
}
560
561#ifdef AMREX_USE_GPU
562
565
572struct [[nodiscard]] SyncAtExitOnly
573 : public NoSyncRegion
574{
576 if (! inNoSyncRegionPreviously()) {
577 Gpu::streamSynchronizeActive();
578 }
579 }
580};
581
582#else
583
//! CPU build: no GPU streams exist, so this is an empty tag type.
struct [[nodiscard]] SyncAtExitOnly {};
585
586#endif
587
588}
589
590#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:21
#define AMREX_CUDA_SAFE_CALL(call)
Definition AMReX_GpuError.H:73
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
A virtual base class for objects that manage their own dynamic memory allocation.
Definition AMReX_Arena.H:127
Definition AMReX_GpuDevice.H:78
static gpuStream_t setStream(gpuStream_t s) noexcept
Definition AMReX_GpuDevice.cpp:749
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:709
static int numDevicePartners() noexcept
Definition AMReX_GpuDevice.cpp:702
static int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:105
static void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.cpp:935
static void mem_advise_set_readonly(void *p, std::size_t sz)
Definition AMReX_GpuDevice.cpp:1148
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1205
static int managedMemorySupported() noexcept
Definition AMReX_GpuDevice.H:227
static int memoryPoolsSupported() noexcept
Definition AMReX_GpuDevice.H:229
static void setStreamIndex(int idx) noexcept
Definition AMReX_GpuDevice.cpp:728
static gpuStream_t resetStream() noexcept
Definition AMReX_GpuDevice.cpp:741
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:239
static void streamSynchronizeActive() noexcept
Definition AMReX_GpuDevice.cpp:908
static int maxThreadsPerBlock() noexcept
Definition AMReX_GpuDevice.H:196
static int maxBlocksPerGrid(int dir) noexcept
Definition AMReX_GpuDevice.H:198
static int devicePropMinor() noexcept
Definition AMReX_GpuDevice.H:204
static void streamOrderedFreeAsync(Arena *arena, void *mem, gpuStream_t stream) noexcept
Definition AMReX_GpuDevice.cpp:952
static int numMultiProcessors() noexcept
Definition AMReX_GpuDevice.H:194
static void Finalize()
Definition AMReX_GpuDevice.cpp:462
static void synchronize() noexcept
Definition AMReX_GpuDevice.cpp:843
static void mem_advise_set_preferred(void *p, std::size_t sz, int device)
Definition AMReX_GpuDevice.cpp:1116
static std::string deviceName() noexcept
Definition AMReX_GpuDevice.H:199
static cudaStream_t cudaStream() noexcept
Definition AMReX_GpuDevice.H:97
static void setNumThreadsMin(int nx, int ny, int nz) noexcept
Definition AMReX_GpuDevice.cpp:1182
static void streamSynchronize() noexcept
Definition AMReX_GpuDevice.cpp:855
static gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:86
static std::string deviceVendor() noexcept
Definition AMReX_GpuDevice.H:207
static void profilerStop()
Definition AMReX_GpuDevice.cpp:1377
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1190
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
static std::size_t sharedMemPerBlock() noexcept
Definition AMReX_GpuDevice.H:193
static std::size_t freeMemAvailable()
Definition AMReX_GpuDevice.cpp:1344
static void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.cpp:920
static void profilerStart()
Definition AMReX_GpuDevice.cpp:1366
static int maxThreadsPerBlock(int dir) noexcept
Definition AMReX_GpuDevice.H:197
static int deviceId() noexcept
Definition AMReX_GpuDevice.cpp:691
static int maxThreadsPerMultiProcessor() noexcept
Definition AMReX_GpuDevice.H:195
static int numDevicesUsed() noexcept
Definition AMReX_GpuDevice.cpp:697
static void resetStreamIndex() noexcept
Definition AMReX_GpuDevice.H:114
static void setExternalStream(gpuStream_t s)
Definition AMReX_GpuDevice.cpp:769
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:836
static void Initialize(bool minimal, int a_device_id)
Definition AMReX_GpuDevice.cpp:225
static bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.cpp:976
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1197
static std::size_t totalGlobalMem() noexcept
Definition AMReX_GpuDevice.H:192
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1263
static void resetExternalStream(ExternalStreamSync sync_stream=ExternalStreamSync::Yes) noexcept
Definition AMReX_GpuDevice.cpp:813
static int devicePropMajor() noexcept
Definition AMReX_GpuDevice.H:203
Definition AMReX_GpuDevice.H:64
std::size_t wait_list_size()
Definition AMReX_GpuDevice.cpp:216
void sync()
Definition AMReX_GpuDevice.cpp:159
void free_async(Arena *arena, void *mem)
Definition AMReX_GpuDevice.cpp:196
gpuStream_t & getStream()
Definition AMReX_GpuDevice.cpp:149
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
Definition AMReX_BaseFwd.H:55
void dtod_memcpy_async(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:449
void synchronize() noexcept
Definition AMReX_GpuDevice.H:304
void streamSynchronizeActive() noexcept
Definition AMReX_GpuDevice.H:324
bool clearFreeAsyncBuffer() noexcept
Definition AMReX_GpuDevice.H:354
void freeAsync(Arena *arena, void *mem) noexcept
Definition AMReX_GpuDevice.H:345
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:435
ExternalStreamSync
Definition AMReX_GpuDevice.H:57
void streamSynchronizeAll() noexcept
Definition AMReX_GpuDevice.H:330
bool inSingleStreamRegion() noexcept
Definition AMReX_GpuControl.H:151
void memcpy_from_device_global_to_host_async(void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:541
int numGpuStreams() noexcept
Definition AMReX_GpuDevice.H:298
void dtoh_memcpy(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:496
void htod_memcpy(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:488
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:421
void setExternalGpuStream(gpuStream_t stream)
Provide a user-supplied GPU stream.
Definition AMReX_GpuDevice.H:369
void dtod_memcpy(void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:504
void memcpy_from_host_to_device_global_async(T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
Definition AMReX_GpuDevice.H:518
bool isStreamActive(gpuStream_t s)
Check if the given stream is active.
Definition AMReX_GpuDevice.cpp:1400
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
void resetExternalGpuStream(ExternalStreamSync sync_stream=ExternalStreamSync::Yes) noexcept
Pop the current user-supplied GPU stream.
Definition AMReX_GpuDevice.H:385
constexpr int get_thread_num()
Definition AMReX_OpenMP.H:37
Definition AMReX_Amr.cpp:49
cudaDeviceProp gpuDeviceProp_t
Definition AMReX_GpuDevice.H:28
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
Definition AMReX_GpuDevice.H:395
ExternalGpuStreamRegion(gpuStream_t stream, ExternalStreamSync sync_on_exit=ExternalStreamSync::Yes) noexcept
Definition AMReX_GpuDevice.H:396
ExternalGpuStreamRegion(ExternalGpuStreamRegion &&rhs) noexcept=delete
ExternalGpuStreamRegion(ExternalGpuStreamRegion const &)=delete
~ExternalGpuStreamRegion()
Definition AMReX_GpuDevice.H:408
Definition AMReX_GpuControl.H:184
Definition AMReX_GpuDevice.H:574
~SyncAtExitOnly()
Definition AMReX_GpuDevice.H:575