Namespaces
| namespace | Atomic |
| namespace | detail |
| namespace | range_detail |
Typedefs
| template<class T > | |
| using | DeviceVector = PODVector< T, ArenaAllocator< T > > |
| A PODVector that uses the standard memory Arena. Note that the memory might or might not be managed depending on the amrex.the_arena_is_managed ParmParse parameter. | |
| template<class T > | |
| using | NonManagedDeviceVector = PODVector< T, DeviceArenaAllocator< T > > |
| A PODVector that uses the non-managed device memory arena. | |
| template<class T > | |
| using | ManagedVector = PODVector< T, ManagedArenaAllocator< T > > |
| A PODVector that uses the managed memory arena. | |
| template<class T > | |
| using | PinnedVector = PODVector< T, PinnedArenaAllocator< T > > |
| A PODVector that uses the pinned memory arena. | |
| template<class T > | |
| using | AsyncVector = PODVector< T, AsyncArenaAllocator< T > > |
| A PODVector that uses the async memory arena. May be useful for temporary vectors inside MFIters that are accessed on the device. | |
| template<class T > | |
| using | HostVector = PinnedVector< T > |
| A PODVector that uses pinned host memory. Same as PinnedVector. For a vector class that uses std::allocator by default, see amrex::Vector. | |
| template<class T > | |
| using | ManagedDeviceVector = PODVector< T, ManagedArenaAllocator< T > > |
| This is identical to ManagedVector<T>. The ManagedDeviceVector form is deprecated and will be removed in a future release. | |
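A minimal usage sketch for these container aliases (illustrative only; assumes a GPU-enabled AMReX build, an initialized AMReX session, and that the aliases are available through AMReX_GpuContainers.H):

    #include <AMReX_GpuContainers.H>
    #include <algorithm>

    void stage_and_upload ()   // hypothetical helper, called after amrex::Initialize
    {
        amrex::Gpu::HostVector<amrex::Real> hv(256);          // pinned host memory
        std::fill(hv.begin(), hv.end(), amrex::Real(1.0));
        amrex::Gpu::DeviceVector<amrex::Real> dv(hv.size());  // arena (device) memory
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, hv.begin(), hv.end(), dv.begin());
        amrex::Gpu::streamSynchronize();  // wait before hv goes out of scope
    }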
Functions
| template<class InIter , class OutIter > | |
| void | copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class Iter > | |
| void | prefetchToHost (Iter begin, Iter end) noexcept |
| Migrate elements of a container from device to host. This is a no-op for host-only code. | |
| template<class Iter > | |
| void | prefetchToDevice (Iter begin, Iter end) noexcept |
| Migrate elements of a container from host to device. This is a no-op for host-only code. | |
| template<typename IT , typename F , typename T = typename std::iterator_traits<IT>::value_type, std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v<T> && amrex::IsCallable<F, T&, Long>::value, int> FOO = 0> | |
| void | fillAsync (IT first, IT last, F const &f) noexcept |
| Fill the elements in the given range using the given callable. | |
| bool | inLaunchRegion () noexcept |
| bool | notInLaunchRegion () noexcept |
| bool | setLaunchRegion (bool launch) noexcept |
| bool | inGraphRegion () |
| bool | notInGraphRegion () |
| bool | setGraphRegion (bool graph) |
| bool | inSingleStreamRegion () noexcept |
| bool | inNoSyncRegion () noexcept |
| bool | setSingleStreamRegion (bool b) noexcept |
| bool | setNoSyncRegion (bool b) noexcept |
| gpuStream_t | gpuStream () noexcept |
| int | numGpuStreams () noexcept |
| void | synchronize () noexcept |
| void | streamSynchronize () noexcept |
| void | streamSynchronizeAll () noexcept |
| void | freeAsync (Arena *arena, void *mem) noexcept |
| void | htod_memcpy_async (void *p_d, const void *p_h, const std::size_t sz) noexcept |
| void | dtoh_memcpy_async (void *p_h, const void *p_d, const std::size_t sz) noexcept |
| void | dtod_memcpy_async (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept |
| void | htod_memcpy (void *p_d, const void *p_h, const std::size_t sz) noexcept |
| void | dtoh_memcpy (void *p_h, const void *p_d, const std::size_t sz) noexcept |
| void | dtod_memcpy (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept |
| template<typename T > | |
| void | memcpy_from_host_to_device_global_async (T &dg, const void *src, std::size_t nbytes, std::size_t offset=0) |
| template<typename T > | |
| void | memcpy_from_device_global_to_host_async (void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0) |
| void | ErrorCheck (const char *file, int line) noexcept |
| constexpr std::size_t | numThreadsPerBlockParallelFor () |
| __host__ __device__ Box | getThreadBox (const Box &bx, Long offset) noexcept |
| template<int MT> | |
| ExecutionConfig | makeExecutionConfig (Long N) noexcept |
| template<int MT> | |
| ExecutionConfig | makeExecutionConfig (const Box &box) noexcept |
| template<int MT> | |
| Vector< ExecConfig > | makeNExecutionConfigs (Long N) noexcept |
| template<int MT, int dim> | |
| Vector< ExecConfig > | makeNExecutionConfigs (BoxND< dim > const &box) noexcept |
| template<typename T > | |
| __host__ __device__ range_detail::range_impl< T > | Range (T const &b) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceSum (T *dest, T source, Gpu::Handler const &h) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMin (T *dest, T source, Gpu::Handler const &h) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMax (T *dest, T source, Gpu::Handler const &h) noexcept |
| __device__ void | deviceReduceLogicalAnd (int *dest, int source, Gpu::Handler const &h) noexcept |
| __device__ void | deviceReduceLogicalOr (int *dest, int source, Gpu::Handler const &h) noexcept |
| template<int warpSize, typename T , typename WARPREDUCE > | |
| __device__ T | blockReduce (T x, WARPREDUCE &&warp_reduce, T x0) |
| template<int warpSize, typename T , typename WARPREDUCE , typename ATOMICOP > | |
| __device__ void | blockReduce_partial (T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler) |
| template<typename T > | |
| __device__ T | blockReduceSum (T source) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceSum_full (T *dest, T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ T | blockReduceSum (T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ void | deviceReduceSum_full (T *dest, T source) noexcept |
| template<typename T > | |
| __device__ T | blockReduceMin (T source) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMin_full (T *dest, T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ T | blockReduceMin (T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ void | deviceReduceMin_full (T *dest, T source) noexcept |
| template<typename T > | |
| __device__ T | blockReduceMax (T source) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMax_full (T *dest, T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ T | blockReduceMax (T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ void | deviceReduceMax_full (T *dest, T source) noexcept |
| template<typename T > | |
| __host__ __device__ T | LDG (Array4< T > const &a, int i, int j, int k) noexcept |
| template<typename T > | |
| __host__ __device__ T | LDG (Array4< T > const &a, int i, int j, int k, int n) noexcept |
| bool | isManaged (void const *p) noexcept |
| bool | isDevicePtr (void const *p) noexcept |
| bool | isPinnedPtr (void const *p) noexcept |
| bool | isGpuPtr (void const *p) noexcept |
| template<class T > | |
| __host__ __device__ bool | isnan (T m) noexcept |
| template<class T > | |
| __host__ __device__ bool | isinf (T m) noexcept |
| __host__ __device__ void * | memcpy (void *dest, const void *src, std::size_t count) |
| template<class InIter , class OutIter > | |
| OutIter | inclusive_scan (InIter begin, InIter end, OutIter result) |
| template<class InIter , class OutIter > | |
| OutIter | exclusive_scan (InIter begin, InIter end, OutIter result) |
Variables
| static constexpr HostToDevice | hostToDevice {} |
| static constexpr DeviceToHost | deviceToHost {} |
| static constexpr DeviceToDevice | deviceToDevice {} |
| bool | in_launch_region = true |
| bool | in_graph_region = false |
| bool | in_single_stream_region = false |
| bool | in_nosync_region = false |
Function Documentation
template<class InIter , class OutIter >
void amrex::Gpu::copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is blocking - CPU execution will halt until the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
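A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Duplicate one device vector into another; blocks until the copy is done.
    amrex::Gpu::DeviceVector<amrex::Real> src(128), dst(128);
    amrex::Gpu::copy(amrex::Gpu::deviceToDevice, src.begin(), src.end(), dst.begin());
    // dst now holds a copy of src; no further synchronization is required.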
template<class InIter , class OutIter >
void amrex::Gpu::copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is blocking - CPU execution will halt until the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
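A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Bring device results back to the host; blocks until the data is available.
    amrex::Gpu::DeviceVector<amrex::Real> d_data(128);
    amrex::Gpu::HostVector<amrex::Real> h_data(d_data.size());
    amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_data.begin(), d_data.end(), h_data.begin());
    // h_data can be read immediately after this call returns.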
template<class InIter , class OutIter >
void amrex::Gpu::copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is blocking - CPU execution will halt until the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
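A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Upload host data to the device; blocks until the transfer completes.
    amrex::Gpu::HostVector<amrex::Real> h_data(128);
    for (auto& x : h_data) { x = amrex::Real(2.0); }
    amrex::Gpu::DeviceVector<amrex::Real> d_data(h_data.size());
    amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_data.begin(), h_data.end(), d_data.begin());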
template<class InIter , class OutIter >
void amrex::Gpu::copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is asynchronous - CPU execution will continue, whether or not the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
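A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Enqueue a device-to-device copy on the current stream and keep working.
    amrex::Gpu::DeviceVector<amrex::Real> src(128), dst(128);
    amrex::Gpu::copyAsync(amrex::Gpu::deviceToDevice, src.begin(), src.end(), dst.begin());
    // Kernels launched later on the same stream see the copied data;
    // call streamSynchronize() before the host depends on completion.
    amrex::Gpu::streamSynchronize();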
template<class InIter , class OutIter >
void amrex::Gpu::copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is asynchronous - CPU execution will continue, whether or not the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
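A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Enqueue a download; synchronize before reading the host buffer.
    amrex::Gpu::DeviceVector<amrex::Real> d_data(128);
    amrex::Gpu::HostVector<amrex::Real> h_data(d_data.size());
    amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, d_data.begin(), d_data.end(), h_data.begin());
    amrex::Gpu::streamSynchronize();  // h_data is valid only after this returns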
template<class InIter , class OutIter >
void amrex::Gpu::copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is asynchronous - CPU execution will continue, whether or not the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
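A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Enqueue an upload; the host buffer must stay alive until the copy finishes.
    amrex::Gpu::HostVector<amrex::Real> h_data(128);
    amrex::Gpu::DeviceVector<amrex::Real> d_data(h_data.size());
    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, h_data.begin(), h_data.end(), d_data.begin());
    amrex::Gpu::streamSynchronize();  // or keep h_data in scope until a later sync point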
template<class InIter , class OutIter >
OutIter amrex::Gpu::exclusive_scan (InIter begin, InIter end, OutIter result)
template<typename IT , typename F , typename T = typename std::iterator_traits<IT>::value_type, std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v<T> && amrex::IsCallable<F, T&, Long>::value, int> FOO = 0>
void amrex::Gpu::fillAsync (IT first, IT last, F const &f) noexcept
Fill the elements in the given range using the given callable.
This function is asynchronous for GPU builds.
| IT | the iterator type |
| F | the callable type |
| first | the inclusive first in the range [first, last) |
| last | the exclusive last in the range [first, last) |
| f | the callable with the function signature of void(T&, Long), where T is the element type and the Long parameter is the index for the element to be filled. |
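A minimal sketch of filling a device vector with index-dependent values (illustrative names; assumes a GPU build, where the callable must be usable on the device):

    amrex::Gpu::DeviceVector<amrex::Real> v(1000);
    const amrex::Real dx = 0.01;
    amrex::Gpu::fillAsync(v.begin(), v.end(),
        [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) noexcept
        {
            x = amrex::Real(i) * dx;   // element i receives i*dx
        });
    amrex::Gpu::streamSynchronize();   // only needed if the host uses v right away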
void amrex::Gpu::freeAsync (Arena *arena, void *mem) noexcept
Deallocate memory belonging to an arena asynchronously. Memory deallocated in this way is held in a pool and will not be reused until the next amrex::Gpu::streamSynchronize(). GPU kernels that were already launched on the currently active stream can still continue to use the memory after this function is called. There is no need to use this function for CPU-only memory or with The_Async_Arena.
| [in] | arena | the arena the memory belongs to |
| [in] | mem | pointer to the memory to be freed |
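A minimal sketch (illustrative; assumes amrex::The_Arena() is the arena the memory came from):

    std::size_t nbytes = 1024;
    void* p = amrex::The_Arena()->alloc(nbytes);
    // ... launch kernels on the current stream that read or write p ...
    amrex::Gpu::freeAsync(amrex::The_Arena(), p);
    // Already-launched kernels may still use p; the memory is recycled only
    // after the next amrex::Gpu::streamSynchronize().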
template<class InIter , class OutIter >
OutIter amrex::Gpu::inclusive_scan (InIter begin, InIter end, OutIter result)
template<typename T >
void amrex::Gpu::memcpy_from_device_global_to_host_async (void *dst, T const &dg, std::size_t nbytes, std::size_t offset = 0)
Copy nbytes bytes from device global variable to host. offset is the offset in bytes from the start of the device global variable.
template<typename T >
void amrex::Gpu::memcpy_from_host_to_device_global_async (T &dg, const void *src, std::size_t nbytes, std::size_t offset = 0)
Copy nbytes bytes from host to device global variable. offset is the offset in bytes from the start of the device global variable.
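A minimal sketch of moving data to and from a device global variable (illustrative only; the declaration below uses CUDA/HIP syntax, and the portable declaration in an application may differ):

    __device__ amrex::Real d_coeffs[4];   // hypothetical device global

    void update_coeffs ()   // hypothetical host-side helper
    {
        amrex::Real h_coeffs[4] = {1., 2., 3., 4.};
        amrex::Gpu::memcpy_from_host_to_device_global_async(d_coeffs, h_coeffs, sizeof(h_coeffs));
        // ... kernels that read d_coeffs ...
        amrex::Real h_check[4];
        amrex::Gpu::memcpy_from_device_global_to_host_async(h_check, d_coeffs, sizeof(h_check));
        amrex::Gpu::streamSynchronize();  // both transfers are asynchronous
    }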
template<class Iter >
void amrex::Gpu::prefetchToDevice (Iter begin, Iter end) noexcept
Migrate elements of a container from host to device. This is a no-op for host-only code.
This version is blocking - CPU execution will halt until the migration is finished.
template<class Iter >
void amrex::Gpu::prefetchToHost (Iter begin, Iter end) noexcept
Migrate elements of a container from device to host. This is a no-op for host-only code.
This version is blocking - CPU execution will halt until the migration is finished.
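A minimal sketch covering both prefetch directions (illustrative; only meaningful with managed memory in a GPU build):

    amrex::Gpu::ManagedVector<amrex::Real> mv(1 << 20);
    amrex::Gpu::prefetchToDevice(mv.begin(), mv.end());  // before device kernels touch mv
    // ... device work on mv ...
    amrex::Gpu::prefetchToHost(mv.begin(), mv.end());    // before host loops read mv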
bool amrex::Gpu::setLaunchRegion (bool launch) noexcept
Enable/disable GPU kernel launches.
Note that amrex::ParallelFor is unaffected by this setting, so it should not be used for comparing GPU to non-GPU performance or behavior. Pairing a call that passes false with a later call that restores the previous value disables the launching of GPU kernels between the two calls, as sketched below.
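A minimal sketch of that pairing (assumes, as the signature suggests, that the returned bool is the previous setting):

    bool launch_prev = amrex::Gpu::setLaunchRegion(false);
    // ... launch-region-aware code here runs on the host even in GPU builds;
    //     amrex::ParallelFor is unaffected ...
    amrex::Gpu::setLaunchRegion(launch_prev);   // restore the previous setting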