Namespaces

    Atomic
    detail
    range_detail
Typedefs

template<class T>
using DeviceVector = PODVector<T, ArenaAllocator<T>>
    A PODVector that uses the standard memory Arena. Note that the memory might or might not be managed, depending on the amrex.the_arena_is_managed ParmParse parameter.

template<class T>
using NonManagedDeviceVector = PODVector<T, DeviceArenaAllocator<T>>
    A PODVector that uses the non-managed device memory arena.

template<class T>
using ManagedVector = PODVector<T, ManagedArenaAllocator<T>>
    A PODVector that uses the managed memory arena.

template<class T>
using PinnedVector = PODVector<T, PinnedArenaAllocator<T>>
    A PODVector that uses the pinned memory arena.

template<class T>
using AsyncVector = PODVector<T, AsyncArenaAllocator<T>>
    A PODVector that uses the async memory arena. May be useful for temporary vectors inside MFIters that are accessed on the device.

template<class T>
using HostVector = PinnedVector<T>
    A PODVector that uses pinned host memory. Same as PinnedVector. For a vector class that uses std::allocator by default, see amrex::Vector.

template<class T>
using ManagedDeviceVector = PODVector<T, ManagedArenaAllocator<T>>
    This is identical to ManagedVector<T>. The ManagedDeviceVector form is deprecated and will be removed in a future release.
Functions

template<class InIter, class OutIter>
void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

template<class InIter, class OutIter>
void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

template<class InIter, class OutIter>
void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

template<class InIter, class OutIter>
void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
    A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

template<class InIter, class OutIter>
void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
    A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

template<class InIter, class OutIter>
void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
    A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

template<class Iter>
void prefetchToHost (Iter begin, Iter end) noexcept
    Migrate elements of a container from device to host. This is a no-op for host-only code.

template<class Iter>
void prefetchToDevice (Iter begin, Iter end) noexcept
    Migrate elements of a container from host to device. This is a no-op for host-only code.

template<typename IT, typename F, typename T = typename std::iterator_traits<IT>::value_type,
         std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v<T> && amrex::IsCallable<F, T&, Long>::value, int> FOO = 0>
void fillAsync (IT first, IT last, F const &f) noexcept
    Fill the elements in the given range using the given callable.
bool inLaunchRegion () noexcept
bool notInLaunchRegion () noexcept
bool setLaunchRegion (bool launch) noexcept
bool inGraphRegion ()
bool notInGraphRegion ()
bool setGraphRegion (bool graph)
bool inSingleStreamRegion () noexcept
bool inNoSyncRegion () noexcept
bool setSingleStreamRegion (bool b) noexcept
bool setNoSyncRegion (bool b) noexcept
gpuStream_t gpuStream () noexcept
int numGpuStreams () noexcept
void synchronize () noexcept
void streamSynchronize () noexcept
void streamSynchronizeAll () noexcept
void htod_memcpy_async (void *p_d, const void *p_h, const std::size_t sz) noexcept
void dtoh_memcpy_async (void *p_h, const void *p_d, const std::size_t sz) noexcept
void dtod_memcpy_async (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
void htod_memcpy (void *p_d, const void *p_h, const std::size_t sz) noexcept
void dtoh_memcpy (void *p_h, const void *p_d, const std::size_t sz) noexcept
void dtod_memcpy (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept

template<typename T>
void memcpy_from_host_to_device_global_async (T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)

template<typename T>
void memcpy_from_device_global_to_host_async (void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)

void ErrorCheck (const char *file, int line) noexcept
constexpr std::size_t numThreadsPerBlockParallelFor ()
AMREX_GPU_HOST_DEVICE Box getThreadBox (const Box &bx, Long offset) noexcept

template<int MT>
ExecutionConfig makeExecutionConfig (Long N) noexcept

template<int MT>
ExecutionConfig makeExecutionConfig (const Box &box) noexcept

template<int MT>
Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept

template<int MT, int dim>
Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const &box) noexcept

template<typename T>
AMREX_GPU_HOST_DEVICE range_detail::range_impl<T> Range (T const &b) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceSum (T *dest, T source, Gpu::Handler const &h) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMin (T *dest, T source, Gpu::Handler const &h) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMax (T *dest, T source, Gpu::Handler const &h) noexcept

AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceLogicalAnd (int *dest, int source, Gpu::Handler const &h) noexcept
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceLogicalOr (int *dest, int source, Gpu::Handler const &h) noexcept
template<int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduce (T x, WARPREDUCE &&warp_reduce, T x0)

template<int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void blockReduce_partial (T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler)

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceSum (T source) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceSum_full (T *dest, T source) noexcept

template<int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceSum (T source) noexcept

template<int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceSum_full (T *dest, T source) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMin (T source) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMin_full (T *dest, T source) noexcept

template<int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMin (T source) noexcept

template<int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMin_full (T *dest, T source) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMax (T source) noexcept

template<typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMax_full (T *dest, T source) noexcept

template<int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMax (T source) noexcept

template<int BLOCKDIMX, typename T>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMax_full (T *dest, T source) noexcept

template<typename T>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4<T> const &a, int i, int j, int k) noexcept

template<typename T>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4<T> const &a, int i, int j, int k, int n) noexcept

bool isManaged (void const *p) noexcept
bool isDevicePtr (void const *p) noexcept
bool isPinnedPtr (void const *p) noexcept
bool isGpuPtr (void const *p) noexcept

template<class T>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool isnan (T m) noexcept

template<class T>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool isinf (T m) noexcept

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void * memcpy (void *dest, const void *src, std::size_t count)

template<class InIter, class OutIter>
OutIter inclusive_scan (InIter begin, InIter end, OutIter result)

template<class InIter, class OutIter>
OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
Variables

static constexpr HostToDevice hostToDevice {}
static constexpr DeviceToHost deviceToHost {}
static constexpr DeviceToDevice deviceToDevice {}
bool in_launch_region = true
bool in_graph_region = false
bool in_single_stream_region = false
bool in_nosync_region = false
Typedef Documentation

using amrex::Gpu::AsyncVector = PODVector<T, AsyncArenaAllocator<T>>
    A PODVector that uses the async memory arena. May be useful for temporary vectors inside MFIters that are accessed on the device.

using amrex::Gpu::DeviceVector = PODVector<T, ArenaAllocator<T>>
    A PODVector that uses the standard memory Arena. Note that the memory might or might not be managed, depending on the amrex.the_arena_is_managed ParmParse parameter.

using amrex::Gpu::HostVector = PinnedVector<T>
    A PODVector that uses pinned host memory. Same as PinnedVector. For a vector class that uses std::allocator by default, see amrex::Vector.

using amrex::Gpu::ManagedDeviceVector = PODVector<T, ManagedArenaAllocator<T>>
    This is identical to ManagedVector<T>. The ManagedDeviceVector form is deprecated and will be removed in a future release.

using amrex::Gpu::ManagedVector = PODVector<T, ManagedArenaAllocator<T>>
    A PODVector that uses the managed memory arena.

using amrex::Gpu::NonManagedDeviceVector = PODVector<T, DeviceArenaAllocator<T>>
    A PODVector that uses the non-managed device memory arena.

using amrex::Gpu::PinnedVector = PODVector<T, PinnedArenaAllocator<T>>
    A PODVector that uses the pinned memory arena.
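These containers are typically paired with the copy routines documented below. A minimal sketch (GPU build assumed; the header, sizes, and values are illustrative):

    #include <AMReX_Gpu.H>

    void vector_example ()
    {
        const int n = 100;
        amrex::Gpu::HostVector<amrex::Real> hv(n);     // pinned host memory
        amrex::Gpu::DeviceVector<amrex::Real> dv(n);   // memory from the device Arena
        for (int i = 0; i < n; ++i) { hv[i] = amrex::Real(i); }

        // Asynchronous host-to-device copy on the current GPU stream.
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, hv.begin(), hv.end(), dv.begin());
        amrex::Gpu::streamSynchronize(); // wait before hv goes out of scope or dv is read back
    }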
Function Documentation

template<int warpSize, typename T, typename WARPREDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduce (T x, WARPREDUCE && warp_reduce, T x0)

template<int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::blockReduce_partial (T * dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, Gpu::Handler const & handler)
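These block- and device-level reductions are meant to be called from inside GPU kernels that receive a Gpu::Handler. A rough sketch using deviceReduceSum, assuming the amrex::ParallelFor overload that takes a Gpu::KernelInfo with setReduction(true) and passes a Gpu::Handler to the lambda; dp is an illustrative device pointer holding the running total:

    #include <AMReX_Gpu.H>

    void sum_box (amrex::Box const& bx, amrex::Array4<amrex::Real const> const& fab, amrex::Real* dp)
    {
        amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, amrex::Gpu::Handler const& handler) noexcept
        {
            // Accumulate each cell's value into *dp; the handler coordinates the block-level reduction.
            amrex::Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
        });
    }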
template<class InIter, class OutIter>
void amrex::Gpu::copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept

A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is blocking - CPU execution will halt until the copy is finished.

Template Parameters
    InIter    The input iterator type
    OutIter   The output iterator type
Parameters
    DeviceToDevice   A tag indicating that the copy is from the device to the device
    begin     Where in the input to start reading
    end       Where in the input to stop reading
    result    Where in the output to start writing

Example usage:
    Gpu::copy(Gpu::deviceToDevice, a.begin(), a.end(), b.begin());
template<class InIter, class OutIter>
void amrex::Gpu::copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept

A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is blocking - CPU execution will halt until the copy is finished.

Template Parameters
    InIter    The input iterator type
    OutIter   The output iterator type
Parameters
    DeviceToHost   A tag indicating that the copy is from the device to the host
    begin     Where in the input to start reading
    end       Where in the input to stop reading
    result    Where in the output to start writing

Example usage:
    Gpu::copy(Gpu::deviceToHost, a.begin(), a.end(), b.begin());
template<class InIter, class OutIter>
void amrex::Gpu::copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept

A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is blocking - CPU execution will halt until the copy is finished.

Template Parameters
    InIter    The input iterator type
    OutIter   The output iterator type
Parameters
    HostToDevice   A tag indicating that the copy is from the host to the device
    begin     Where in the input to start reading
    end       Where in the input to stop reading
    result    Where in the output to start writing

Example usage:
    Gpu::copy(Gpu::hostToDevice, a.begin(), a.end(), b.begin());
template<class InIter, class OutIter>
void amrex::Gpu::copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept

A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is asynchronous - CPU execution will continue, whether or not the copy is finished.

Template Parameters
    InIter    The input iterator type
    OutIter   The output iterator type
Parameters
    DeviceToDevice   A tag indicating that the copy is from the device to the device
    begin     Where in the input to start reading
    end       Where in the input to stop reading
    result    Where in the output to start writing

Example usage:
    Gpu::copyAsync(Gpu::deviceToDevice, a.begin(), a.end(), b.begin());
template<class InIter, class OutIter>
void amrex::Gpu::copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept

A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is asynchronous - CPU execution will continue, whether or not the copy is finished.

Template Parameters
    InIter    The input iterator type
    OutIter   The output iterator type
Parameters
    DeviceToHost   A tag indicating that the copy is from the device to the host
    begin     Where in the input to start reading
    end       Where in the input to stop reading
    result    Where in the output to start writing

Example usage:
    Gpu::copyAsync(Gpu::deviceToHost, a.begin(), a.end(), b.begin());
template<class InIter, class OutIter>
void amrex::Gpu::copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept

A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is asynchronous - CPU execution will continue, whether or not the copy is finished.

Template Parameters
    InIter    The input iterator type
    OutIter   The output iterator type
Parameters
    HostToDevice   A tag indicating that the copy is from the host to the device
    begin     Where in the input to start reading
    end       Where in the input to stop reading
    result    Where in the output to start writing

Example usage:
    Gpu::copyAsync(Gpu::hostToDevice, a.begin(), a.end(), b.begin());
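When results are copied back asynchronously, synchronize the stream before touching the host data. A short sketch (the function name is illustrative, and hv is assumed to be at least as large as dv):

    void read_back (amrex::Gpu::DeviceVector<amrex::Real> const& dv,
                    amrex::Gpu::PinnedVector<amrex::Real>& hv)
    {
        amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, dv.begin(), dv.end(), hv.begin());
        amrex::Gpu::streamSynchronize();   // hv is now safe to read on the host
    }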
template<class InIter, class OutIter>
OutIter amrex::Gpu::exclusive_scan (InIter begin, InIter end, OutIter result)
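No further description is given on this page; as a sketch, it behaves like std::exclusive_scan over a contiguous device range (the function and variable names are illustrative):

    void scan_offsets (amrex::Gpu::DeviceVector<int> const& counts,
                       amrex::Gpu::DeviceVector<int>& offsets)
    {
        // offsets[i] = counts[0] + ... + counts[i-1], with offsets[0] = 0
        amrex::Gpu::exclusive_scan(counts.begin(), counts.end(), offsets.begin());
    }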
template<typename IT, typename F, typename T = typename std::iterator_traits<IT>::value_type,
         std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v<T> && amrex::IsCallable<F, T&, Long>::value, int> FOO = 0>
void amrex::Gpu::fillAsync (IT first, IT last, F const & f) noexcept

Fill the elements in the given range using the given callable.

This function is asynchronous for GPU builds.

Template Parameters
    IT   the iterator type
    F    the callable type
Parameters
    first   the inclusive first in the range [first, last)
    last    the exclusive last in the range [first, last)
    f       the callable with the function signature of void(T&, Long), where T is the element type and the Long parameter is the index for the element to be filled.
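A sketch of filling a device vector with index-dependent values (the function name is illustrative; since the fill is asynchronous, synchronize before the data are consumed):

    void fill_indices (amrex::Gpu::DeviceVector<amrex::Real>& v)
    {
        amrex::Gpu::fillAsync(v.begin(), v.end(),
            [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long idx) { x = amrex::Real(idx); });
        amrex::Gpu::streamSynchronize();
    }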
template<class InIter, class OutIter>
OutIter amrex::Gpu::inclusive_scan (InIter begin, InIter end, OutIter result)
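Likewise for the inclusive variant, a sketch with illustrative names:

    void running_totals (amrex::Gpu::DeviceVector<int> const& counts,
                         amrex::Gpu::DeviceVector<int>& totals)
    {
        // totals[i] = counts[0] + ... + counts[i]
        amrex::Gpu::inclusive_scan(counts.begin(), counts.end(), totals.begin());
    }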
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void * amrex::Gpu::memcpy (void * dest, const void * src, std::size_t count)
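This overload is callable on both host and device. A sketch of using it inside a device kernel to copy raw bytes element by element (the function name and pointers are illustrative; dst and src are device pointers):

    void copy_elements (amrex::Real* dst, amrex::Real const* src, int n)
    {
        amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
        {
            // Byte-wise copy of one element on the device.
            amrex::Gpu::memcpy(dst + i, src + i, sizeof(amrex::Real));
        });
    }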
template<typename T>
void amrex::Gpu::memcpy_from_device_global_to_host_async (void * dst, T const & dg, std::size_t nbytes, std::size_t offset = 0)

Copy nbytes bytes from device global variable to host. offset is the offset in bytes from the start of the device global variable.
template<typename T>
void amrex::Gpu::memcpy_from_host_to_device_global_async (T & dg, const void * src, std::size_t nbytes, std::size_t offset = 0)

Copy nbytes bytes from host to device global variable. offset is the offset in bytes from the start of the device global variable.
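A heavily simplified sketch of round-tripping a value through a device global variable. Only the two functions above are from this page; the __device__ declaration and the CUDA guard are assumptions for illustration:

    #ifdef AMREX_USE_CUDA
    __device__ amrex::Real d_scale;   // illustrative device global variable

    void set_and_read_scale (amrex::Real h_in, amrex::Real& h_out)
    {
        amrex::Gpu::memcpy_from_host_to_device_global_async(d_scale, &h_in, sizeof(amrex::Real));
        // ... kernels launched here may read d_scale ...
        amrex::Gpu::memcpy_from_device_global_to_host_async(&h_out, d_scale, sizeof(amrex::Real));
        amrex::Gpu::streamSynchronize();   // both transfers are asynchronous
    }
    #endif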
template<class Iter>
void amrex::Gpu::prefetchToDevice (Iter begin, Iter end) noexcept

Migrate elements of a container from host to device. This is a no-op for host-only code.

This version is blocking - CPU execution will halt until the migration is finished.
template<class Iter>
void amrex::Gpu::prefetchToHost (Iter begin, Iter end) noexcept

Migrate elements of a container from device to host. This is a no-op for host-only code.

This version is blocking - CPU execution will halt until the migration is finished.
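A sketch using a managed container, which is the typical case for prefetching (the function name, touch loop, and size are illustrative):

    void prefetch_example ()
    {
        const int n = 1000;
        amrex::Gpu::ManagedVector<amrex::Real> mv(n);
        for (int i = 0; i < n; ++i) { mv[i] = amrex::Real(i); }   // pages resident on the host

        amrex::Gpu::prefetchToDevice(mv.begin(), mv.end());  // migrate pages to the device
        // ... device kernels operate on mv.data() ...
        amrex::Gpu::prefetchToHost(mv.begin(), mv.end());    // migrate back before host access
    }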
bool amrex::Gpu::setLaunchRegion (bool launch) noexcept

Enable/disable GPU kernel launches.

Note that calls such as amrex::ParallelFor will be unaffected. Therefore it should not be used for comparing GPU to non-GPU performance or behavior.

Will disable the launching of GPU kernels between the calls.
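A sketch of temporarily disabling launches around a region of code (this assumes the returned bool is the previous setting, which allows restoring it afterwards):

    bool prev = amrex::Gpu::setLaunchRegion(false);
    // ... code whose GPU kernel launches should be disabled ...
    amrex::Gpu::setLaunchRegion(prev);   // restore the previous setting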
Variable Documentation

bool amrex::Gpu::in_graph_region = false
bool amrex::Gpu::in_launch_region = true
bool amrex::Gpu::in_nosync_region = false
bool amrex::Gpu::in_single_stream_region = false