Block-Structured AMR Software Framework
amrex::Gpu Namespace Reference

Namespaces

 Atomic
 
 detail
 
 range_detail
 

Classes

class  AsyncArray
 
struct  AtomicAdd
 
struct  AtomicMin
 
struct  AtomicMax
 
struct  AtomicLogicalAnd
 
struct  AtomicLogicalOr
 
class  Buffer
 
struct  HostToDevice
 
struct  DeviceToHost
 
struct  DeviceToDevice
 
struct  LaunchSafeGuard
 
struct  GraphSafeGuard
 
struct  SingleStreamRegion
 
struct  NoSyncRegion
 
class  Device
 
class  Elixir
 
class  KernelInfo
 
struct  ComponentBox
 
struct  GridSize
 
struct  ExecutionConfig
 
struct  ExecConfig
 
struct  Managed
 
struct  Pinned
 
struct  Deleter
 
struct  DeviceScalar
 
struct  SharedMemory
 
struct  SharedMemory< double >
 
struct  SharedMemory< float >
 
struct  SharedMemory< long >
 
struct  SharedMemory< long long >
 
struct  SharedMemory< int >
 
struct  SharedMemory< short >
 
struct  SharedMemory< char >
 
struct  SharedMemory< unsigned long >
 
struct  SharedMemory< unsigned long long >
 
struct  SharedMemory< unsigned int >
 
struct  SharedMemory< unsigned short >
 
struct  SharedMemory< unsigned char >
 
struct  SharedMemory< bool >
 
struct  warpReduce
 
struct  Handler
 
struct  StreamItInfo
 
class  StreamIter
 
struct  SharedMemory< openbc::Moments::array_type >
 

Typedefs

template<class T >
using DeviceVector = PODVector< T, ArenaAllocator< T > >
 A PODVector that uses the standard memory Arena. Note that the memory might or might not be managed depending on the amrex.the_arena_is_managed ParmParse parameter. More...
 
template<class T >
using NonManagedDeviceVector = PODVector< T, DeviceArenaAllocator< T > >
 A PODVector that uses the non-managed device memory arena. More...
 
template<class T >
using ManagedVector = PODVector< T, ManagedArenaAllocator< T > >
 A PODVector that uses the managed memory arena. More...
 
template<class T >
using PinnedVector = PODVector< T, PinnedArenaAllocator< T > >
 A PODVector that uses the pinned memory arena. More...
 
template<class T >
using AsyncVector = PODVector< T, AsyncArenaAllocator< T > >
 A PODVector that uses the async memory arena. May be useful for temporary vectors inside MFIters that are accessed on the device. More...
 
template<class T >
using HostVector = PinnedVector< T >
 A PODVector that uses pinned host memory. Same as PinnedVector. For a vector class that uses std::allocator by default, see amrex::Vector. More...
 
template<class T >
using ManagedDeviceVector = PODVector< T, ManagedArenaAllocator< T > >
 This is identical to ManagedVector<T>. The ManagedDeviceVector form is deprecated and will be removed in a future release. More...
 

Functions

template<class InIter , class OutIter >
void copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
 A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. More...
 
template<class InIter , class OutIter >
void copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
 A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. More...
 
template<class InIter , class OutIter >
void copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
 A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. More...
 
template<class InIter , class OutIter >
void copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
 A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. More...
 
template<class InIter , class OutIter >
void copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
 A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. More...
 
template<class InIter , class OutIter >
void copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
 A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. More...
 
template<class Iter >
void prefetchToHost (Iter begin, Iter end) noexcept
 Migrate elements of a container from device to host. This is a no-op for host-only code. More...
 
template<class Iter >
void prefetchToDevice (Iter begin, Iter end) noexcept
 Migrate elements of a container from host to device. This is a no-op for host-only code. More...
 
template<typename IT , typename F , typename T = typename std::iterator_traits<IT>::value_type, std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v< T > && amrex::IsCallable< F, T &, Long >::value, int > FOO = 0>
void fillAsync (IT first, IT last, F const &f) noexcept
 Fill the elements in the given range using the given callable. More...
 
bool inLaunchRegion () noexcept
 
bool notInLaunchRegion () noexcept
 
bool setLaunchRegion (bool launch) noexcept
 
bool inGraphRegion ()
 
bool notInGraphRegion ()
 
bool setGraphRegion (bool graph)
 
bool inSingleStreamRegion () noexcept
 
bool inNoSyncRegion () noexcept
 
bool setSingleStreamRegion (bool b) noexcept
 
bool setNoSyncRegion (bool b) noexcept
 
gpuStream_t gpuStream () noexcept
 
int numGpuStreams () noexcept
 
void synchronize () noexcept
 
void streamSynchronize () noexcept
 
void streamSynchronizeAll () noexcept
 
void htod_memcpy_async (void *p_d, const void *p_h, const std::size_t sz) noexcept
 
void dtoh_memcpy_async (void *p_h, const void *p_d, const std::size_t sz) noexcept
 
void dtod_memcpy_async (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
 
void htod_memcpy (void *p_d, const void *p_h, const std::size_t sz) noexcept
 
void dtoh_memcpy (void *p_h, const void *p_d, const std::size_t sz) noexcept
 
void dtod_memcpy (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept
 
template<typename T >
void memcpy_from_host_to_device_global_async (T &dg, const void *src, std::size_t nbytes, std::size_t offset=0)
 
template<typename T >
void memcpy_from_device_global_to_host_async (void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0)
 
void ErrorCheck (const char *file, int line) noexcept
 
constexpr std::size_t numThreadsPerBlockParallelFor ()
 
AMREX_GPU_HOST_DEVICE Box getThreadBox (const Box &bx, Long offset) noexcept
 
template<int MT>
ExecutionConfig makeExecutionConfig (Long N) noexcept
 
template<int MT>
ExecutionConfig makeExecutionConfig (const Box &box) noexcept
 
template<int MT>
Vector< ExecConfig > makeNExecutionConfigs (Long N) noexcept
 
template<int MT, int dim>
Vector< ExecConfig > makeNExecutionConfigs (BoxND< dim > const &box) noexcept
 
template<typename T >
AMREX_GPU_HOST_DEVICE range_detail::range_impl< T > Range (T const &b) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceSum (T *dest, T source, Gpu::Handler const &h) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMin (T *dest, T source, Gpu::Handler const &h) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMax (T *dest, T source, Gpu::Handler const &h) noexcept
 
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceLogicalAnd (int *dest, int source, Gpu::Handler const &h) noexcept
 
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceLogicalOr (int *dest, int source, Gpu::Handler const &h) noexcept
 
template<int warpSize, typename T , typename WARPREDUCE >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduce (T x, WARPREDUCE &&warp_reduce, T x0)
 
template<int warpSize, typename T , typename WARPREDUCE , typename ATOMICOP >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void blockReduce_partial (T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler)
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceSum (T source) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceSum_full (T *dest, T source) noexcept
 
template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceSum (T source) noexcept
 
template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceSum_full (T *dest, T source) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMin (T source) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMin_full (T *dest, T source) noexcept
 
template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMin (T source) noexcept
 
template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMin_full (T *dest, T source) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMax (T source) noexcept
 
template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMax_full (T *dest, T source) noexcept
 
template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduceMax (T source) noexcept
 
template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void deviceReduceMax_full (T *dest, T source) noexcept
 
template<typename T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4< T > const &a, int i, int j, int k) noexcept
 
template<typename T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4< T > const &a, int i, int j, int k, int n) noexcept
 
bool isManaged (void const *p) noexcept
 
bool isDevicePtr (void const *p) noexcept
 
bool isPinnedPtr (void const *p) noexcept
 
bool isGpuPtr (void const *p) noexcept
 
template<class T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool isnan (T m) noexcept
 
template<class T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool isinf (T m) noexcept
 
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void * memcpy (void *dest, const void *src, std::size_t count)
 
template<class InIter , class OutIter >
OutIter inclusive_scan (InIter begin, InIter end, OutIter result)
 
template<class InIter , class OutIter >
OutIter exclusive_scan (InIter begin, InIter end, OutIter result)
 

Variables

static constexpr HostToDevice hostToDevice {}
 
static constexpr DeviceToHost deviceToHost {}
 
static constexpr DeviceToDevice deviceToDevice {}
 
bool in_launch_region = true
 
bool in_graph_region = false
 
bool in_single_stream_region = false
 
bool in_nosync_region = false
 

Typedef Documentation

◆ AsyncVector

template<class T >
using amrex::Gpu::AsyncVector = PODVector<T, AsyncArenaAllocator<T> >

A PODVector that uses the async memory arena. May be useful for temporary vectors inside MFIters that are accessed on the device.
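
Example usage (a minimal sketch; the MultiFab mf and the temporary length n are illustrative):

for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
    Gpu::AsyncVector<amrex::Real> tmp(n); // device temporary; its memory is released back to the async arena
    // ... launch kernels on this tile that use tmp.data() ...
}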

◆ DeviceVector

template<class T >
using amrex::Gpu::DeviceVector = PODVector<T, ArenaAllocator<T> >

A PODVector that uses the standard memory Arena. Note that the memory might or might not be managed depending on the amrex.the_arena_is_managed ParmParse parameter.
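
Example usage (a minimal sketch; the element type and the length n are illustrative):

Gpu::HostVector<amrex::Real> hv(n);
// ... fill hv on the host ...
Gpu::DeviceVector<amrex::Real> dv(n);
Gpu::copyAsync(Gpu::hostToDevice, hv.begin(), hv.end(), dv.begin());
Gpu::streamSynchronize(); // wait for the copy before hv is modified or destroyed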

◆ HostVector

template<class T >
using amrex::Gpu::HostVector = PinnedVector<T>

A PODVector that uses pinned host memory. Same as PinnedVector. For a vector class that uses std::allocator by default, see amrex::Vector.

◆ ManagedDeviceVector

template<class T >
using amrex::Gpu::ManagedDeviceVector = PODVector<T, ManagedArenaAllocator<T> >

This is identical to ManagedVector<T>. The ManagedDeviceVector form is deprecated and will be removed in a future release.

◆ ManagedVector

template<class T >
using amrex::Gpu::ManagedVector = PODVector<T, ManagedArenaAllocator<T> >

A PODVector that uses the managed memory arena.

◆ NonManagedDeviceVector

template<class T >
using amrex::Gpu::NonManagedDeviceVector = PODVector<T, DeviceArenaAllocator<T> >

A PODVector that uses the non-managed device memory arena.

◆ PinnedVector

template<class T >
using amrex::Gpu::PinnedVector = PODVector<T, PinnedArenaAllocator<T> >

A PODVector that uses the pinned memory arena.

Function Documentation

◆ blockReduce()

template<int warpSize, typename T , typename WARPREDUCE >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduce ( T  x,
WARPREDUCE &&  warp_reduce,
T  x0 
)

◆ blockReduce_partial()

template<int warpSize, typename T , typename WARPREDUCE , typename ATOMICOP >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::blockReduce_partial ( T *  dest,
T  x,
WARPREDUCE &&  warp_reduce,
ATOMICOP &&  atomic_op,
Gpu::Handler const &  handler 
)

◆ blockReduceMax() [1/2]

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduceMax ( T  source)
noexcept

◆ blockReduceMax() [2/2]

template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduceMax ( T  source)
noexcept

◆ blockReduceMin() [1/2]

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduceMin ( T  source)
noexcept

◆ blockReduceMin() [2/2]

template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduceMin ( T  source)
noexcept

◆ blockReduceSum() [1/2]

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduceSum ( T  source)
noexcept

◆ blockReduceSum() [2/2]

template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::blockReduceSum ( T  source)
noexcept

◆ copy() [1/3]

template<class InIter , class OutIter >
void amrex::Gpu::copy ( DeviceToDevice  ,
InIter  begin,
InIter  end,
OutIter  result 
)
noexcept

A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is blocking - CPU execution will halt until the copy is finished.

Template Parameters
InIter: The input iterator type
OutIter: The output iterator type
Parameters
DeviceToDevice: A tag indicating that the copy is from the device to the device
begin: Where in the input to start reading
end: Where in the input to stop reading
result: Where in the output to start writing

Example usage:

Gpu::copy(Gpu::deviceToDevice, a.begin(), a.end(), b.begin());

◆ copy() [2/3]

template<class InIter , class OutIter >
void amrex::Gpu::copy ( DeviceToHost  ,
InIter  begin,
InIter  end,
OutIter  result 
)
noexcept

A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is blocking - CPU execution will halt until the copy is finished.

Template Parameters
InIter: The input iterator type
OutIter: The output iterator type
Parameters
DeviceToHost: A tag indicating that the copy is from the device to the host
begin: Where in the input to start reading
end: Where in the input to stop reading
result: Where in the output to start writing

Example usage:

Gpu::copy(Gpu::deviceToHost, a.begin(), a.end(), b.begin());

◆ copy() [3/3]

template<class InIter , class OutIter >
void amrex::Gpu::copy ( HostToDevice  ,
InIter  begin,
InIter  end,
OutIter  result 
)
noexcept

A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is blocking - CPU execution will halt until the copy is finished.

Template Parameters
InIter: The input iterator type
OutIter: The output iterator type
Parameters
HostToDevice: A tag indicating that the copy is from the host to the device
begin: Where in the input to start reading
end: Where in the input to stop reading
result: Where in the output to start writing

Example usage:

Gpu::copy(Gpu::hostToDevice, a.begin(), a.end(), b.begin());

◆ copyAsync() [1/3]

template<class InIter , class OutIter >
void amrex::Gpu::copyAsync ( DeviceToDevice  ,
InIter  begin,
InIter  end,
OutIter  result 
)
noexcept

A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is asynchronous - CPU execution will continue, whether or not the copy is finished.

Template Parameters
InIter: The input iterator type
OutIter: The output iterator type
Parameters
DeviceToDevice: A tag indicating that the copy is from the device to the device
begin: Where in the input to start reading
end: Where in the input to stop reading
result: Where in the output to start writing

Example usage:

Gpu::copyAsync(Gpu::deviceToDevice, a.begin(), a.end(), b.begin());

◆ copyAsync() [2/3]

template<class InIter , class OutIter >
void amrex::Gpu::copyAsync ( DeviceToHost  ,
InIter  begin,
InIter  end,
OutIter  result 
)
noexcept

A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is asynchronous - CPU execution will continue, whether or not the copy is finished.

Template Parameters
InIter: The input iterator type
OutIter: The output iterator type
Parameters
DeviceToHost: A tag indicating that the copy is from the device to the host
begin: Where in the input to start reading
end: Where in the input to stop reading
result: Where in the output to start writing

Example usage:

Gpu::copyAsync(Gpu::deviceToHost, a.begin(), a.end(), b.begin());
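
Because the copy is asynchronous, a typical pattern is to synchronize the stream before the destination is read on the host:

Gpu::copyAsync(Gpu::deviceToHost, a.begin(), a.end(), b.begin());
Gpu::streamSynchronize(); // make sure the data has arrived before b is used on the host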

◆ copyAsync() [3/3]

template<class InIter , class OutIter >
void amrex::Gpu::copyAsync ( HostToDevice  ,
InIter  begin,
InIter  end,
OutIter  result 
)
noexcept

A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.

This version is asynchronous - CPU execution will continue, whether or not the copy is finished.

Template Parameters
InIter: The input iterator type
OutIter: The output iterator type
Parameters
HostToDevice: A tag indicating that the copy is from the host to the device
begin: Where in the input to start reading
end: Where in the input to stop reading
result: Where in the output to start writing

Example usage:

Gpu::copyAsync(Gpu::hostToDevice, a.begin(), a.end(), b.begin());
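
Because the copy is asynchronous, the source range should remain valid until the copy completes; a typical pattern is:

Gpu::copyAsync(Gpu::hostToDevice, a.begin(), a.end(), b.begin());
Gpu::streamSynchronize(); // a may be modified or destroyed only after this point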

◆ deviceReduceLogicalAnd()

AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceLogicalAnd ( int *  dest,
int  source,
Gpu::Handler const &  h 
)
noexcept

◆ deviceReduceLogicalOr()

AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceLogicalOr ( int *  dest,
int  source,
Gpu::Handler const &  h 
)
noexcept

◆ deviceReduceMax()

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceMax ( T *  dest,
T  source,
Gpu::Handler const &  h 
)
noexcept

◆ deviceReduceMax_full() [1/2]

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceMax_full ( T *  dest,
T  source 
)
noexcept

◆ deviceReduceMax_full() [2/2]

template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceMax_full ( T *  dest,
T  source 
)
noexcept

◆ deviceReduceMin()

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceMin ( T *  dest,
T  source,
Gpu::Handler const &  h 
)
noexcept

◆ deviceReduceMin_full() [1/2]

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceMin_full ( T *  dest,
T  source 
)
noexcept

◆ deviceReduceMin_full() [2/2]

template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceMin_full ( T *  dest,
T  source 
)
noexcept

◆ deviceReduceSum()

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceSum ( T *  dest,
T  source,
Gpu::Handler const &  h 
)
noexcept

◆ deviceReduceSum_full() [1/2]

template<typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceSum_full ( T *  dest,
T  source 
)
noexcept

◆ deviceReduceSum_full() [2/2]

template<int BLOCKDIMX, typename T >
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void amrex::Gpu::deviceReduceSum_full ( T *  dest,
T  source 
)
noexcept

◆ dtod_memcpy()

void amrex::Gpu::dtod_memcpy ( void *  p_d_dst,
const void *  p_d_src,
const std::size_t  sz 
)
inline noexcept

◆ dtod_memcpy_async()

void amrex::Gpu::dtod_memcpy_async ( void *  p_d_dst,
const void *  p_d_src,
const std::size_t  sz 
)
inline noexcept

◆ dtoh_memcpy()

void amrex::Gpu::dtoh_memcpy ( void *  p_h,
const void *  p_d,
const std::size_t  sz 
)
inline noexcept

◆ dtoh_memcpy_async()

void amrex::Gpu::dtoh_memcpy_async ( void *  p_h,
const void *  p_d,
const std::size_t  sz 
)
inline noexcept

◆ ErrorCheck()

void amrex::Gpu::ErrorCheck ( const char *  file,
int  line 
)
inline noexcept

◆ exclusive_scan()

template<class InIter , class OutIter >
OutIter amrex::Gpu::exclusive_scan ( InIter  begin,
InIter  end,
OutIter  result 
)

◆ fillAsync()

template<typename IT , typename F , typename T = typename std::iterator_traits<IT>::value_type, std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v< T > && amrex::IsCallable< F, T &, Long >::value, int > FOO = 0>
void amrex::Gpu::fillAsync ( IT  first,
IT  last,
F const &  f 
)
noexcept

Fill the elements in the given range using the given callable.

This function is asynchronous for GPU builds.

Template Parameters
IT: the iterator type
F: the callable type
Parameters
first: the inclusive first in the range [first, last)
last: the exclusive last in the range [first, last)
f: the callable with the function signature of void(T&, Long), where T is the element type and the Long parameter is the index for the element to be filled.
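
Example usage (a minimal sketch; the vector v and its length n are illustrative, and a device-capable lambda is assumed for GPU builds):

Gpu::DeviceVector<amrex::Real> v(n);
Gpu::fillAsync(v.begin(), v.end(), [=] AMREX_GPU_DEVICE (amrex::Real& x, Long i) { x = amrex::Real(i); });
Gpu::streamSynchronize(); // the fill is asynchronous for GPU builds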

◆ getThreadBox()

AMREX_GPU_HOST_DEVICE Box amrex::Gpu::getThreadBox ( const Box bx,
Long  offset 
)
inline noexcept

◆ gpuStream()

gpuStream_t amrex::Gpu::gpuStream ( )
inline noexcept

◆ htod_memcpy()

void amrex::Gpu::htod_memcpy ( void *  p_d,
const void *  p_h,
const std::size_t  sz 
)
inline noexcept
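
Example usage (a minimal sketch; the device pointer d_p, host pointer h_p, and byte count nbytes are illustrative):

Gpu::htod_memcpy(d_p, h_p, nbytes); // blocking host-to-device copy of nbytes bytes
Gpu::dtoh_memcpy(h_p, d_p, nbytes); // blocking device-to-host copy of nbytes bytes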

◆ htod_memcpy_async()

void amrex::Gpu::htod_memcpy_async ( void *  p_d,
const void *  p_h,
const std::size_t  sz 
)
inline noexcept

◆ inclusive_scan()

template<class InIter , class OutIter >
OutIter amrex::Gpu::inclusive_scan ( InIter  begin,
InIter  end,
OutIter  result 
)
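
Example usage (a minimal sketch; the device vector v and its length n are illustrative). Gpu::exclusive_scan is used the same way but excludes each element from its own partial sum:

Gpu::DeviceVector<int> v(n);
// ... fill v on the device ...
Gpu::inclusive_scan(v.begin(), v.end(), v.begin()); // in-place inclusive prefix sum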

◆ inGraphRegion()

bool amrex::Gpu::inGraphRegion ( )
inline

◆ inLaunchRegion()

bool amrex::Gpu::inLaunchRegion ( )
inline noexcept

◆ inNoSyncRegion()

bool amrex::Gpu::inNoSyncRegion ( )
inline noexcept

◆ inSingleStreamRegion()

bool amrex::Gpu::inSingleStreamRegion ( )
inline noexcept

◆ isDevicePtr()

bool amrex::Gpu::isDevicePtr ( void const *  p)
inline noexcept

◆ isGpuPtr()

bool amrex::Gpu::isGpuPtr ( void const *  p)
inline noexcept

◆ isinf()

template<class T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool amrex::Gpu::isinf ( T  m)
noexcept

◆ isManaged()

bool amrex::Gpu::isManaged ( void const *  p)
inline noexcept

◆ isnan()

template<class T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool amrex::Gpu::isnan ( T  m)
noexcept

◆ isPinnedPtr()

bool amrex::Gpu::isPinnedPtr ( void const *  p)
inline noexcept

◆ LDG() [1/2]

template<typename T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::LDG ( Array4< T > const &  a,
int  i,
int  j,
int  k 
)
noexcept

◆ LDG() [2/2]

template<typename T >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T amrex::Gpu::LDG ( Array4< T > const &  a,
int  i,
int  j,
int  k,
int  n 
)
noexcept
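
LDG is intended to read a single value through the GPU's read-only data path where the backend supports it, and reduces to a plain load otherwise. Example usage (a minimal sketch; the Array4 a and the indices are illustrative):

amrex::Real q  = Gpu::LDG(a, i, j, k);    // read a(i,j,k)
amrex::Real qn = Gpu::LDG(a, i, j, k, n); // read component n at (i,j,k)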

◆ makeExecutionConfig() [1/2]

template<int MT>
ExecutionConfig amrex::Gpu::makeExecutionConfig ( const Box box)
noexcept

◆ makeExecutionConfig() [2/2]

template<int MT>
ExecutionConfig amrex::Gpu::makeExecutionConfig ( Long  N)
noexcept

◆ makeNExecutionConfigs() [1/2]

template<int MT, int dim>
Vector<ExecConfig> amrex::Gpu::makeNExecutionConfigs ( BoxND< dim > const &  box)
noexcept

◆ makeNExecutionConfigs() [2/2]

template<int MT>
Vector<ExecConfig> amrex::Gpu::makeNExecutionConfigs ( Long  N)
noexcept

◆ memcpy()

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void* amrex::Gpu::memcpy ( void *  dest,
const void *  src,
std::size_t  count 
)

◆ memcpy_from_device_global_to_host_async()

template<typename T >
void amrex::Gpu::memcpy_from_device_global_to_host_async ( void *  dst,
T const &  dg,
std::size_t  nbytes,
std::size_t  offset = 0 
)

Copy nbytes bytes from device global variable to host. offset is the offset in bytes from the start of the device global variable.

◆ memcpy_from_host_to_device_global_async()

template<typename T >
void amrex::Gpu::memcpy_from_host_to_device_global_async ( T &  dg,
const void *  src,
std::size_t  nbytes,
std::size_t  offset = 0 
)

Copy nbytes bytes from host to device global variable. offset is the offset in bytes from the start of the device global variable.

◆ notInGraphRegion()

bool amrex::Gpu::notInGraphRegion ( )
inline

◆ notInLaunchRegion()

bool amrex::Gpu::notInLaunchRegion ( )
inline noexcept

◆ numGpuStreams()

int amrex::Gpu::numGpuStreams ( )
inline noexcept

◆ numThreadsPerBlockParallelFor()

constexpr std::size_t amrex::Gpu::numThreadsPerBlockParallelFor ( )
inline constexpr

◆ prefetchToDevice()

template<class Iter >
void amrex::Gpu::prefetchToDevice ( Iter  begin,
Iter  end 
)
noexcept

Migrate elements of a container from host to device. This is a no-op for host-only code.

This version is blocking - CPU execution will halt until the migration is finished.
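
Example usage (a minimal sketch; v is assumed to be a container backed by managed memory, e.g. Gpu::ManagedVector):

Gpu::ManagedVector<amrex::Real> v(n);
// ... initialize v on the host ...
Gpu::prefetchToDevice(v.begin(), v.end()); // migrate the pages to the device before kernels use them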

◆ prefetchToHost()

template<class Iter >
void amrex::Gpu::prefetchToHost ( Iter  begin,
Iter  end 
)
noexcept

Migrate elements of a container from device to host. This is a no-op for host-only code.

This version is blocking - CPU execution will halt until the migration is finished.

◆ Range()

template<typename T >
AMREX_GPU_HOST_DEVICE range_detail::range_impl<T> amrex::Gpu::Range ( T const &  b)
noexcept

◆ setGraphRegion()

bool amrex::Gpu::setGraphRegion ( bool  graph)
inline

◆ setLaunchRegion()

bool amrex::Gpu::setLaunchRegion ( bool  launch)
inline noexcept

Enable/disable GPU kernel launches.

Note
This will only switch from GPU to CPU for kernels launched with macros. Functions like amrex::ParallelFor will be unaffected. Therefore it should not be used for comparing GPU to non-GPU performance or behavior.

Calling it in pairs disables the launching of GPU kernels between the two calls, as in the sketch below.
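
A minimal sketch (assuming the returned bool is the previous setting):

bool prev = Gpu::setLaunchRegion(false); // macro-launched kernels now run on the CPU
// ... host-only section ...
Gpu::setLaunchRegion(prev);              // restore the previous behavior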

◆ setNoSyncRegion()

bool amrex::Gpu::setNoSyncRegion ( bool  b)
inline noexcept

◆ setSingleStreamRegion()

bool amrex::Gpu::setSingleStreamRegion ( bool  b)
inline noexcept

◆ streamSynchronize()

void amrex::Gpu::streamSynchronize ( )
inline noexcept

◆ streamSynchronizeAll()

void amrex::Gpu::streamSynchronizeAll ( )
inline noexcept

◆ synchronize()

void amrex::Gpu::synchronize ( )
inline noexcept

Variable Documentation

◆ deviceToDevice

constexpr DeviceToDevice amrex::Gpu::deviceToDevice {}
static constexpr

◆ deviceToHost

constexpr DeviceToHost amrex::Gpu::deviceToHost {}
static constexpr

◆ hostToDevice

constexpr HostToDevice amrex::Gpu::hostToDevice {}
static constexpr

◆ in_graph_region

bool amrex::Gpu::in_graph_region = false

◆ in_launch_region

bool amrex::Gpu::in_launch_region = true

◆ in_nosync_region

bool amrex::Gpu::in_nosync_region = false

◆ in_single_stream_region

bool amrex::Gpu::in_single_stream_region = false