Namespaces
| namespace | Atomic |
| namespace | detail |
| namespace | range_detail |
Typedefs
| template<class T > | |
| using | DeviceVector = PODVector< T, ArenaAllocator< T > > |
| A PODVector that uses the standard memory Arena. Note that the memory might or might not be managed depending on the amrex.the_arena_is_managed ParmParse parameter. | |
| template<class T > | |
| using | NonManagedDeviceVector = PODVector< T, DeviceArenaAllocator< T > > |
| A PODVector that uses the non-managed device memory arena. | |
| template<class T > | |
| using | ManagedVector = PODVector< T, ManagedArenaAllocator< T > > |
| A PODVector that uses the managed memory arena. | |
| template<class T > | |
| using | PinnedVector = PODVector< T, PinnedArenaAllocator< T > > |
| A PODVector that uses the pinned memory arena. | |
| template<class T > | |
| using | AsyncVector = PODVector< T, AsyncArenaAllocator< T > > |
| A PODVector that uses the async memory arena. May be useful for temporary vectors inside MFIters that are accessed on the device. | |
| template<class T > | |
| using | HostVector = PinnedVector< T > |
| A PODVector that uses pinned host memory. Same as PinnedVector. For a vector class that uses std::allocator by default, see amrex::Vector. | |
| template<class T > | |
| using | ManagedDeviceVector = PODVector< T, ManagedArenaAllocator< T > > |
| This is identical to ManagedVector<T>. The ManagedDeviceVector form is deprecated and will be removed in a future release. | |
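A minimal usage sketch for these container aliases (illustrative only; assumes a GPU-enabled AMReX build, an initialized AMReX session, and that the aliases are available through AMReX_GpuContainers.H):

    #include <AMReX_GpuContainers.H>
    #include <algorithm>

    void stage_and_upload ()   // hypothetical helper, called after amrex::Initialize
    {
        amrex::Gpu::HostVector<amrex::Real> hv(256);          // pinned host memory
        std::fill(hv.begin(), hv.end(), amrex::Real(1.0));
        amrex::Gpu::DeviceVector<amrex::Real> dv(hv.size());  // arena (device) memory
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, hv.begin(), hv.end(), dv.begin());
        amrex::Gpu::streamSynchronize();  // wait before hv goes out of scope
    }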
Functions
| template<class InIter , class OutIter > | |
| void | copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class InIter , class OutIter > | |
| void | copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept |
| A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement. | |
| template<class Iter > | |
| void | prefetchToHost (Iter begin, Iter end) noexcept |
| Migrate elements of a container from device to host. This is a no-op for host-only code. | |
| template<class Iter > | |
| void | prefetchToDevice (Iter begin, Iter end) noexcept |
| Migrate elements of a container from host to device. This is a no-op for host-only code. | |
| template<typename IT , typename F , typename T = typename std::iterator_traits<IT>::value_type, std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v<T> && amrex::IsCallable<F, T&, Long>::value, int> FOO = 0> | |
| void | fillAsync (IT first, IT last, F const &f) noexcept |
| Fill the elements in the given range using the given callable. | |
| bool | inLaunchRegion () noexcept |
| bool | notInLaunchRegion () noexcept |
| bool | setLaunchRegion (bool launch) noexcept |
| bool | inGraphRegion () |
| bool | notInGraphRegion () |
| bool | setGraphRegion (bool graph) |
| bool | inSingleStreamRegion () noexcept |
| bool | inNoSyncRegion () noexcept |
| bool | setSingleStreamRegion (bool b) noexcept |
| bool | setNoSyncRegion (bool b) noexcept |
| gpuStream_t | gpuStream () noexcept |
| int | numGpuStreams () noexcept |
| void | synchronize () noexcept |
| void | streamSynchronize () noexcept |
| void | streamSynchronizeAll () noexcept |
| void | freeAsync (Arena *arena, void *mem) noexcept |
| void | htod_memcpy_async (void *p_d, const void *p_h, const std::size_t sz) noexcept |
| void | dtoh_memcpy_async (void *p_h, const void *p_d, const std::size_t sz) noexcept |
| void | dtod_memcpy_async (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept |
| void | htod_memcpy (void *p_d, const void *p_h, const std::size_t sz) noexcept |
| void | dtoh_memcpy (void *p_h, const void *p_d, const std::size_t sz) noexcept |
| void | dtod_memcpy (void *p_d_dst, const void *p_d_src, const std::size_t sz) noexcept |
| template<typename T > | |
| void | memcpy_from_host_to_device_global_async (T &dg, const void *src, std::size_t nbytes, std::size_t offset=0) |
| template<typename T > | |
| void | memcpy_from_device_global_to_host_async (void *dst, T const &dg, std::size_t nbytes, std::size_t offset=0) |
| void | ErrorCheck (const char *file, int line) noexcept |
| constexpr std::size_t | numThreadsPerBlockParallelFor () |
| __host__ __device__ Box | getThreadBox (const Box &bx, Long offset) noexcept |
| template<int MT> | |
| ExecutionConfig | makeExecutionConfig (Long N) noexcept |
| template<int MT> | |
| ExecutionConfig | makeExecutionConfig (const Box &box) noexcept |
| template<int MT> | |
| Vector< ExecConfig > | makeNExecutionConfigs (Long N) noexcept |
| template<int MT, int dim> | |
| Vector< ExecConfig > | makeNExecutionConfigs (BoxND< dim > const &box) noexcept |
| template<typename T > | |
| __host__ __device__ range_detail::range_impl< T > | Range (T const &b) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceSum (T *dest, T source, Gpu::Handler const &h) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMin (T *dest, T source, Gpu::Handler const &h) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMax (T *dest, T source, Gpu::Handler const &h) noexcept |
| __device__ void | deviceReduceLogicalAnd (int *dest, int source, Gpu::Handler const &h) noexcept |
| __device__ void | deviceReduceLogicalOr (int *dest, int source, Gpu::Handler const &h) noexcept |
| template<int warpSize, typename T , typename WARPREDUCE > | |
| __device__ T | blockReduce (T x, WARPREDUCE &&warp_reduce, T x0) |
| template<int warpSize, typename T , typename WARPREDUCE , typename ATOMICOP > | |
| __device__ void | blockReduce_partial (T *dest, T x, WARPREDUCE &&warp_reduce, ATOMICOP &&atomic_op, Gpu::Handler const &handler) |
| template<typename T > | |
| __device__ T | blockReduceSum (T source) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceSum_full (T *dest, T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ T | blockReduceSum (T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ void | deviceReduceSum_full (T *dest, T source) noexcept |
| template<typename T > | |
| __device__ T | blockReduceMin (T source) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMin_full (T *dest, T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ T | blockReduceMin (T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ void | deviceReduceMin_full (T *dest, T source) noexcept |
| template<typename T > | |
| __device__ T | blockReduceMax (T source) noexcept |
| template<typename T > | |
| __device__ void | deviceReduceMax_full (T *dest, T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ T | blockReduceMax (T source) noexcept |
| template<int BLOCKDIMX, typename T > | |
| __device__ void | deviceReduceMax_full (T *dest, T source) noexcept |
| template<typename T > | |
| __host__ __device__ T | LDG (Array4< T > const &a, int i, int j, int k) noexcept |
| template<typename T > | |
| __host__ __device__ T | LDG (Array4< T > const &a, int i, int j, int k, int n) noexcept |
| bool | isManaged (void const *p) noexcept |
| bool | isDevicePtr (void const *p) noexcept |
| bool | isPinnedPtr (void const *p) noexcept |
| bool | isGpuPtr (void const *p) noexcept |
| template<class T > | |
| __host__ __device__ bool | isnan (T m) noexcept |
| template<class T > | |
| __host__ __device__ bool | isinf (T m) noexcept |
| __host__ __device__ void * | memcpy (void *dest, const void *src, std::size_t count) |
| template<class InIter , class OutIter > | |
| OutIter | inclusive_scan (InIter begin, InIter end, OutIter result) |
| template<class InIter , class OutIter > | |
| OutIter | exclusive_scan (InIter begin, InIter end, OutIter result) |
Variables
| static constexpr HostToDevice | hostToDevice {} |
| static constexpr DeviceToHost | deviceToHost {} |
| static constexpr DeviceToDevice | deviceToDevice {} |
| bool | in_launch_region = true |
| bool | in_graph_region = false |
| bool | in_single_stream_region = false |
| bool | in_nosync_region = false |
Function Documentation
template<class InIter , class OutIter >
void amrex::Gpu::copy (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is blocking - CPU execution will halt until the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
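A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Duplicate one device vector into another; blocks until the copy is done.
    amrex::Gpu::DeviceVector<amrex::Real> src(128), dst(128);
    amrex::Gpu::copy(amrex::Gpu::deviceToDevice, src.begin(), src.end(), dst.begin());
    // dst now holds a copy of src; no further synchronization is required.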
template<class InIter , class OutIter >
void amrex::Gpu::copy (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is blocking - CPU execution will halt until the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
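A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Bring device results back to the host; blocks until the data is available.
    amrex::Gpu::DeviceVector<amrex::Real> d_data(128);
    amrex::Gpu::HostVector<amrex::Real> h_data(d_data.size());
    amrex::Gpu::copy(amrex::Gpu::deviceToHost, d_data.begin(), d_data.end(), h_data.begin());
    // h_data can be read immediately after this call returns.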
template<class InIter , class OutIter >
void amrex::Gpu::copy (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is blocking - CPU execution will halt until the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
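A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Upload host data to the device; blocks until the transfer completes.
    amrex::Gpu::HostVector<amrex::Real> h_data(128);
    for (auto& x : h_data) { x = amrex::Real(2.0); }
    amrex::Gpu::DeviceVector<amrex::Real> d_data(h_data.size());
    amrex::Gpu::copy(amrex::Gpu::hostToDevice, h_data.begin(), h_data.end(), d_data.begin());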
template<class InIter , class OutIter >
void amrex::Gpu::copyAsync (DeviceToDevice, InIter begin, InIter end, OutIter result) noexcept
A device-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is asynchronous - CPU execution will continue, whether or not the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
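A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Enqueue a device-to-device copy on the current stream and keep working.
    amrex::Gpu::DeviceVector<amrex::Real> src(128), dst(128);
    amrex::Gpu::copyAsync(amrex::Gpu::deviceToDevice, src.begin(), src.end(), dst.begin());
    // Kernels launched later on the same stream see the copied data;
    // call streamSynchronize() before the host depends on completion.
    amrex::Gpu::streamSynchronize();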
template<class InIter , class OutIter >
void amrex::Gpu::copyAsync (DeviceToHost, InIter begin, InIter end, OutIter result) noexcept
A device-to-host copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is asynchronous - CPU execution will continue, whether or not the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
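A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Enqueue a download; synchronize before reading the host buffer.
    amrex::Gpu::DeviceVector<amrex::Real> d_data(128);
    amrex::Gpu::HostVector<amrex::Real> h_data(d_data.size());
    amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, d_data.begin(), d_data.end(), h_data.begin());
    amrex::Gpu::streamSynchronize();  // h_data is valid only after this returns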
template<class InIter , class OutIter >
void amrex::Gpu::copyAsync (HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous storage. The amrex-provided containers like Gpu::HostVector, Gpu::DeviceVector, etc. meet this requirement.
This version is asynchronous - CPU execution will continue, whether or not the copy is finished.
| InIter | The input iterator type |
| OutIter | The output iterator type |
| begin | Where in the input to start reading |
| end | Where in the input to stop reading |
| result | Where in the output to start writing |
Example usage:
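A minimal sketch with illustrative variable names (assuming a GPU-enabled build):

    // Enqueue an upload; the host buffer must stay alive until the copy finishes.
    amrex::Gpu::HostVector<amrex::Real> h_data(128);
    amrex::Gpu::DeviceVector<amrex::Real> d_data(h_data.size());
    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, h_data.begin(), h_data.end(), d_data.begin());
    amrex::Gpu::streamSynchronize();  // or keep h_data in scope until a later sync point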
template<class InIter , class OutIter >
OutIter amrex::Gpu::exclusive_scan (InIter begin, InIter end, OutIter result)
template<typename IT , typename F , typename T = typename std::iterator_traits<IT>::value_type, std::enable_if_t<(sizeof(T) <= 36*8) && std::is_trivially_copyable_v<T> && amrex::IsCallable<F, T&, Long>::value, int> FOO = 0>
void amrex::Gpu::fillAsync (IT first, IT last, F const &f) noexcept
Fill the elements in the given range using the given callable.
This function is asynchronous for GPU builds.
| IT | the iterator type |
| F | the callable type |
| first | the inclusive first in the range [first, last) |
| last | the exclusive last in the range [first, last) |
| f | the callable with the function signature of void(T&, Long), where T is the element type and the Long parameter is the index for the element to be filled. |
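A minimal sketch of filling a device vector with index-dependent values (illustrative names; assumes a GPU build, where the callable must be usable on the device):

    amrex::Gpu::DeviceVector<amrex::Real> v(1000);
    const amrex::Real dx = 0.01;
    amrex::Gpu::fillAsync(v.begin(), v.end(),
        [=] AMREX_GPU_DEVICE (amrex::Real& x, amrex::Long i) noexcept
        {
            x = amrex::Real(i) * dx;   // element i receives i*dx
        });
    amrex::Gpu::streamSynchronize();   // only needed if the host uses v right away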
void amrex::Gpu::freeAsync (Arena *arena, void *mem) noexcept
Deallocate memory belonging to an arena asynchronously. Memory deallocated in this way is held in a pool and will not be reused until the next amrex::Gpu::streamSynchronize(). GPU kernels that were already launched on the currently active stream can still continue to use the memory after this function is called. There is no need to use this function for CPU-only memory or with The_Async_Arena.
| [in] | arena | the arena the memory belongs to |
| [in] | mem | pointer to the memory to be freed |
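A minimal sketch (illustrative; assumes amrex::The_Arena() is the arena the memory came from):

    std::size_t nbytes = 1024;
    void* p = amrex::The_Arena()->alloc(nbytes);
    // ... launch kernels on the current stream that read or write p ...
    amrex::Gpu::freeAsync(amrex::The_Arena(), p);
    // Already-launched kernels may still use p; the memory is recycled only
    // after the next amrex::Gpu::streamSynchronize().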
template<class InIter , class OutIter >
OutIter amrex::Gpu::inclusive_scan (InIter begin, InIter end, OutIter result)
template<typename T >
void amrex::Gpu::memcpy_from_device_global_to_host_async (void *dst, T const &dg, std::size_t nbytes, std::size_t offset = 0)
Copy nbytes bytes from device global variable to host. offset is the offset in bytes from the start of the device global variable.
template<typename T >
void amrex::Gpu::memcpy_from_host_to_device_global_async (T &dg, const void *src, std::size_t nbytes, std::size_t offset = 0)
Copy nbytes bytes from host to device global variable. offset is the offset in bytes from the start of the device global variable.
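A minimal sketch of moving data to and from a device global variable (illustrative only; the declaration below uses CUDA/HIP syntax, and the portable declaration in an application may differ):

    __device__ amrex::Real d_coeffs[4];   // hypothetical device global

    void update_coeffs ()   // hypothetical host-side helper
    {
        amrex::Real h_coeffs[4] = {1., 2., 3., 4.};
        amrex::Gpu::memcpy_from_host_to_device_global_async(d_coeffs, h_coeffs, sizeof(h_coeffs));
        // ... kernels that read d_coeffs ...
        amrex::Real h_check[4];
        amrex::Gpu::memcpy_from_device_global_to_host_async(h_check, d_coeffs, sizeof(h_check));
        amrex::Gpu::streamSynchronize();  // both transfers are asynchronous
    }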
template<class Iter >
void amrex::Gpu::prefetchToDevice (Iter begin, Iter end) noexcept
Migrate elements of a container from host to device. This is a no-op for host-only code.
This version is blocking - CPU execution will halt until the migration is finished.
template<class Iter >
void amrex::Gpu::prefetchToHost (Iter begin, Iter end) noexcept
Migrate elements of a container from device to host. This is a no-op for host-only code.
This version is blocking - CPU execution will halt until the migration is finished.
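A minimal sketch covering both prefetch directions (illustrative; only meaningful with managed memory in a GPU build):

    amrex::Gpu::ManagedVector<amrex::Real> mv(1 << 20);
    amrex::Gpu::prefetchToDevice(mv.begin(), mv.end());  // before device kernels touch mv
    // ... device work on mv ...
    amrex::Gpu::prefetchToHost(mv.begin(), mv.end());    // before host loops read mv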
bool amrex::Gpu::setLaunchRegion (bool launch) noexcept
Enable/disable GPU kernel launches.
Note that amrex::ParallelFor is unaffected by this setting, so it should not be used for comparing GPU to non-GPU performance or behavior. Pairing a call that passes false with a later call that restores the previous value disables the launching of GPU kernels between the two calls, as sketched below.
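A minimal sketch of that pairing (assumes, as the signature suggests, that the returned bool is the previous setting):

    bool launch_prev = amrex::Gpu::setLaunchRegion(false);
    // ... launch-region-aware code here runs on the host even in GPU builds;
    //     amrex::ParallelFor is unaffected ...
    amrex::Gpu::setLaunchRegion(launch_prev);   // restore the previous setting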