1#ifndef AMREX_GPU_LAUNCH_H_
2#define AMREX_GPU_LAUNCH_H_
3#include <AMReX_Config.H>
30#define AMREX_GPU_NCELLS_PER_THREAD 3
31#define AMREX_GPU_Y_STRIDE 1
32#define AMREX_GPU_Z_STRIDE 1
35# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
36 amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
37#elif defined(AMREX_USE_HIP)
38# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
39 hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
49#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
50 template<
class L,
class... Lambdas>
56 template<
class L,
class... Lambdas>
68 template<
class L,
class... Lambdas>
70 std::forward<L>(f0)();
75 template <
class T>
class LayoutData;
82 return AMREX_GPU_MAX_THREADS;
109 const auto len = bx.length3d();
110 Long k =
offset / (len[0]*len[1]);
111 Long j = (
offset - k*(len[0]*len[1])) / len[0];
112 Long i = (
offset - k*(len[0]*len[1])) - j*len[0];
115 static_cast<int>(k))};
117 return (bx &
Box(iv,iv,bx.type()));
163 Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
165 numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
167 numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
168 ec.numBlocks.x = numBlocks;
169 ec.numThreads.x = MT;
178 return makeExecutionConfig<MT>(box.numPts());
191 int numblocks_max = std::numeric_limits<int>::max();
193 Long nmax = Long(MT) * numblocks_max;
196 auto nlaunches =
int((N+nmax-1)/nmax);
199 for (
int i = 0; i < nlaunches; ++i) {
202 nblocks = numblocks_max;
205 nblocks =
int((N+MT-1)/MT);
208 r[i].start_idx = ndone;
209 ndone += Long(nblocks) * MT;
211 r[i].nblocks = nblocks;
216 template <
int MT,
int dim>
219 return makeNExecutionConfigs<MT>(box.numPts());
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_GPU_Z_STRIDE
Definition AMReX_GpuLaunch.H:32
#define AMREX_GPU_NCELLS_PER_THREAD
Definition AMReX_GpuLaunch.H:30
#define AMREX_GPU_Y_STRIDE
Definition AMReX_GpuLaunch.H:31
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_GPU_GLOBAL
Definition AMReX_GpuQualifiers.H:19
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:899
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:884
static AMREX_EXPORT constexpr int warp_size
Definition AMReX_GpuDevice.H:173
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:891
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:957
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:27
Vector< ExecConfig > makeNExecutionConfigs(Long N) noexcept
Definition AMReX_GpuLaunch.H:188
constexpr std::size_t numThreadsPerBlockParallelFor()
Definition AMReX_GpuLaunch.H:81
ExecutionConfig makeExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:160
AMREX_GPU_HOST_DEVICE Box getThreadBox(const Box &bx, Long offset) noexcept
Definition AMReX_GpuLaunch.H:106
Definition AMReX_Amr.cpp:49
void launch_host(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:66
BoxND< AMREX_SPACEDIM > Box
Definition AMReX_BaseFwd.H:27
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE BoxND< dim > surroundingNodes(const BoxND< dim > &b, int dir) noexcept
Returns a BoxND with NODE based coordinates in direction dir that encloses BoxND b....
Definition AMReX_Box.H:1399
IntVectND< AMREX_SPACEDIM > IntVect
Definition AMReX_BaseFwd.H:30
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:127
const int[]
Definition AMReX_BLProfiler.cpp:1664
AMREX_GPU_DEVICE void call_device(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:54
AMREX_GPU_GLOBAL void launch_global(L f0, Lambdas... fs)
Definition AMReX_GpuLaunch.H:51
Definition AMReX_GpuLaunch.H:90
Box box
Definition AMReX_GpuLaunch.H:91
int ic
Definition AMReX_GpuLaunch.H:92
int nc
Definition AMReX_GpuLaunch.H:93
Definition AMReX_GpuLaunch.H:182
int nblocks
Definition AMReX_GpuLaunch.H:184
Long start_idx
Definition AMReX_GpuLaunch.H:183
Definition AMReX_GpuLaunch.H:128
ExecutionConfig(dim3 nb, dim3 nt, std::size_t sm=0) noexcept
Definition AMReX_GpuLaunch.H:150
ExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:147
dim3 numBlocks
Definition AMReX_GpuLaunch.H:153
dim3 numThreads
Definition AMReX_GpuLaunch.H:154
ExecutionConfig(const Box &box, int comps) noexcept
Definition AMReX_GpuLaunch.H:143
ExecutionConfig(const Box &box) noexcept
Definition AMReX_GpuLaunch.H:132
ExecutionConfig() noexcept
Definition AMReX_GpuLaunch.H:129
std::size_t sharedMem
Definition AMReX_GpuLaunch.H:155
Definition AMReX_GpuLaunch.H:96
int globalBlockId
Definition AMReX_GpuLaunch.H:99
int numBlocks
Definition AMReX_GpuLaunch.H:97
int numThreads
Definition AMReX_GpuLaunch.H:98