#ifndef AMREX_GPU_LAUNCH_H_
#define AMREX_GPU_LAUNCH_H_
#include <AMReX_Config.H>
#define AMREX_GPU_NCELLS_PER_THREAD 3
#define AMREX_GPU_Y_STRIDE 1
#define AMREX_GPU_Z_STRIDE 1
#if defined(AMREX_USE_CUDA)
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
        amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
        amrex::launch_global<<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
#elif defined(AMREX_USE_HIP)
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
        hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
        hipLaunchKernelGGL(launch_global, blocks, threads, sharedMem, stream, __VA_ARGS__)
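// A typical call site, sketched under the assumption of a CUDA or HIP build:
// the caller picks the launch-bounds value MT and a grid/block shape
// (numBlocks/numThreads below stand for caller-chosen dim3 values), and passes
// one or more device lambdas, which the generated launch_global kernel invokes
// in order on every thread.
//
//     constexpr int MT = AMREX_GPU_MAX_THREADS;
//     AMREX_LAUNCH_KERNEL(MT, numBlocks, numThreads, 0, amrex::Gpu::gpuStream(),
//         [=] AMREX_GPU_DEVICE () noexcept
//         {
//             // device-side work executed by every thread of the launch
//         });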
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    template<class L, class... Lambdas>
    AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }

    template<class L>
    AMREX_GPU_DEVICE void call_device (L&& f0) noexcept { f0(); }
    template<class L, class... Lambdas>
    AMREX_GPU_DEVICE void call_device (L&& f0, Lambdas&&... fs) noexcept
    {
        f0();
        call_device(std::forward<Lambdas>(fs)...);
    }
    template<class L, class... Lambdas>
    void launch_host (L&& f0, Lambdas&&... fs) noexcept
    {
        std::forward<L>(f0)();
        launch_host(std::forward<Lambdas>(fs)...);
    }
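    // On the host path the same fused-launch idiom reduces to invoking each
    // functor immediately and in order; a minimal sketch with two lambdas:
    //
    //     launch_host([&] () noexcept { /* first piece of host work */ },
    //                 [&] () noexcept { /* second piece of host work */ });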
    template <class T> class LayoutData;
    constexpr std::size_t numThreadsPerBlockParallelFor () { return AMREX_GPU_MAX_THREADS; }
    const auto len = bx.length3d();
    Long k =  offset / (len[0]*len[1]);
    Long j = (offset - k*(len[0]*len[1])) / len[0];
    Long i = (offset - k*(len[0]*len[1])) - j*len[0];
    IntVect iv{AMREX_D_DECL(static_cast<int>(i),
                            static_cast<int>(j),
                            static_cast<int>(k))};
    iv += bx.smallEnd();
    return (bx & Box(iv,iv,bx.type()));
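    // Worked example of the decomposition above: for a box whose length3d()
    // is (4,3,2), offset 9 gives k = 9/12 = 0, j = (9 - 0)/4 = 2, and
    // i = 9 - 0 - 2*4 = 1, i.e. the cell (1,2,0) relative to the box's
    // smallEnd, and intersecting with bx yields that single-cell Box.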
template <int MT>
ExecutionConfig makeExecutionConfig (Long N) noexcept
{
    ExecutionConfig ec(dim3{}, dim3{});
    Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
    // Keep blockDim.x*gridDim.x within unsigned int and gridDim.x within 2^31-1.
    numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
    numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
    ec.numBlocks.x = numBlocks;
    ec.numThreads.x = MT;
    return makeExecutionConfig<MT>(box.numPts());
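// A launch sketch built on this configuration, assuming one element of work
// per thread and an N small enough for a single launch (ec and N are
// illustrative names):
//
//     constexpr int MT = AMREX_GPU_MAX_THREADS;
//     const auto ec = makeExecutionConfig<MT>(N);
//     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, ec.sharedMem,
//                         Gpu::gpuStream(),
//         [=] AMREX_GPU_DEVICE () noexcept
//         {
//             auto i = Long(blockDim.x)*blockIdx.x + threadIdx.x;
//             if (i < N) { /* work on element i */ }
//         });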
    int numblocks_max = std::numeric_limits<int>::max();
    Long nmax = Long(MT) * numblocks_max;
    auto nlaunches = int((N+nmax-1)/nmax);
    Vector<ExecConfig> r(nlaunches);
    Long ndone = 0;
    for (int i = 0; i < nlaunches; ++i) {
        int nblocks;
        if (N > nmax) {
            nblocks = numblocks_max;
            N -= nmax;
        } else {
            nblocks = int((N+MT-1)/MT);
        }
        r[i].start_idx = ndone;
        ndone += Long(nblocks) * MT;
        r[i].nblocks = nblocks;
    }
    return r;
template <int MT, int dim>
Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
{
    return makeNExecutionConfigs<MT>(box.numPts());
}
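// A sketch of consuming the per-launch configurations for a range too large
// for a single kernel launch; each launch covers ec.nblocks blocks of MT
// threads starting at element ec.start_idx (N is an illustrative name):
//
//     constexpr int MT = AMREX_GPU_MAX_THREADS;
//     for (auto const& ec : makeNExecutionConfigs<MT>(N)) {
//         Long start = ec.start_idx;
//         AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, Gpu::gpuStream(),
//             [=] AMREX_GPU_DEVICE () noexcept
//             {
//                 auto i = start + Long(blockDim.x)*blockIdx.x + threadIdx.x;
//                 if (i < N) { /* work on element i */ }
//             });
//     }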
#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
#pragma omp parallel for
    for (T i = 0; i < n; ++i) {
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for
    for (int i = lo.x; i <= hi.x; ++i) {

#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for
    for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {

#pragma omp parallel for collapse(2)
    for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int i = lo.x; i <= hi.x; ++i) {

#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {

#pragma omp parallel for collapse(3)
    for (T n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
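// A host-side usage sketch, assuming an OpenMP build; the integral overload
// matches ParallelForOMP(T n, L const& f) above, and the Box overload is
// assumed to pass (i,j,k) cell indices spanning the lo/hi loop nests shown
// here (n, a, x, y, box, and fab are illustrative names):
//
//     ParallelForOMP(n, [&] (Long i) noexcept { y[i] += a * x[i]; });
//
//     ParallelForOMP(box, [&] (int i, int j, int k) noexcept
//     {
//         fab(i,j,k) = 0.0;
//     });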