1#ifndef AMREX_GPU_LAUNCH_H_
2#define AMREX_GPU_LAUNCH_H_
3#include <AMReX_Config.H>
31#define AMREX_GPU_NCELLS_PER_THREAD 3
32#define AMREX_GPU_Y_STRIDE 1
33#define AMREX_GPU_Z_STRIDE 1
36# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
37 amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
38# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
39 amrex::launch_global <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
40#elif defined(AMREX_USE_HIP)
41# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
42 hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
43# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
44 hipLaunchKernelGGL(launch_global , blocks, threads, sharedMem, stream, __VA_ARGS__)
54#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
55 template<
class L,
class... Lambdas>
62 template<
class L,
class... Lambdas>
65 call_device(std::forward<Lambdas>(fs)...);
75 template<
class L,
class... Lambdas>
77 std::forward<L>(f0)();
82 template <
class T>
class LayoutData;
89 return AMREX_GPU_MAX_THREADS;
100 const auto len = bx.length3d();
102 Long j = (
offset - k*(len[0]*len[1])) / len[0];
103 Long i = (
offset - k*(len[0]*len[1])) - j*len[0];
106 static_cast<int>(k))};
108 return (bx &
Box(iv,iv,bx.type()));
154 Long numBlocks = (std::max(N,
Long(1)) + MT - 1) / MT;
156 numBlocks = std::min(numBlocks,
Long(std::numeric_limits<unsigned int>::max()/MT));
158 numBlocks = std::min(numBlocks,
Long(std::numeric_limits<int>::max()));
159 ec.numBlocks.x = numBlocks;
160 ec.numThreads.x = MT;
169 return makeExecutionConfig<MT>(box.numPts());
182 int numblocks_max = std::numeric_limits<int>::max();
184 Long nmax =
Long(MT) * numblocks_max;
187 auto nlaunches =
int((N+nmax-1)/nmax);
190 for (
int i = 0; i < nlaunches; ++i) {
193 nblocks = numblocks_max;
196 nblocks =
int((N+MT-1)/MT);
199 r[i].start_idx = ndone;
200 ndone +=
Long(nblocks) * MT;
202 r[i].nblocks = nblocks;
207 template <
int MT,
int dim>
210 return makeNExecutionConfigs<MT>(box.numPts());
219namespace amrex::detail {
221 template <
typename... L>
constexpr bool is_big_kernel () {
222 return (
sizeof(L) + ... + 0) > 1792;
225 template <
typename... L>
226 struct SyclKernelDevPtr
228 using Ls = GpuTuple<L...>;
233 SyclKernelDevPtr (L
const&... f, gpuStream_t
const& stream)
236 if constexpr (is_big_kernel<L...>()) {
237 std::size_t sz =
sizeof(Ls);
241 auto* l_hp = (
void const*)m_hp;
242 auto* l_dp = (
void*)m_dp;
243 stream.queue->submit([&] (sycl::handler& h) {
244 h.memcpy(l_dp, l_hp, sz);
253 if constexpr (is_big_kernel<Ls>()) {
255 m_stream.queue->wait_and_throw();
256 }
catch (sycl::exception
const& ex) {
262 amrex::Abort(std::string(
"~SyclKernelDevPtr: ")+ex.what());
276 using pointer_t = std::add_pointer_t<
277 std::add_const_t<typename std::tuple_element<N, Ls>::type>>;
279 std::ptrdiff_t
offset = (
char*)(&(amrex::get<N>(*m_hp))) - (
char*)m_hp;
280 return pointer_t((
char const*)m_dp +
offset);
282 return pointer_t(
nullptr);
308#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)
318template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
346template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
354template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
358#pragma omp parallel for
359 for (T i = 0; i < n; ++i) {
370#if (AMREX_SPACEDIM == 1)
371#pragma omp parallel for
372 for (
int i = lo.x; i <= hi.x; ++i) {
375#elif (AMREX_SPACEDIM == 2)
376#pragma omp parallel for
377 for (
int j = lo.y; j <= hi.y; ++j) {
379 for (
int i = lo.x; i <= hi.x; ++i) {
384#pragma omp parallel for collapse(2)
385 for (
int k = lo.z; k <= hi.z; ++k) {
386 for (
int j = lo.y; j <= hi.y; ++j) {
388 for (
int i = lo.x; i <= hi.x; ++i) {
396template <
typename T,
typename L,
typename M=std::enable_if_t<std::is_
integral_v<T>> >
402#if (AMREX_SPACEDIM == 1)
403#pragma omp parallel for collapse(2)
404 for (T n = 0; n < ncomp; ++n) {
405 for (
int i = lo.x; i <= hi.x; ++i) {
409#elif (AMREX_SPACEDIM == 2)
410#pragma omp parallel for collapse(2)
411 for (T n = 0; n < ncomp; ++n) {
412 for (
int j = lo.y; j <= hi.y; ++j) {
414 for (
int i = lo.x; i <= hi.x; ++i) {
420#pragma omp parallel for collapse(3)
421 for (T n = 0; n < ncomp; ++n) {
422 for (
int k = lo.z; k <= hi.z; ++k) {
423 for (
int j = lo.y; j <= hi.y; ++j) {
425 for (
int i = lo.x; i <= hi.x; ++i) {
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_ATTRIBUTE_FLATTEN_FOR
Definition AMReX_Extension.H:151
#define AMREX_GPU_Z_STRIDE
Definition AMReX_GpuLaunch.H:33
#define AMREX_GPU_NCELLS_PER_THREAD
Definition AMReX_GpuLaunch.H:31
#define AMREX_GPU_Y_STRIDE
Definition AMReX_GpuLaunch.H:32
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_GPU_GLOBAL
Definition AMReX_GpuQualifiers.H:19
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ const int * hiVect() const &noexcept
Return a constant pointer to the array of high end coordinates. Useful for calls to FORTRAN.
Definition AMReX_Box.H:191
__host__ __device__ const int * loVect() const &noexcept
Return a constant pointer to the array of low end coordinates. Useful for calls to FORTRAN.
Definition AMReX_Box.H:186
__host__ __device__ BoxND & coarsen(int ref_ratio) noexcept
Coarsen BoxND by given (positive) refinement ratio. NOTE: if type(dir) = CELL centered: lo <- lo/rati...
Definition AMReX_Box.H:722
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:111
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1011
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:996
static constexpr int warp_size
Definition AMReX_GpuDevice.H:197
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1003
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1069
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:319
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1331
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:823
Arena * The_Arena()
Definition AMReX_Arena.cpp:783
__host__ __device__ Box getThreadBox(const Box &bx, Long offset) noexcept
Definition AMReX_GpuLaunch.H:97
Vector< ExecConfig > makeNExecutionConfigs(Long N) noexcept
Definition AMReX_GpuLaunch.H:179
constexpr std::size_t numThreadsPerBlockParallelFor()
Definition AMReX_GpuLaunch.H:88
ExecutionConfig makeExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:151
Definition AMReX_Amr.cpp:49
void launch_host(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:73
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ BoxND< dim > surroundingNodes(const BoxND< dim > &b, int dir) noexcept
Return a BoxND with NODE based coordinates in direction dir that encloses BoxND b....
Definition AMReX_Box.H:1522
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
const int[]
Definition AMReX_BLProfiler.cpp:1664
__global__ void launch_global(L f0, Lambdas... fs)
Definition AMReX_GpuLaunch.H:56
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1230
Definition AMReX_GpuLaunch.H:173
int nblocks
Definition AMReX_GpuLaunch.H:175
Long start_idx
Definition AMReX_GpuLaunch.H:174
Definition AMReX_GpuLaunch.H:119
ExecutionConfig(dim3 nb, dim3 nt, std::size_t sm=0) noexcept
Definition AMReX_GpuLaunch.H:141
ExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:138
dim3 numBlocks
Definition AMReX_GpuLaunch.H:144
dim3 numThreads
Definition AMReX_GpuLaunch.H:145
ExecutionConfig(const Box &box, int comps) noexcept
Definition AMReX_GpuLaunch.H:134
ExecutionConfig(const Box &box) noexcept
Definition AMReX_GpuLaunch.H:123
ExecutionConfig() noexcept
Definition AMReX_GpuLaunch.H:120
std::size_t sharedMem
Definition AMReX_GpuLaunch.H:146