Block-Structured AMR Software Framework
 
AMReX_GpuLaunch.H
#ifndef AMREX_GPU_LAUNCH_H_
#define AMREX_GPU_LAUNCH_H_
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuError.H>
#include <AMReX_GpuRange.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuMemory.H>
#include <AMReX_GpuReduce.H>
#include <AMReX_Tuple.H>
#include <AMReX_Box.H>
#include <AMReX_Loop.H>
#include <AMReX_Extension.H>
#include <AMReX_BLassert.H>
#include <AMReX_TypeTraits.H>
#include <AMReX_GpuLaunchGlobal.H>
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
#include <utility>

#define AMREX_GPU_NCELLS_PER_THREAD 3
#define AMREX_GPU_Y_STRIDE 1
#define AMREX_GPU_Z_STRIDE 1

#ifdef AMREX_USE_CUDA
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ...) \
        amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ...) \
        amrex::launch_global<<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
#elif defined(AMREX_USE_HIP)
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ...) \
        hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ...) \
        hipLaunchKernelGGL(launch_global, blocks, threads, sharedMem, stream, __VA_ARGS__)
#endif
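
For illustration only, a minimal sketch of how AMREX_LAUNCH_KERNEL is typically invoked with a single device lambda. The helper function, block count, and the use of amrex::Gpu::gpuStream() below are assumptions made for this sketch, not part of the header (CUDA form shown; the HIP form is analogous):

    #include <AMReX_GpuLaunch.H>

    void launch_example ()   // hypothetical helper, not an AMReX function
    {
    #ifdef AMREX_USE_CUDA
        constexpr int MT = 256;   // launch bound: max threads per block
        int nblocks = 128;        // example grid size
        AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, amrex::Gpu::gpuStream(),
            [=] AMREX_GPU_DEVICE () noexcept
            {
                // per-thread device work; indices come from threadIdx/blockIdx
                // as in any ordinary CUDA kernel
            });
        amrex::Gpu::streamSynchronize();   // wait for the kernel to finish
    #endif
    }
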


namespace amrex {

// We cannot take rvalue lambdas.
// ************************************************
//  Variadic lambda function wrappers for C++ CUDA/HIP Kernel calls.

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    template<class L, class... Lambdas>
    AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }

    template<class L>
    AMREX_GPU_DEVICE void call_device (L&& f0) noexcept { f0(); }

    template<class L, class... Lambdas>
    AMREX_GPU_DEVICE void call_device (L&& f0, Lambdas&&... fs) noexcept {
        f0();
        call_device(std::forward<Lambdas>(fs)...);
    }
#endif

// CPU variation

    template<class L>
    void launch_host (L&& f0) noexcept { std::forward<L>(f0)(); }

    template<class L, class... Lambdas>
    void launch_host (L&& f0, Lambdas&&... fs) noexcept {
        std::forward<L>(f0)();
        launch_host(std::forward<Lambdas>(fs)...);
    }
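
A small illustrative sketch of launch_host, which simply invokes the given callables in order on the host (the helper function and variable names below are hypothetical):

    #include <AMReX_GpuLaunch.H>

    void host_example ()   // hypothetical helper
    {
        int a = 0, b = 0;
        amrex::launch_host([&] () noexcept { a = 1; },
                           [&] () noexcept { b = 2; });
        // both lambdas have now run sequentially on the CPU
    }
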


    template <class T> class LayoutData;
    class FabArrayBase;

namespace Gpu {

#ifdef AMREX_USE_GPU
    inline constexpr std::size_t numThreadsPerBlockParallelFor () {
        return AMREX_GPU_MAX_THREADS;
    }
#else
    inline constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; }
#endif

    AMREX_GPU_HOST_DEVICE
    inline
    Box getThreadBox (const Box& bx, Long offset) noexcept
    {
        AMREX_IF_ON_DEVICE((
            const auto len = bx.length3d();
            Long k = offset / (len[0]*len[1]);
            Long j = (offset - k*(len[0]*len[1])) / len[0];
            Long i = (offset - k*(len[0]*len[1])) - j*len[0];
            IntVect iv{AMREX_D_DECL(static_cast<int>(i),
                                    static_cast<int>(j),
                                    static_cast<int>(k))};
            iv += bx.smallEnd();
            return (bx & Box(iv,iv,bx.type()));
        ))
        AMREX_IF_ON_HOST((
            amrex::ignore_unused(offset);
            return bx;
        ))
    }

// ************************************************

#ifdef AMREX_USE_GPU
    struct ExecutionConfig {
        ExecutionConfig () noexcept {
            Gpu::Device::grid_stride_threads_and_blocks(numBlocks, numThreads);
        }
        ExecutionConfig (const Box& box) noexcept {
            // If we change this, we must make sure it doesn't break FabArrayUtility Reduce*,
            // which assumes the decomposition is 1D.
            Gpu::Device::n_threads_and_blocks(
                ((box.numPts()+AMREX_GPU_NCELLS_PER_THREAD-1)/AMREX_GPU_NCELLS_PER_THREAD),
                numBlocks, numThreads);
#if 0
            Box b = amrex::surroundingNodes(box);
            b -= box.smallEnd();
            b.coarsen(IntVect(AMREX_D_DECL(1,AMREX_GPU_Y_STRIDE,AMREX_GPU_Z_STRIDE)));
            Gpu::Device::c_threads_and_blocks(b.loVect(), b.hiVect(), numBlocks, numThreads);
#endif
        }
        ExecutionConfig (const Box& box, int comps) noexcept {
            const Box& b = amrex::surroundingNodes(box);
            Gpu::Device::c_comps_threads_and_blocks(b.loVect(), b.hiVect(), comps,
                                                    numBlocks, numThreads);
        }
        ExecutionConfig (Long N) noexcept {
            Gpu::Device::n_threads_and_blocks(N, numBlocks, numThreads);
        }
        ExecutionConfig (dim3 nb, dim3 nt, std::size_t sm=0) noexcept
            : numBlocks(nb), numThreads(nt), sharedMem(sm) {}

        dim3 numBlocks;
        dim3 numThreads;
        std::size_t sharedMem = 0;
    };

    template <int MT>
    ExecutionConfig
    makeExecutionConfig (Long N) noexcept
    {
        ExecutionConfig ec(dim3{}, dim3{});
        Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
        // ensure that blockDim.x*gridDim.x does not overflow
        numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
        // ensure that the maximum grid size of 2^31-1 won't be exceeded
        numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
        ec.numBlocks.x = numBlocks;
        ec.numThreads.x = MT;
        AMREX_ASSERT(MT % Gpu::Device::warp_size == 0);
        return ec;
    }
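
An illustrative sketch of makeExecutionConfig (the helper function below is hypothetical, not part of AMReX): it sizes a 1D launch for N work items with a fixed block size, applying the caps shown above:

    #include <AMReX_GpuLaunch.H>

    #ifdef AMREX_USE_GPU
    // Hypothetical helper: how many blocks a 256-thread-per-block launch
    // covering N items would use (capped as in makeExecutionConfig above).
    unsigned int num_blocks_for (amrex::Long N)
    {
        auto const ec = amrex::Gpu::makeExecutionConfig<256>(N);
        return ec.numBlocks.x;
    }
    #endif
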

    template <int MT>
    ExecutionConfig
    makeExecutionConfig (const Box& box) noexcept
    {
        return makeExecutionConfig<MT>(box.numPts());
    }

    struct ExecConfig
    {
        Long start_idx;
        int nblocks;
    };

    template <int MT>
    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
    {
        // Max # of blocks in a kernel launch
        int numblocks_max = std::numeric_limits<int>::max();
        // Max # of threads in a kernel launch
        Long nmax = Long(MT) * numblocks_max;
        // # of launches needed for N elements without using grid-stride
        // loops inside GPU kernels.
        auto nlaunches = int((N+nmax-1)/nmax);
        Vector<ExecConfig> r(nlaunches);
        Long ndone = 0;
        for (int i = 0; i < nlaunches; ++i) {
            int nblocks;
            if (N > nmax) {
                nblocks = numblocks_max;
                N -= nmax;
            } else {
                nblocks = int((N+MT-1)/MT);
            }
            // At which element ID the kernel should start
            r[i].start_idx = ndone;
            ndone += Long(nblocks) * MT;
            // # of blocks in this launch
            r[i].nblocks = nblocks;
        }
        return r;
    }
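
For very large problems the intended pattern is one kernel launch per ExecConfig, with start_idx giving the first element handled by that launch. A hypothetical sketch of that pattern, assuming a CUDA build (the helper and its names are not part of AMReX):

    #include <AMReX_GpuLaunch.H>

    #ifdef AMREX_USE_CUDA
    void process_in_chunks (amrex::Long N)   // hypothetical helper
    {
        constexpr int MT = 256;
        for (auto const& ec : amrex::Gpu::makeNExecutionConfigs<MT>(N)) {
            AMREX_LAUNCH_KERNEL(MT, ec.nblocks, MT, 0, amrex::Gpu::gpuStream(),
                [=] AMREX_GPU_DEVICE () noexcept
                {
                    auto i = ec.start_idx
                        + amrex::Long(blockIdx.x)*MT + threadIdx.x;
                    if (i < N) {
                        // process element i
                    }
                });
        }
        amrex::Gpu::streamSynchronize();
    }
    #endif
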

    template <int MT, int dim>
    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
    {
        return makeNExecutionConfigs<MT>(box.numPts());
    }
#endif

} // namespace Gpu
} // namespace amrex


#ifdef AMREX_USE_GPU
#include <AMReX_GpuLaunchMacrosG.H>
#include <AMReX_GpuLaunchFunctsG.H>
#else
#include <AMReX_GpuLaunchMacrosC.H>
#include <AMReX_GpuLaunchFunctsC.H>
#include <AMReX_SIMD.H>
#endif

#include <AMReX_CTOParallelForImpl.H>

namespace amrex {

#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)

//! Performance-portable kernel launch function with optional OpenMP threading.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
{
    ParallelFor(n, f);
}

template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    ParallelFor(box, f);
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    ParallelFor(box, ncomp, f);
}

#else /* !defined(AMREX_USE_GPU) && defined(AMREX_USE_OMP) */

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (T n, L const& f) noexcept
{
#pragma omp parallel for
    for (T i = 0; i < n; ++i) {
        f(i);
    }
}

template <typename L>
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    auto lo = amrex::lbound(box);
    auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for
    for (int i = lo.x; i <= hi.x; ++i) {
        f(i,0,0);
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for
    for (int j = lo.y; j <= hi.y; ++j) {
        AMREX_PRAGMA_SIMD
        for (int i = lo.x; i <= hi.x; ++i) {
            f(i,j,0);
        }
    }
#else
#pragma omp parallel for collapse(2)
    for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
            AMREX_PRAGMA_SIMD
            for (int i = lo.x; i <= hi.x; ++i) {
                f(i,j,k);
            }
        }
    }
#endif
}

template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    auto lo = amrex::lbound(box);
    auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int i = lo.x; i <= hi.x; ++i) {
            f(i,0,0,n);
        }
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int j = lo.y; j <= hi.y; ++j) {
            AMREX_PRAGMA_SIMD
            for (int i = lo.x; i <= hi.x; ++i) {
                f(i,j,0,n);
            }
        }
    }
#else
#pragma omp parallel for collapse(3)
    for (T n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
                AMREX_PRAGMA_SIMD
                for (int i = lo.x; i <= hi.x; ++i) {
                    f(i,j,k,n);
                }
            }
        }
    }
#endif
}

#endif

} // namespace amrex

#endif
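
As an illustration of the ParallelForOMP interface above (the helper function below is hypothetical, and in GPU builds it assumes the FArrayBox data is device-accessible): in GPU builds the call forwards to amrex::ParallelFor, while in OpenMP-only CPU builds it runs the threaded loops shown above.

    #include <AMReX_GpuLaunch.H>
    #include <AMReX_FArrayBox.H>

    // Hypothetical helper: scale every component of fab over the box bx.
    void scale_fab (amrex::Box const& bx, amrex::FArrayBox& fab, amrex::Real s)
    {
        auto const& a = fab.array();
        amrex::ParallelForOMP(bx, fab.nComp(),
            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
            {
                a(i,j,k,n) *= s;
            });
    }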
Definition AMReX_GpuLaunch.H:145