Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuLaunch.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_LAUNCH_H_
2#define AMREX_GPU_LAUNCH_H_
3#include <AMReX_Config.H>
4
7#include <AMReX_GpuControl.H>
8#include <AMReX_GpuTypes.H>
9#include <AMReX_GpuError.H>
10#include <AMReX_GpuRange.H>
11#include <AMReX_GpuDevice.H>
12#include <AMReX_GpuMemory.H>
13#include <AMReX_GpuReduce.H>
14#include <AMReX_Arena.H>
15#include <AMReX_Tuple.H>
16#include <AMReX_Box.H>
17#include <AMReX_Loop.H>
18#include <AMReX_Extension.H>
19#include <AMReX_BLassert.H>
20#include <AMReX_TypeTraits.H>
22#include <AMReX_RandomEngine.H>
23#include <AMReX_Algorithm.H>
24#include <AMReX_Math.H>
25#include <AMReX_Vector.H>
26#include <cstddef>
27#include <limits>
28#include <algorithm>
29#include <utility>
30
31#define AMREX_GPU_NCELLS_PER_THREAD 3
32#define AMREX_GPU_Y_STRIDE 1
33#define AMREX_GPU_Z_STRIDE 1
34
#ifdef AMREX_USE_CUDA
// CUDA: launch amrex::launch_global with the triple-chevron syntax.
// MT is forwarded as launch_global's first template argument (a
// max-threads-per-block bound).  NOTE(review): the launch-bounds overload of
// launch_global taking an int template parameter is not visible in this
// listing -- confirm against the repository source.
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
 amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
// Same, but without the explicit max-threads template argument.
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
 amrex::launch_global <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
#elif defined(AMREX_USE_HIP)
// HIP: equivalent launches via hipLaunchKernelGGL.
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
 hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
 hipLaunchKernelGGL(launch_global , blocks, threads, sharedMem, stream, __VA_ARGS__)
#endif
46
47
48namespace amrex {
49
50// We cannot take rvalue lambdas.
51// ************************************************
52// Variadic lambda function wrappers for C++ CUDA/HIP Kernel calls.
53
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    // Kernel entry point: run the first functor, then the rest via
    // call_device.  Functors are taken by value because kernel arguments
    // must be copied from host to device.
    template<class L, class... Lambdas>
    AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }

    // NOTE(review): a zero-argument call_device overload appears to have been
    // elided from this listing (Doxygen lines 57-58); it is needed when
    // launch_global is instantiated with a single functor -- confirm against
    // the repository source.

    // Base case: invoke a single functor on the device.
    template<class L>
    AMREX_GPU_DEVICE void call_device (L&& f0) noexcept { f0(); }

    // Recursive case: invoke each functor in order, left to right.
    template<class L, class... Lambdas>
    AMREX_GPU_DEVICE void call_device (L&& f0, Lambdas&&... fs) noexcept {
        f0();
        call_device(std::forward<Lambdas>(fs)...);
    }
#endif
69
70// CPU variation
71
72 template<class L>
73 void launch_host (L&& f0) noexcept { std::forward<L>(f0)(); }
74
75 template<class L, class... Lambdas>
76 void launch_host (L&& f0, Lambdas&&... fs) noexcept {
77 std::forward<L>(f0)();
78 launch_host(std::forward<Lambdas>(fs)...);
79 }
80
81
82 template <class T> class LayoutData;
83 class FabArrayBase;
84
85namespace Gpu {
86
#ifdef AMREX_USE_GPU
    // Threads per block used by the ParallelFor family of GPU launches.
    // AMREX_GPU_MAX_THREADS is a build-time configuration macro.
    constexpr std::size_t numThreadsPerBlockParallelFor () {
        return AMREX_GPU_MAX_THREADS;
    }
#else
    // CPU-only build: no GPU blocks exist, so report 0.
    constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; }
#endif
94
96 inline
97 Box getThreadBox (const Box& bx, Long offset) noexcept
98 {
100 const auto len = bx.length3d();
101 Long k = offset / (len[0]*len[1]);
102 Long j = (offset - k*(len[0]*len[1])) / len[0];
103 Long i = (offset - k*(len[0]*len[1])) - j*len[0];
104 IntVect iv{AMREX_D_DECL(static_cast<int>(i),
105 static_cast<int>(j),
106 static_cast<int>(k))};
107 iv += bx.smallEnd();
108 return (bx & Box(iv,iv,bx.type()));
109 ))
112 return bx;
113 ))
114 }
115
116// ************************************************
117
118#ifdef AMREX_USE_GPU
    // ExecutionConfig: kernel launch geometry (grid size, block size, dynamic
    // shared memory).  NOTE(review): the Doxygen extraction elided several
    // lines of this struct -- its opening `struct ExecutionConfig {`, the
    // default constructor, the bodies of the Box constructors (which per the
    // member index call Gpu::Device::*_threads_and_blocks helpers), and the
    // numBlocks / numThreads data members.  Confirm against the repository
    // source before editing.
    ExecutionConfig (const Box& box) noexcept {
        // If we change this, we must make sure it doesn't break FabArrayUtility Reduce*,
        // which assumes the decomposition is 1D.
#if 0
        // Dead code, disabled via #if 0 (surrounding lines were elided).
        b -= box.smallEnd();
#endif
    }
    ExecutionConfig (const Box& box, int comps) noexcept {
        const Box& b = amrex::surroundingNodes(box);
    }
    // Directly specify blocks, threads and shared-memory bytes.
    ExecutionConfig (dim3 nb, dim3 nt, std::size_t sm=0) noexcept
        : numBlocks(nb), numThreads(nt), sharedMem(sm) {}

    std::size_t sharedMem = 0;   // dynamic shared memory per block, in bytes
    };
148
149 template <int MT>
152 {
153 ExecutionConfig ec(dim3{}, dim3{});
154 Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
155 // ensure that blockDim.x*gridDim.x does not overflow
156 numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
157 // ensure that the maximum grid size of 2^31-1 won't be exceeded
158 numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
159 ec.numBlocks.x = numBlocks;
160 ec.numThreads.x = MT;
162 return ec;
163 }
164
165 template <int MT>
166 ExecutionConfig
167 makeExecutionConfig (const Box& box) noexcept
168 {
169 return makeExecutionConfig<MT>(box.numPts());
170 }
171
173 {
176 };
177
178 template <int MT>
180 {
181 // Max # of blocks in a kernel launch
182 int numblocks_max = std::numeric_limits<int>::max();
183 // Max # of threads in a kernel launch
184 Long nmax = Long(MT) * numblocks_max;
185 // # of launches needed for N elements without using grid-stride
186 // loops inside GPU kernels.
187 auto nlaunches = int((N+nmax-1)/nmax);
188 Vector<ExecConfig> r(nlaunches);
189 Long ndone = 0;
190 for (int i = 0; i < nlaunches; ++i) {
191 int nblocks;
192 if (N > nmax) {
193 nblocks = numblocks_max;
194 N -= nmax;
195 } else {
196 nblocks = int((N+MT-1)/MT);
197 }
198 // At which element ID the kernel should start
199 r[i].start_idx = ndone;
200 ndone += Long(nblocks) * MT;
201 // # of blocks in this launch
202 r[i].nblocks = nblocks;
203 }
204 return r;
205 }
206
    // Box overload: decompose a launch over all points of the box.
    // NOTE(review): the signature line (Doxygen line 208) was elided by the
    // extraction; judging by the <int MT, int dim> parameters it takes a
    // dim-dimensional box argument named `box` -- confirm against the
    // repository source.
    template <int MT, int dim>
    {
        return makeNExecutionConfigs<MT>(box.numPts());
    }
212#endif
213
214}
215}
216
217#ifdef AMREX_USE_SYCL
219namespace amrex::detail {
220
221 template <typename... L> constexpr bool is_big_kernel () {
222 return (sizeof(L) + ... + 0) > 1792;
223 }
224
225 template <typename... L>
226 struct SyclKernelDevPtr
227 {
228 using Ls = GpuTuple<L...>;
229 Ls* m_dp = nullptr;
230 Ls* m_hp = nullptr;
231 gpuStream_t m_stream;
232
233 SyclKernelDevPtr (L const&... f, gpuStream_t const& stream)
234 : m_stream(stream)
235 {
236 if constexpr (is_big_kernel<L...>()) {
237 std::size_t sz = sizeof(Ls);
238 m_dp = (Ls*)The_Arena()->alloc(sz);
239 m_hp = (Ls*)The_Pinned_Arena()->alloc(sz);
240 new (m_hp) Ls(f...);
241 auto* l_hp = (void const*)m_hp;
242 auto* l_dp = (void*)m_dp;
243 stream.queue->submit([&] (sycl::handler& h) {
244 h.memcpy(l_dp, l_hp, sz);
245 });
246 } else {
248 }
249 }
250
251 ~SyclKernelDevPtr ()
252 {
253 if constexpr (is_big_kernel<Ls>()) {
254 try {
255 m_stream.queue->wait_and_throw();
256 } catch (sycl::exception const& ex) {
257 if (m_dp) {
258 The_Arena()->free((void*)m_dp);
259 m_dp = nullptr;
260 }
261 if (m_hp) {
262 m_hp->~Ls();
263 The_Pinned_Arena()->free((void*)m_hp);
264 m_hp = nullptr;
265 }
266 amrex::Abort(std::string("~SyclKernelDevPtr: ")+ex.what());
267 }
268 if (m_dp) {
269 The_Arena()->free((void*)m_dp);
270 m_dp = nullptr;
271 }
272 if (m_hp) {
273 m_hp->~Ls();
274 The_Pinned_Arena()->free((void*)m_hp);
275 m_hp = nullptr;
276 }
277 }
278 }
279
280 template <int N>
281 auto get () const
282 {
283 using pointer_t = std::add_pointer_t<
284 std::add_const_t<typename std::tuple_element<N, Ls>::type>>;
285 if (m_hp && m_dp) {
286 std::ptrdiff_t offset = (char*)(&(amrex::get<N>(*m_hp))) - (char*)m_hp;
287 return pointer_t((char const*)m_dp + offset);
288 } else {
289 return pointer_t(nullptr);
290 }
291 }
292 };
293
294}
296#endif
297
298
299#ifdef AMREX_USE_GPU
302#else
305#include <AMReX_SIMD.H>
306#endif
308
310
312
313namespace amrex {
314
315#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)
316
// Performance-portable launch: in GPU builds (or CPU builds without OpenMP)
// this simply forwards to the regular ParallelFor.
// M is a SFINAE guard restricting T to integral types.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
{
    ParallelFor(n, f);
}
330
// Box overload: forward to ParallelFor, which performs the (i,j,k) loop
// over the cells of box.
template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    ParallelFor(box, f);
}
344
// Box-plus-components overload: forward to ParallelFor, which loops over
// all cells of box for each of the ncomp components.
// M is a SFINAE guard restricting T to integral types.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    ParallelFor(box, ncomp, f);
}
358
359#else /* !defined(AMREX_USE_GPU) && defined(AMREX_USE_OMP) */
360
// OpenMP-threaded 1D loop: calls f(idx) for every idx in [0, n).
// M is a SFINAE guard restricting T to integral types.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
{
#pragma omp parallel for
    for (T idx = 0; idx < n; ++idx) {
        f(idx);
    }
}
370
// OpenMP-threaded loop over all cells of box, calling f(i,j,k).  The
// outer dimension(s) are distributed across OpenMP threads.
// NOTE(review): the Doxygen extraction elided one line before each
// innermost i-loop (likely AMREX_PRAGMA_SIMD, which this file's member
// index references) -- confirm against the repository source.
template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    auto lo = amrex::lbound(box);
    auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for
    for (int i = lo.x; i <= hi.x; ++i) {
        f(i,0,0);
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for
    for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            f(i,j,0);
        }
    }
#else
// collapse(2): spread the k*j iteration space across threads for better
// load balance on thin boxes.
#pragma omp parallel for collapse(2)
    for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                f(i,j,k);
            }
        }
    }
#endif
}
402
// OpenMP-threaded loop over all cells of box for each component n in
// [0, ncomp), calling f(i,j,k,n).  The component loop is collapsed with
// the outer spatial loop(s) for parallelism.
// M is a SFINAE guard restricting T to integral types.
// NOTE(review): the Doxygen extraction elided one line before each
// innermost i-loop (likely AMREX_PRAGMA_SIMD, which this file's member
// index references) -- confirm against the repository source.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    auto lo = amrex::lbound(box);
    auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int i = lo.x; i <= hi.x; ++i) {
            f(i,0,0,n);
        }
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                f(i,j,0,n);
            }
        }
    }
#else
#pragma omp parallel for collapse(3)
    for (T n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    f(i,j,k,n);
                }
            }
        }
    }
#endif
}
440
441#endif
442
443}
444
445#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_ATTRIBUTE_FLATTEN_FOR
Definition AMReX_Extension.H:151
#define AMREX_GPU_Z_STRIDE
Definition AMReX_GpuLaunch.H:33
#define AMREX_GPU_NCELLS_PER_THREAD
Definition AMReX_GpuLaunch.H:31
#define AMREX_GPU_Y_STRIDE
Definition AMReX_GpuLaunch.H:32
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_GPU_GLOBAL
Definition AMReX_GpuQualifiers.H:19
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ const int * hiVect() const &noexcept
Return a constant pointer to the array of high end coordinates. Useful for calls to FORTRAN.
Definition AMReX_Box.H:191
__host__ __device__ const int * loVect() const &noexcept
Return a constant pointer to the array of low end coordinates. Useful for calls to FORTRAN.
Definition AMReX_Box.H:186
__host__ __device__ BoxND & coarsen(int ref_ratio) noexcept
Coarsen BoxND by given (positive) refinement ratio. NOTE: if type(dir) = CELL centered: lo <- lo/rati...
Definition AMReX_Box.H:722
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:111
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1205
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1190
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1197
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1263
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:326
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1331
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
__host__ __device__ Box getThreadBox(const Box &bx, Long offset) noexcept
Definition AMReX_GpuLaunch.H:97
Vector< ExecConfig > makeNExecutionConfigs(Long N) noexcept
Definition AMReX_GpuLaunch.H:179
constexpr std::size_t numThreadsPerBlockParallelFor()
Definition AMReX_GpuLaunch.H:88
ExecutionConfig makeExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:151
Definition AMReX_Amr.cpp:49
void launch_host(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:73
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ BoxND< dim > surroundingNodes(const BoxND< dim > &b, int dir) noexcept
Return a BoxND with NODE based coordinates in direction dir that encloses BoxND b....
Definition AMReX_Box.H:1522
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
__global__ void launch_global(L f0, Lambdas... fs)
Definition AMReX_GpuLaunch.H:56
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1335
Definition AMReX_GpuLaunch.H:173
int nblocks
Definition AMReX_GpuLaunch.H:175
Long start_idx
Definition AMReX_GpuLaunch.H:174
Definition AMReX_GpuLaunch.H:119
ExecutionConfig(dim3 nb, dim3 nt, std::size_t sm=0) noexcept
Definition AMReX_GpuLaunch.H:141
ExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:138
dim3 numBlocks
Definition AMReX_GpuLaunch.H:144
dim3 numThreads
Definition AMReX_GpuLaunch.H:145
ExecutionConfig(const Box &box, int comps) noexcept
Definition AMReX_GpuLaunch.H:134
ExecutionConfig(const Box &box) noexcept
Definition AMReX_GpuLaunch.H:123
ExecutionConfig() noexcept
Definition AMReX_GpuLaunch.H:120
std::size_t sharedMem
Definition AMReX_GpuLaunch.H:146