Block-Structured AMR Software Framework
 
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Loading...
Searching...
No Matches
AMReX_GpuLaunch.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_LAUNCH_H_
2#define AMREX_GPU_LAUNCH_H_
3#include <AMReX_Config.H>
4
7#include <AMReX_GpuControl.H>
8#include <AMReX_GpuTypes.H>
9#include <AMReX_GpuError.H>
10#include <AMReX_GpuRange.H>
11#include <AMReX_GpuDevice.H>
12#include <AMReX_GpuMemory.H>
13#include <AMReX_GpuReduce.H>
14#include <AMReX_Tuple.H>
15#include <AMReX_Box.H>
16#include <AMReX_Loop.H>
17#include <AMReX_Extension.H>
18#include <AMReX_BLassert.H>
19#include <AMReX_TypeTraits.H>
21#include <AMReX_RandomEngine.H>
22#include <AMReX_Algorithm.H>
23#include <AMReX_Math.H>
24#include <AMReX_Vector.H>
25#include <cstddef>
26#include <limits>
27#include <algorithm>
28#include <utility>
29
30#define AMREX_GPU_NCELLS_PER_THREAD 3
31#define AMREX_GPU_Y_STRIDE 1
32#define AMREX_GPU_Z_STRIDE 1
33
34#ifdef AMREX_USE_CUDA
35# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
36 amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
37# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
38 amrex::launch_global <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
39#elif defined(AMREX_USE_HIP)
40# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
41 hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
42# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
43 hipLaunchKernelGGL(launch_global , blocks, threads, sharedMem, stream, __VA_ARGS__)
44#endif
45
46
47namespace amrex {
48
49// We cannot take rvalue lambdas.
50// ************************************************
51// Variadic lambda function wrappers for C++ CUDA/HIP Kernel calls.
52
53#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
54 template<class L, class... Lambdas>
55 AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }
56
57 template<class L>
58 AMREX_GPU_DEVICE void call_device (L&& f0) noexcept { f0(); }
59
60 template<class L, class... Lambdas>
61 AMREX_GPU_DEVICE void call_device (L&& f0, Lambdas&&... fs) noexcept {
62 f0();
63 call_device(std::forward<Lambdas>(fs)...);
64 }
65#endif
66
67// CPU variation
68
69 template<class L>
70 void launch_host (L&& f0) noexcept { std::forward<L>(f0)(); }
71
72 template<class L, class... Lambdas>
73 void launch_host (L&& f0, Lambdas&&... fs) noexcept {
74 std::forward<L>(f0)();
75 launch_host(std::forward<Lambdas>(fs)...);
76 }
77
78
79 template <class T> class LayoutData;
80 class FabArrayBase;
81
82namespace Gpu {
83
84#ifdef AMREX_USE_GPU
85 inline constexpr std::size_t numThreadsPerBlockParallelFor () {
86 return AMREX_GPU_MAX_THREADS;
87 }
88#else
89 inline constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; }
90#endif
91
92// ************************************************
93
94 struct ComponentBox {
96 int ic;
97 int nc;
98 };
99
105
106// ************************************************
107
109 inline
110 Box getThreadBox (const Box& bx, Long offset) noexcept
111 {
113 const auto len = bx.length3d();
114 Long k = offset / (len[0]*len[1]);
115 Long j = (offset - k*(len[0]*len[1])) / len[0];
116 Long i = (offset - k*(len[0]*len[1])) - j*len[0];
117 IntVect iv{AMREX_D_DECL(static_cast<int>(i),
118 static_cast<int>(j),
119 static_cast<int>(k))};
120 iv += bx.smallEnd();
121 return (bx & Box(iv,iv,bx.type()));
122 ))
125 return bx;
126 ))
127 }
128
129// ************************************************
130
131#ifdef AMREX_USE_GPU
136 ExecutionConfig (const Box& box) noexcept {
137 // If we change this, we must make sure it doesn't break FabArrayUtility Reduce*,
138 // which assumes the decomposition is 1D.
140#if 0
142 b -= box.smallEnd();
145#endif
146 }
147 ExecutionConfig (const Box& box, int comps) noexcept {
148 const Box& b = amrex::surroundingNodes(box);
150 }
154 ExecutionConfig (dim3 nb, dim3 nt, std::size_t sm=0) noexcept
155 : numBlocks(nb), numThreads(nt), sharedMem(sm) {}
156
159 std::size_t sharedMem = 0;
160 };
161
162 template <int MT>
164 makeExecutionConfig (Long N) noexcept
165 {
166 ExecutionConfig ec(dim3{}, dim3{});
167 Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
168 // ensure that blockDim.x*gridDim.x does not overflow
169 numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
170 // ensure that the maximum grid size of 2^31-1 won't be exceeded
171 numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
172 ec.numBlocks.x = numBlocks;
173 ec.numThreads.x = MT;
175 return ec;
176 }
177
178 template <int MT>
179 ExecutionConfig
180 makeExecutionConfig (const Box& box) noexcept
181 {
182 return makeExecutionConfig<MT>(box.numPts());
183 }
184
186 {
189 };
190
191 template <int MT>
193 {
194 // Max # of blocks in a kernel launch
195 int numblocks_max = std::numeric_limits<int>::max();
196 // Max # of threads in a kernel launch
197 Long nmax = Long(MT) * numblocks_max;
198 // # of launches needed for N elements without using grid-stride
199 // loops inside GPU kernels.
200 auto nlaunches = int((N+nmax-1)/nmax);
201 Vector<ExecConfig> r(nlaunches);
202 Long ndone = 0;
203 for (int i = 0; i < nlaunches; ++i) {
204 int nblocks;
205 if (N > nmax) {
206 nblocks = numblocks_max;
207 N -= nmax;
208 } else {
209 nblocks = int((N+MT-1)/MT);
210 }
211 // At which element ID the kernel should start
212 r[i].start_idx = ndone;
213 ndone += Long(nblocks) * MT;
214 // # of blocks in this launch
215 r[i].nblocks = nblocks;
216 }
217 return r;
218 }
219
220 template <int MT, int dim>
222 {
223 return makeNExecutionConfigs<MT>(box.numPts());
224 }
225#endif
226
227}
228}
229
230
231#ifdef AMREX_USE_GPU
234#else
237#endif
238
240
242
243#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_GPU_Z_STRIDE
Definition AMReX_GpuLaunch.H:32
#define AMREX_GPU_NCELLS_PER_THREAD
Definition AMReX_GpuLaunch.H:30
#define AMREX_GPU_Y_STRIDE
Definition AMReX_GpuLaunch.H:31
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_GPU_GLOBAL
Definition AMReX_GpuQualifiers.H:19
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:899
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:884
static AMREX_EXPORT constexpr int warp_size
Definition AMReX_GpuDevice.H:173
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:891
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:957
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:27
Vector< ExecConfig > makeNExecutionConfigs(Long N) noexcept
Definition AMReX_GpuLaunch.H:192
constexpr std::size_t numThreadsPerBlockParallelFor()
Definition AMReX_GpuLaunch.H:85
ExecutionConfig makeExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:164
AMREX_GPU_HOST_DEVICE Box getThreadBox(const Box &bx, Long offset) noexcept
Definition AMReX_GpuLaunch.H:110
Definition AMReX_Amr.cpp:49
void launch_host(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:70
BoxND< AMREX_SPACEDIM > Box
Definition AMReX_BaseFwd.H:27
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE BoxND< dim > surroundingNodes(const BoxND< dim > &b, int dir) noexcept
Returns a BoxND with NODE based coordinates in direction dir that encloses BoxND b....
Definition AMReX_Box.H:1399
IntVectND< AMREX_SPACEDIM > IntVect
Definition AMReX_BaseFwd.H:30
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:127
const int[]
Definition AMReX_BLProfiler.cpp:1664
AMREX_GPU_DEVICE void call_device(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:58
AMREX_GPU_GLOBAL void launch_global(L f0, Lambdas... fs)
Definition AMReX_GpuLaunch.H:55
Definition AMReX_GpuLaunch.H:94
Box box
Definition AMReX_GpuLaunch.H:95
int ic
Definition AMReX_GpuLaunch.H:96
int nc
Definition AMReX_GpuLaunch.H:97
Definition AMReX_GpuLaunch.H:186
int nblocks
Definition AMReX_GpuLaunch.H:188
Long start_idx
Definition AMReX_GpuLaunch.H:187
Definition AMReX_GpuLaunch.H:132
ExecutionConfig(dim3 nb, dim3 nt, std::size_t sm=0) noexcept
Definition AMReX_GpuLaunch.H:154
ExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:151
dim3 numBlocks
Definition AMReX_GpuLaunch.H:157
dim3 numThreads
Definition AMReX_GpuLaunch.H:158
ExecutionConfig(const Box &box, int comps) noexcept
Definition AMReX_GpuLaunch.H:147
ExecutionConfig(const Box &box) noexcept
Definition AMReX_GpuLaunch.H:136
ExecutionConfig() noexcept
Definition AMReX_GpuLaunch.H:133
std::size_t sharedMem
Definition AMReX_GpuLaunch.H:159
Definition AMReX_GpuLaunch.H:100
int globalBlockId
Definition AMReX_GpuLaunch.H:103
int numBlocks
Definition AMReX_GpuLaunch.H:101
int numThreads
Definition AMReX_GpuLaunch.H:102