AMReX_GpuLaunch.H
#ifndef AMREX_GPU_LAUNCH_H_
#define AMREX_GPU_LAUNCH_H_
#include <AMReX_Config.H>

#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuKernelInfo.H>
#include <AMReX_GpuControl.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuError.H>
#include <AMReX_GpuRange.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuMemory.H>
#include <AMReX_GpuReduce.H>
#include <AMReX_Tuple.H>
#include <AMReX_Box.H>
#include <AMReX_Loop.H>
#include <AMReX_Extension.H>
#include <AMReX_BLassert.H>
#include <AMReX_TypeTraits.H>
#include <AMReX_GpuLaunchGlobal.H>
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <AMReX_Vector.H>
#include <cstddef>
#include <limits>
#include <algorithm>
#include <utility>

#define AMREX_GPU_NCELLS_PER_THREAD 3
#define AMREX_GPU_Y_STRIDE 1
#define AMREX_GPU_Z_STRIDE 1

#ifdef AMREX_USE_CUDA
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ...) \
        amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
#elif defined(AMREX_USE_HIP)
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ...) \
        hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
#endif

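// Example (illustrative sketch, not part of this header): on a CUDA build,
// assuming a block count `nblocks` computed elsewhere, the launch
//
//     AMREX_LAUNCH_KERNEL(256, nblocks, 256, 0, Gpu::gpuStream(),
//                         [=] AMREX_GPU_DEVICE () { /* kernel body */ });
//
// expands, per the macro above, to the triple-chevron form
//
//     amrex::launch_global<256><<<nblocks, 256, 0, Gpu::gpuStream()>>>(
//         [=] AMREX_GPU_DEVICE () { /* kernel body */ });
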
namespace amrex {

// We cannot take rvalue lambdas.
// ************************************************
// Variadic lambda function wrappers for C++ CUDA/HIP kernel calls.

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    template<class L, class... Lambdas>
    AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }

    template<class L>
    AMREX_GPU_DEVICE void call_device (L&& f0) noexcept { f0(); }

    template<class L, class... Lambdas>
    AMREX_GPU_DEVICE void call_device (L&& f0, Lambdas&&... fs) noexcept {
        f0();
        call_device(std::forward<Lambdas>(fs)...);
    }
#endif

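// Example (hedged sketch, not part of this header): launch_global fuses
// several device lambdas into a single kernel launch; every GPU thread runs
// f0() and then each remaining lambda in order via call_device. With
// `nblocks` and `stream` assumed set up elsewhere:
//
//     amrex::launch_global<<<nblocks, 256, 0, stream>>>(
//         [=] AMREX_GPU_DEVICE () { /* first body */ },
//         [=] AMREX_GPU_DEVICE () { /* second body */ });
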
// CPU variation

    template<class L>
    void launch_host (L&& f0) noexcept { std::forward<L>(f0)(); }

    template<class L, class... Lambdas>
    void launch_host (L&& f0, Lambdas&&... fs) noexcept {
        std::forward<L>(f0)();
        launch_host(std::forward<Lambdas>(fs)...);
    }

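// Example (illustrative, not part of this header): on the host path the same
// variadic pattern simply invokes each callable in order on the CPU:
//
//     amrex::launch_host([&] { first(); }, [&] { second(); });
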
    template <class T> class LayoutData;
    class FabArrayBase;

namespace Gpu {

#ifdef AMREX_USE_GPU
    inline constexpr std::size_t numThreadsPerBlockParallelFor () {
        return AMREX_GPU_MAX_THREADS;
    }
#else
    inline constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; }
#endif

// ************************************************

    struct ComponentBox {
        Box box;
        int ic;
        int nc;
    };

    struct GridSize {
        int numBlocks;
        int numThreads;
        int globalBlockId;
    };

// ************************************************

    AMREX_GPU_HOST_DEVICE
    inline
    Box getThreadBox (const Box& bx, Long offset) noexcept
    {
        AMREX_IF_ON_DEVICE((
            const auto len = bx.length3d();
            Long k = offset / (len[0]*len[1]);
            Long j = (offset - k*(len[0]*len[1])) / len[0];
            Long i = (offset - k*(len[0]*len[1])) - j*len[0];
            IntVect iv{AMREX_D_DECL(static_cast<int>(i),
                                    static_cast<int>(j),
                                    static_cast<int>(k))};
            iv += bx.smallEnd();
            return (bx & Box(iv,iv,bx.type()));
        ))
        AMREX_IF_ON_HOST((
            amrex::ignore_unused(offset);
            return bx;
        ))
    }
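
// Example (illustrative): for bx = Box({0,0,0}, {15,15,15}), len = (16,16,16),
// so offset 17 decodes to (i,j,k) = (1,1,0) and the function returns the
// one-cell Box at {1,1,0}. Offsets >= bx.numPts() decode to cells outside bx,
// and the intersection with bx yields an empty Box. On the host, the whole
// box is returned unchanged.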

// ************************************************

#ifdef AMREX_USE_GPU
    struct ExecutionConfig {
        ExecutionConfig () noexcept {
            Gpu::Device::grid_stride_threads_and_blocks(numBlocks, numThreads);
        }
        ExecutionConfig (const Box& box) noexcept {
            // If we change this, we must make sure it doesn't break FabArrayUtility Reduce*,
            // which assumes the decomposition is 1D.
            Gpu::Device::n_threads_and_blocks(
                (box.numPts() + AMREX_GPU_NCELLS_PER_THREAD - 1) / AMREX_GPU_NCELLS_PER_THREAD,
                numBlocks, numThreads);
#if 0
            Box b = amrex::surroundingNodes(box);
            b -= box.smallEnd();
            b.coarsen(IntVect(AMREX_D_DECL(1,AMREX_GPU_Y_STRIDE,AMREX_GPU_Z_STRIDE)));
            Gpu::Device::c_threads_and_blocks(b.loVect(), b.hiVect(), numBlocks, numThreads);
#endif
        }
        ExecutionConfig (const Box& box, int comps) noexcept {
            const Box& b = amrex::surroundingNodes(box);
            Gpu::Device::c_comps_threads_and_blocks(b.loVect(), b.hiVect(), comps, numBlocks, numThreads);
        }
        ExecutionConfig (Long N) noexcept {
            Gpu::Device::n_threads_and_blocks(N, numBlocks, numThreads);
        }
        ExecutionConfig (dim3 nb, dim3 nt, std::size_t sm=0) noexcept
            : numBlocks(nb), numThreads(nt), sharedMem(sm) {}

        dim3 numBlocks;
        dim3 numThreads;
        std::size_t sharedMem = 0;
    };

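// Example (illustrative, not part of this header): ExecutionConfig(N) sizes a
// 1D launch for N elements via Device::n_threads_and_blocks, while the
// default constructor sizes a grid-stride launch:
//
//     Gpu::ExecutionConfig ec(box.numPts());   // roughly one thread per cell
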
    template <int MT>
    ExecutionConfig
    makeExecutionConfig (Long N) noexcept
    {
        ExecutionConfig ec(dim3{}, dim3{});
        Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
        // ensure that blockDim.x*gridDim.x does not overflow
        numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
        // ensure that the maximum grid size of 2^31-1 won't be exceeded
        numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
        ec.numBlocks.x = numBlocks;
        ec.numThreads.x = MT;
        AMREX_ASSERT(MT % Gpu::Device::warp_size == 0);
        return ec;
    }

    template <int MT>
    ExecutionConfig
    makeExecutionConfig (const Box& box) noexcept
    {
        return makeExecutionConfig<MT>(box.numPts());
    }

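// Example (hedged sketch, not part of this header): a typical 1D launch pairs
// makeExecutionConfig with AMREX_LAUNCH_KERNEL; each thread recovers its cell
// from the linear index. The kernel body here is illustrative only:
//
//     const auto ec = Gpu::makeExecutionConfig<AMREX_GPU_MAX_THREADS>(box);
//     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads,
//                         ec.sharedMem, Gpu::gpuStream(),
//                         [=] AMREX_GPU_DEVICE () noexcept {
//                             auto icell = Long(blockDim.x)*blockIdx.x + threadIdx.x;
//                             if (icell < box.numPts()) {
//                                 const Box tbx = Gpu::getThreadBox(box, icell);
//                                 // ... work on the single-cell Box tbx ...
//                             }
//                         });
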
    struct ExecConfig
    {
        Long start_idx;
        int nblocks;
    };

    template <int MT>
    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
    {
        // Max # of blocks in a kernel launch
        int numblocks_max = std::numeric_limits<int>::max();
        // Max # of threads in a kernel launch
        Long nmax = Long(MT) * numblocks_max;
        // # of launches needed for N elements without using grid-stride
        // loops inside GPU kernels.
        auto nlaunches = int((N+nmax-1)/nmax);
        Vector<ExecConfig> r(nlaunches);
        Long ndone = 0;
        for (int i = 0; i < nlaunches; ++i) {
            int nblocks;
            if (N > nmax) {
                nblocks = numblocks_max;
                N -= nmax;
            } else {
                nblocks = int((N+MT-1)/MT);
            }
            // At which element ID the kernel should start
            r[i].start_idx = ndone;
            ndone += Long(nblocks) * MT;
            // # of blocks in this launch
            r[i].nblocks = nblocks;
        }
        return r;
    }

    template <int MT, int dim>
    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
    {
        return makeNExecutionConfigs<MT>(box.numPts());
    }
#endif
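
// Example (hedged sketch, not part of this header): when N may exceed what a
// single launch can cover, iterate over the returned configs; each launch
// covers MT*nblocks elements starting at start_idx. The stream and kernel
// body are assumed for illustration:
//
//     for (const auto& lc : Gpu::makeNExecutionConfigs<256>(N)) {
//         AMREX_LAUNCH_KERNEL(256, lc.nblocks, 256, 0, Gpu::gpuStream(),
//             [=] AMREX_GPU_DEVICE () noexcept {
//                 auto i = lc.start_idx + Long(blockDim.x)*blockIdx.x + threadIdx.x;
//                 if (i < N) { /* work on element i */ }
//             });
//     }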

} // namespace Gpu
} // namespace amrex

#ifdef AMREX_USE_GPU
#include <AMReX_GpuLaunchMacrosG.H>
#include <AMReX_GpuLaunchFunctsG.H>
#else
#include <AMReX_GpuLaunchMacrosC.H>
#include <AMReX_GpuLaunchFunctsC.H>
#endif

#include <AMReX_GpuLaunch.nolint.H>

#endif