Block-Structured AMR Software Framework
AMReX_TagParallelFor.H
Go to the documentation of this file.
1 #ifndef AMREX_TAG_PARALLELFOR_H_
2 #define AMREX_TAG_PARALLELFOR_H_
3 #include <AMReX_Config.H>
4 
5 #include <AMReX_Arena.H>
6 #include <AMReX_Array4.H>
7 #include <AMReX_Box.H>
8 #include <AMReX_GpuLaunch.H>
9 #include <AMReX_Vector.H>
10 #include <limits>
11 #include <utility>
12 
13 namespace amrex {
14 
15 template <class T>
16 struct Array4PairTag {
20 
22  Box const& box () const noexcept { return dbox; }
23 };
24 
25 template <class T0, class T1=T0>
26 struct Array4CopyTag {
30  Dim3 offset; // sbox.smallEnd() - dbox.smallEnd()
31 
33  Box const& box () const noexcept { return dbox; }
34 };
35 
36 template <class T0, class T1=T0>
42  Dim3 offset; // sbox.smallEnd() - dbox.smallEnd()
43 
45  Box const& box () const noexcept { return dbox; }
46 };
47 
48 template <class T>
49 struct Array4Tag {
51 
53  Box box () const noexcept { return Box(dfab); }
54 };
55 
56 template <class T>
57 struct Array4BoxTag {
60 
62  Box const& box () const noexcept { return dbox; }
63 };
64 
65 template <class T>
69  T val;
70 
72  Box const& box () const noexcept { return dbox; }
73 };
74 
75 template <class T>
80 
82  Box const& box() const noexcept { return bx; }
83 };
84 
85 template <class T>
90 
92  Box const& box() const noexcept { return bx; }
93 };
94 
95 template <class T>
96 struct VectorTag {
97  T* p;
98  Long m_size;
99 
101  Long size () const noexcept { return m_size; }
102 };
103 
104 #ifdef AMREX_USE_GPU
105 
106 namespace detail {
107 
108 template <typename T>
109 std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<T>().box())>, Box>::value,
110  Long>
111 get_tag_size (T const& tag) noexcept
112 {
113  AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
114  return static_cast<int>(tag.box().numPts());
115 }
116 
117 template <typename T>
118 std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<T>().size())> >::value,
119  Long>
120 get_tag_size (T const& tag) noexcept
121 {
122  AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
123  return tag.size();
124 }
125 
126 template <typename T, typename F>
128 std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<T>().box())>, Box>::value>
130 #ifdef AMREX_USE_SYCL
131  sycl::nd_item<1> const& item,
132 #endif
133  int icell, T const& tag, F&& f) noexcept
134 {
135  int ncells = tag.box().numPts();
136  const auto len = amrex::length(tag.box());
137  const auto lo = amrex::lbound(tag.box());
138  int k = icell / (len.x*len.y);
139  int j = (icell - k*(len.x*len.y)) / len.x;
140  int i = (icell - k*(len.x*len.y)) - j*len.x;
141  i += lo.x;
142  j += lo.y;
143  k += lo.z;
144 #ifdef AMREX_USE_SYCL
145  f(item, icell, ncells, i, j, k, tag);
146 #else
147  f( icell, ncells, i, j, k, tag);
148 #endif
149 }
150 
151 template <typename T, typename F>
153 std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<T>().size())> >::value>
155 #ifdef AMREX_USE_SYCL
156  sycl::nd_item<1> const& item,
157 #endif
158  int i, T const& tag, F&& f) noexcept
159 {
160  int N = tag.size();
161 #ifdef AMREX_USE_SYCL
162  f(item, i, N, tag);
163 #else
164  f( i, N, tag);
165 #endif
166 }
167 
168 template <class TagType, class F>
169 void
170 ParallelFor_doit (Vector<TagType> const& tags, F && f)
171 {
172  const int ntags = tags.size();
173  if (ntags == 0) { return; }
174 
175  Long l_ntotwarps = 0;
176  int ntotwarps = 0;
177  Vector<int> nwarps;
178  nwarps.reserve(ntags+1);
179  for (int i = 0; i < ntags; ++i)
180  {
181  auto& tag = tags[i];
182  nwarps.push_back(ntotwarps);
184  l_ntotwarps += nw;
185  ntotwarps += static_cast<int>(nw);
186  }
187  nwarps.push_back(ntotwarps);
188 
189  std::size_t sizeof_tags = ntags*sizeof(TagType);
190  std::size_t offset_nwarps = Arena::align(sizeof_tags);
191  std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
192  std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
193 
194  char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
195  char* d_buffer = (char*)The_Arena()->alloc(total_buf_size);
196 
197  std::memcpy(h_buffer, tags.data(), sizeof_tags);
198  std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
199  Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
200 
201  auto d_tags = reinterpret_cast<TagType*>(d_buffer);
202  auto d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
203 
204  constexpr int nthreads = 256;
205  constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
206  int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
207 
208  amrex::ignore_unused(l_ntotwarps);
209  AMREX_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
210 
211  amrex::launch(nblocks, nthreads, Gpu::gpuStream(),
212 #ifdef AMREX_USE_SYCL
213  [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
214  [[sycl::reqd_work_group_size(nthreads)]]
215  [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
216 #else
217  [=] AMREX_GPU_DEVICE () noexcept
218 #endif
219  {
220 #ifdef AMREX_USE_SYCL
221  std::size_t g_tid = item.get_global_id(0);
222 #else
223  auto g_tid = std::size_t(blockDim.x)*blockIdx.x + threadIdx.x;
224 #endif
225  auto g_wid = int(g_tid / Gpu::Device::warp_size);
226  if (g_wid >= ntotwarps) { return; }
227 
228  int tag_id = amrex::bisect(d_nwarps, 0, ntags, g_wid);
229 
230  int b_wid = g_wid - d_nwarps[tag_id]; // b_wid'th warp on this box
231 #ifdef AMREX_USE_SYCL
232  int lane = item.get_local_id(0) % Gpu::Device::warp_size;
233 #else
234  int lane = threadIdx.x % Gpu::Device::warp_size;
235 #endif
236  int icell = b_wid*Gpu::Device::warp_size + lane;
237 
238 #ifdef AMREX_USE_SYCL
239  tagparfor_call_f(item, icell, d_tags[tag_id], f);
240 #else
241  tagparfor_call_f( icell, d_tags[tag_id], f);
242 #endif
243  });
244 
246  The_Pinned_Arena()->free(h_buffer);
247  The_Arena()->free(d_buffer);
248 }
249 
250 }
251 
252 template <class TagType, class F>
253 std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>,
254  Box>::value>
255 ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
256 {
258  [=] AMREX_GPU_DEVICE (
259 #ifdef AMREX_USE_SYCL
260  sycl::nd_item<1> const& /*item*/,
261 #endif
262  int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
263  {
264  if (icell < ncells) {
265  for (int n = 0; n < ncomp; ++n) {
266  f(i,j,k,n,tag);
267  }
268  }
269  });
270 }
271 
272 template <class TagType, class F>
273 std::enable_if_t<std::is_same<std::decay_t<decltype(std::declval<TagType>().box())>, Box>::value>
274 ParallelFor (Vector<TagType> const& tags, F && f)
275 {
277  [=] AMREX_GPU_DEVICE (
278 #ifdef AMREX_USE_SYCL
279  sycl::nd_item<1> const& /*item*/,
280 #endif
281  int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
282  {
283  if (icell < ncells) {
284  f(i,j,k,tag);
285  }
286  });
287 }
288 
289 template <class TagType, class F>
290 std::enable_if_t<std::is_integral<std::decay_t<decltype(std::declval<TagType>().size())> >::value>
291 ParallelFor (Vector<TagType> const& tags, F && f)
292 {
294  [=] AMREX_GPU_DEVICE (
295 #ifdef AMREX_USE_SYCL
296  sycl::nd_item<1> const& /*item*/,
297 #endif
298  int icell, int ncells, TagType const& tag) noexcept
299  {
300  if (icell < ncells) {
301  f(icell,tag);
302  }
303  });
304 }
305 
306 #endif
307 
308 }
309 
310 #endif
#define AMREX_ASSERT(EX)
Definition: AMReX_BLassert.H:38
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:119
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition: AMReX_GpuQualifiers.H:20
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
static std::size_t align(std::size_t sz)
Given a minimum required arena size of sz bytes, this returns the next largest arena size that will a...
Definition: AMReX_Arena.cpp:143
virtual void * alloc(std::size_t sz)=0
static constexpr AMREX_EXPORT int warp_size
Definition: AMReX_GpuDevice.H:173
Encapsulation of the Orientation of the Faces of a Box.
Definition: AMReX_Orientation.H:29
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition: AMReX_Vector.H:27
Long size() const noexcept
Definition: AMReX_Vector.H:50
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void * memcpy(void *dest, const void *src, std::size_t count)
Definition: AMReX_GpuUtility.H:214
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:251
gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:218
static int f(amrex::Real t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:44
@ max
Definition: AMReX_ParallelReduce.H:17
void ParallelFor_doit(Vector< TagType > const &tags, F &&f)
Definition: AMReX_TagParallelFor.H:170
std::enable_if_t< std::is_same< std::decay_t< decltype(std::declval< T >).box())>, Box >::value, Long > get_tag_size(T const &tag) noexcept
Definition: AMReX_TagParallelFor.H:111
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::enable_if_t< std::is_same< std::decay_t< decltype(std::declval< T >).box())>, Box >::value > tagparfor_call_f(int icell, T const &tag, F &&f) noexcept
Definition: AMReX_TagParallelFor.H:129
Definition: AMReX_Amr.cpp:49
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200
BoxND< AMREX_SPACEDIM > Box
Definition: AMReX_BaseFwd.H:27
void launch(T const &n, L &&f) noexcept
Definition: AMReX_GpuLaunchFunctsC.H:120
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T bisect(T lo, T hi, F f, T tol=1e-12, int max_iter=100)
Definition: AMReX_Algorithm.H:105
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 lbound(Array4< T > const &a) noexcept
Definition: AMReX_Array4.H:308
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition: AMReX.H:111
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 length(Array4< T > const &a) noexcept
Definition: AMReX_Array4.H:322
Arena * The_Pinned_Arena()
Definition: AMReX_Arena.cpp:649
const int[]
Definition: AMReX_BLProfiler.cpp:1664
Arena * The_Arena()
Definition: AMReX_Arena.cpp:609
Definition: AMReX_FabArrayCommI.H:841
Definition: AMReX_TagParallelFor.H:86
Box bx
Definition: AMReX_TagParallelFor.H:88
Array4< T > fab
Definition: AMReX_TagParallelFor.H:87
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:92
Dim3 offset
Definition: AMReX_TagParallelFor.H:89
Definition: AMReX_TagParallelFor.H:76
Array4< T > fab
Definition: AMReX_TagParallelFor.H:77
Box bx
Definition: AMReX_TagParallelFor.H:78
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:82
Orientation face
Definition: AMReX_TagParallelFor.H:79
Definition: AMReX_TagParallelFor.H:57
Array4< T > dfab
Definition: AMReX_TagParallelFor.H:58
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:62
Box dbox
Definition: AMReX_TagParallelFor.H:59
Definition: AMReX_TagParallelFor.H:66
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:72
T val
Definition: AMReX_TagParallelFor.H:69
Array4< T > dfab
Definition: AMReX_TagParallelFor.H:67
Box dbox
Definition: AMReX_TagParallelFor.H:68
Definition: AMReX_TagParallelFor.H:26
Array4< T1 const > sfab
Definition: AMReX_TagParallelFor.H:28
Dim3 offset
Definition: AMReX_TagParallelFor.H:30
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:33
Array4< T0 > dfab
Definition: AMReX_TagParallelFor.H:27
Box dbox
Definition: AMReX_TagParallelFor.H:29
Definition: AMReX_TagParallelFor.H:37
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:45
Dim3 offset
Definition: AMReX_TagParallelFor.H:42
Box dbox
Definition: AMReX_TagParallelFor.H:41
Array4< T0 > dfab
Definition: AMReX_TagParallelFor.H:38
Array4< int > mask
Definition: AMReX_TagParallelFor.H:40
Array4< T1 const > sfab
Definition: AMReX_TagParallelFor.H:39
Definition: AMReX_TagParallelFor.H:16
Array4< T > dfab
Definition: AMReX_TagParallelFor.H:17
Box dbox
Definition: AMReX_TagParallelFor.H:19
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition: AMReX_TagParallelFor.H:22
Array4< T const > sfab
Definition: AMReX_TagParallelFor.H:18
Definition: AMReX_TagParallelFor.H:49
Array4< T > dfab
Definition: AMReX_TagParallelFor.H:50
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box box() const noexcept
Definition: AMReX_TagParallelFor.H:53
Definition: AMReX_Array4.H:61
Definition: AMReX_Dim3.H:12
Definition: AMReX_TagParallelFor.H:96
T * p
Definition: AMReX_TagParallelFor.H:97
Long m_size
Definition: AMReX_TagParallelFor.H:98
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size() const noexcept
Definition: AMReX_TagParallelFor.H:101