Block-Structured AMR Software Framework
 
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Loading...
Searching...
No Matches
AMReX_TagParallelFor.H
Go to the documentation of this file.
1#ifndef AMREX_TAG_PARALLELFOR_H_
2#define AMREX_TAG_PARALLELFOR_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Arena.H>
6#include <AMReX_Array4.H>
7#include <AMReX_Box.H>
8#include <AMReX_GpuLaunch.H>
9#include <AMReX_Vector.H>
10#include <limits>
11#include <utility>
12
13namespace amrex {
14
15template <class T>
20
22 Box const& box () const noexcept { return dbox; }
23};
24
25template <class T0, class T1=T0>
30 Dim3 offset; // sbox.smallEnd() - dbox.smallEnd()
31
33 Box const& box () const noexcept { return dbox; }
34};
35
36template <class T0, class T1=T0>
42 Dim3 offset; // sbox.smallEnd() - dbox.smallEnd()
43
45 Box const& box () const noexcept { return dbox; }
46};
47
48template <class T>
49struct Array4Tag {
51
53 Box box () const noexcept { return Box(dfab); }
54};
55
56template <class T>
60
62 Box const& box () const noexcept { return dbox; }
63};
64
65template <class T>
69 T val;
70
72 Box const& box () const noexcept { return dbox; }
73};
74
75template <class T>
80
82 Box const& box() const noexcept { return bx; }
83};
84
85template <class T>
90
92 Box const& box() const noexcept { return bx; }
93};
94
95template <class T>
96struct VectorTag {
97 T* p;
98 Long m_size;
99
101 Long size () const noexcept { return m_size; }
102};
103
104namespace detail {
105
106 template <typename T>
107 std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, Long>
108 get_tag_size (T const& tag) noexcept
109 {
110 AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
111 return static_cast<int>(tag.box().numPts());
112 }
113
114 template <typename T>
115 std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, Long>
116 get_tag_size (T const& tag) noexcept
117 {
118 AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
119 return tag.size();
120 }
121
122 template <typename T>
123 constexpr
124 std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, bool>
125 is_box_tag (T const&) { return true; }
126
127 template <typename T>
128 constexpr
129 std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, bool>
130 is_box_tag (T const&) { return false; }
131
132}
133
134template <class TagType>
135struct TagVector {
136
137 char* h_buffer = nullptr;
138 char* d_buffer = nullptr;
139 TagType* d_tags = nullptr;
140 int* d_nwarps = nullptr;
141 int ntags = 0;
142 int ntotwarps = 0;
143 int nblocks = 0;
144 bool defined = false;
145 static constexpr int nthreads = 256;
146
147 TagVector () = default;
148
150 define(tags);
151 }
152
154 if (defined) {
155 undefine();
156 }
157 }
158
159 TagVector (const TagVector& other) = delete;
160 TagVector& operator= (const TagVector& other) = delete;
161 TagVector (TagVector&& other) noexcept
162 : h_buffer{other.h_buffer},
163 d_buffer{other.d_buffer},
164 d_tags{other.d_tags},
165 d_nwarps{other.d_nwarps},
166 ntags{other.ntags},
167 ntotwarps{other.ntotwarps},
168 nblocks{other.nblocks},
169 defined{other.defined}
170 {
171 other.h_buffer = nullptr;
172 other.d_buffer = nullptr;
173 other.d_tags = nullptr;
174 other.d_nwarps = nullptr;
175 other.ntags = 0;
176 other.ntotwarps = 0;
177 other.nblocks = 0;
178 other.defined = false;
179 }
180 TagVector& operator= (TagVector&& other) noexcept {
181 if (this == &other) {
182 return *this;
183 }
184 undefine();
185 h_buffer = other.h_buffer;
186 other.h_buffer = nullptr;
187 d_buffer = other.d_buffer;
188 other.d_buffer = nullptr;
189 d_tags = other.d_tags;
190 other.d_tags = nullptr;
191 d_nwarps = other.d_nwarps;
192 other.d_nwarps = nullptr;
193 ntags = other.ntags;
194 other.ntags = 0;
195 ntotwarps = other.ntotwarps;
196 other.ntotwarps = 0;
197 nblocks = other.nblocks;
198 other.nblocks = 0;
199 defined = other.defined;
200 other.defined = false;
201 return *this;
202 }
203
204 [[nodiscard]] bool is_defined () const { return defined; }
205
206 void define (Vector<TagType> const& tags) {
207 if (defined) {
208 undefine();
209 }
210
211 ntags = tags.size();
212 if (ntags == 0) {
213 defined = true;
214 return;
215 }
216
217#ifdef AMREX_USE_GPU
218 Long l_ntotwarps = 0;
219 ntotwarps = 0;
220 Vector<int> nwarps;
221 nwarps.reserve(ntags+1);
222 for (int i = 0; i < ntags; ++i)
223 {
224 auto& tag = tags[i];
225 nwarps.push_back(ntotwarps);
226 auto nw = (detail::get_tag_size(tag) + Gpu::Device::warp_size-1) /
228 l_ntotwarps += nw;
229 ntotwarps += static_cast<int>(nw);
230 }
231 nwarps.push_back(ntotwarps);
232
233 std::size_t sizeof_tags = ntags*sizeof(TagType);
234 std::size_t offset_nwarps = Arena::align(sizeof_tags);
235 std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
236 std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
237
238 h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
239 d_buffer = (char*)The_Arena()->alloc(total_buf_size);
240
241 std::memcpy(h_buffer, tags.data(), sizeof_tags);
242 std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
243 Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
244
245 d_tags = reinterpret_cast<TagType*>(d_buffer);
246 d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
247
248 constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
249 nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
250
251 defined = true;
252
253 amrex::ignore_unused(l_ntotwarps);
254 AMREX_ALWAYS_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
255#else
256 std::size_t sizeof_tags = ntags*sizeof(TagType);
257 h_buffer = (char*)The_Pinned_Arena()->alloc(sizeof_tags);
258
259 std::memcpy(h_buffer, tags.data(), sizeof_tags);
260
261 d_tags = reinterpret_cast<TagType*>(h_buffer);
262
263 defined = true;
264#endif
265 }
266
267 void undefine () {
268 if (defined) {
272 h_buffer = nullptr;
273 d_buffer = nullptr;
274 d_tags = nullptr;
275 d_nwarps = nullptr;
276 ntags = 0;
277 ntotwarps = 0;
278 nblocks = 0;
279 defined = false;
280 }
281 }
282};
283
284namespace detail {
285
286#ifdef AMREX_USE_GPU
287
288template <typename T, typename F>
290std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<T>().box())>, Box>, void>
292#ifdef AMREX_USE_SYCL
293 sycl::nd_item<1> const& item,
294#endif
295 int icell, T const& tag, F&& f) noexcept
296{
297 int ncells = tag.box().numPts();
298 const auto len = amrex::length(tag.box());
299 const auto lo = amrex::lbound(tag.box());
300 int k = icell / (len.x*len.y);
301 int j = (icell - k*(len.x*len.y)) / len.x;
302 int i = (icell - k*(len.x*len.y)) - j*len.x;
303 i += lo.x;
304 j += lo.y;
305 k += lo.z;
306#ifdef AMREX_USE_SYCL
307 f(item, icell, ncells, i, j, k, tag);
308#else
309 f( icell, ncells, i, j, k, tag);
310#endif
311}
312
313template <typename T, typename F>
315std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >, void>
317#ifdef AMREX_USE_SYCL
318 sycl::nd_item<1> const& item,
319#endif
320 int i, T const& tag, F&& f) noexcept
321{
322 int N = tag.size();
323#ifdef AMREX_USE_SYCL
324 f(item, i, N, tag);
325#else
326 f( i, N, tag);
327#endif
328}
329
330template <class TagType, class F>
331void
333{
335
336 if (tv.ntags == 0) { return; }
337
338 const auto d_tags = tv.d_tags;
339 const auto d_nwarps = tv.d_nwarps;
340 const auto ntags = tv.ntags;
341 const auto ntotwarps = tv.ntotwarps;
342 constexpr auto nthreads = TagVector<TagType>::nthreads;
343
344 amrex::launch<nthreads>(tv.nblocks, Gpu::gpuStream(),
345#ifdef AMREX_USE_SYCL
346 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
347 [[sycl::reqd_work_group_size(nthreads)]]
348 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
349#else
350 [=] AMREX_GPU_DEVICE () noexcept
351#endif
352 {
353#ifdef AMREX_USE_SYCL
354 std::size_t g_tid = item.get_global_id(0);
355#else
356 auto g_tid = std::size_t(blockDim.x)*blockIdx.x + threadIdx.x;
357#endif
358 auto g_wid = int(g_tid / Gpu::Device::warp_size);
359 if (g_wid >= ntotwarps) { return; }
360
361 int tag_id = amrex::bisect(d_nwarps, 0, ntags, g_wid);
362
363 int b_wid = g_wid - d_nwarps[tag_id]; // b_wid'th warp on this box
364#ifdef AMREX_USE_SYCL
365 int lane = item.get_local_id(0) % Gpu::Device::warp_size;
366#else
367 int lane = threadIdx.x % Gpu::Device::warp_size;
368#endif
369 int icell = b_wid*Gpu::Device::warp_size + lane;
370
371#ifdef AMREX_USE_SYCL
372 tagparfor_call_f(item, icell, d_tags[tag_id], f);
373#else
374 tagparfor_call_f( icell, d_tags[tag_id], f);
375#endif
376 });
377}
378
379#else // ifdef AMREX_USE_GPU
380
381template <class TagType, class F>
382void
383ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
384{
385 // Note: this CPU version may not have optimal performance:
386 // The loop over ncomp is the innermost instead of the outermost
387 // There is no load-balancing or splitting of tags
389
390 if (tv.ntags == 0) { return; }
391
392 const auto d_tags = tv.d_tags;
393 const auto ntags = tv.ntags;
394
395#ifdef AMREX_USE_OMP
396#pragma omp parallel for
397#endif
398 for (int itag = 0; itag < ntags; ++itag) {
399
400 const auto& t = d_tags[itag];
401
402 if constexpr (is_box_tag(t)) {
403 const auto lo = amrex::lbound(t.box());
404 const auto hi = amrex::ubound(t.box());
405
406 for (int k = lo.z; k <= hi.z; ++k) {
407 for (int j = lo.y; j <= hi.y; ++j) {
409 for (int i = lo.x; i <= hi.x; ++i) {
410 f(0, 1, i, j, k, t);
411 }
412 }
413 }
414 } else {
415 const auto size = t.size();
416
418 for (int i = 0; i < size; ++i) {
419 f(i, size, t);
420 }
421 }
422 }
423}
424
425#endif
426
427}
428
429template <class TagType, class F>
430std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>>
431ParallelFor (TagVector<TagType> const& tv, int ncomp, F const& f)
432{
434 [=] AMREX_GPU_DEVICE (
435#ifdef AMREX_USE_SYCL
436 sycl::nd_item<1> const& /*item*/,
437#endif
438 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
439 {
440 if (icell < ncells) {
441 for (int n = 0; n < ncomp; ++n) {
442 f(i,j,k,n,tag);
443 }
444 }
445 });
446}
447
448template <class TagType, class F>
449std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
450ParallelFor (TagVector<TagType> const& tv, F const& f)
451{
453 [=] AMREX_GPU_DEVICE (
454#ifdef AMREX_USE_SYCL
455 sycl::nd_item<1> const& /*item*/,
456#endif
457 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
458 {
459 if (icell < ncells) {
460 f(i,j,k,tag);
461 }
462 });
463}
464
465template <class TagType, class F>
466std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
467ParallelFor (TagVector<TagType> const& tv, F const& f)
468{
470 [=] AMREX_GPU_DEVICE (
471#ifdef AMREX_USE_SYCL
472 sycl::nd_item<1> const& /*item*/,
473#endif
474 int icell, int ncells, TagType const& tag) noexcept
475 {
476 if (icell < ncells) {
477 f(icell,tag);
478 }
479 });
480}
481
482template <class TagType, class F>
483std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>>
484ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
485{
486 TagVector<TagType> tv{tags};
487 ParallelFor(tv, ncomp, std::forward<F>(f));
488}
489
490template <class TagType, class F>
491std::enable_if_t<std::is_same_v<std::decay_t<decltype(std::declval<TagType>().box())>, Box>, void>
492ParallelFor (Vector<TagType> const& tags, F && f)
493{
494 TagVector<TagType> tv{tags};
495 ParallelFor(tv, std::forward<F>(f));
496}
497
498template <class TagType, class F>
499std::enable_if_t<std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >, void>
500ParallelFor (Vector<TagType> const& tags, F && f)
501{
502 TagVector<TagType> tv{tags};
503 ParallelFor(tv, std::forward<F>(f));
504}
505
506}
507
508#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
static std::size_t align(std::size_t sz)
Given a minimum required arena size of sz bytes, this returns the next largest arena size that will a...
Definition AMReX_Arena.cpp:143
AMREX_GPU_HOST_DEVICE IntVectND< dim > size() const noexcept
Return the length of the BoxND.
Definition AMReX_Box.H:139
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
Returns the number of points contained in the BoxND.
Definition AMReX_Box.H:346
static AMREX_EXPORT constexpr int warp_size
Definition AMReX_GpuDevice.H:173
Encapsulation of the Orientation of the Faces of a Box.
Definition AMReX_Orientation.H:29
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:27
Long size() const noexcept
Definition AMReX_Vector.H:50
AMREX_GPU_HOST_DEVICE Long size(T const &b) noexcept
integer version
Definition AMReX_GpuRange.H:26
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:237
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:251
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:218
Long get_tag_size(T const &tag) noexcept
Definition AMReX_TagParallelFor.H:108
void ParallelFor_doit(TagVector< TagType > const &tv, F const &f)
Definition AMReX_TagParallelFor.H:332
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void tagparfor_call_f(int icell, T const &tag, F &&f) noexcept
Definition AMReX_TagParallelFor.H:291
constexpr bool is_box_tag(T const &)
Definition AMReX_TagParallelFor.H:125
Definition AMReX_Amr.cpp:49
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:191
BoxND< AMREX_SPACEDIM > Box
Definition AMReX_BaseFwd.H:27
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 ubound(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:319
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T bisect(T lo, T hi, F f, T tol=1e-12, int max_iter=100)
Definition AMReX_Algorithm.H:105
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 lbound(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:312
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:127
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 length(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:326
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:656
const int[]
Definition AMReX_BLProfiler.cpp:1664
Arena * The_Arena()
Definition AMReX_Arena.cpp:616
Definition AMReX_FabArrayCommI.H:896
Definition AMReX_TagParallelFor.H:86
Box bx
Definition AMReX_TagParallelFor.H:88
Array4< T > fab
Definition AMReX_TagParallelFor.H:87
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:92
Dim3 offset
Definition AMReX_TagParallelFor.H:89
Definition AMReX_TagParallelFor.H:76
Array4< T > fab
Definition AMReX_TagParallelFor.H:77
Box bx
Definition AMReX_TagParallelFor.H:78
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:82
Orientation face
Definition AMReX_TagParallelFor.H:79
Definition AMReX_TagParallelFor.H:57
Array4< T > dfab
Definition AMReX_TagParallelFor.H:58
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:62
Box dbox
Definition AMReX_TagParallelFor.H:59
Definition AMReX_TagParallelFor.H:66
T val
Definition AMReX_TagParallelFor.H:69
Array4< T > dfab
Definition AMReX_TagParallelFor.H:67
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:72
Box dbox
Definition AMReX_TagParallelFor.H:68
Definition AMReX_TagParallelFor.H:26
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:33
Array4< T1 const > sfab
Definition AMReX_TagParallelFor.H:28
Dim3 offset
Definition AMReX_TagParallelFor.H:30
Array4< T0 > dfab
Definition AMReX_TagParallelFor.H:27
Box dbox
Definition AMReX_TagParallelFor.H:29
Definition AMReX_TagParallelFor.H:37
Dim3 offset
Definition AMReX_TagParallelFor.H:42
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:45
Box dbox
Definition AMReX_TagParallelFor.H:41
Array4< T0 > dfab
Definition AMReX_TagParallelFor.H:38
Array4< int > mask
Definition AMReX_TagParallelFor.H:40
Array4< T1 const > sfab
Definition AMReX_TagParallelFor.H:39
Definition AMReX_TagParallelFor.H:16
Array4< T > dfab
Definition AMReX_TagParallelFor.H:17
Box dbox
Definition AMReX_TagParallelFor.H:19
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:22
Array4< T const > sfab
Definition AMReX_TagParallelFor.H:18
Definition AMReX_TagParallelFor.H:49
Array4< T > dfab
Definition AMReX_TagParallelFor.H:50
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box box() const noexcept
Definition AMReX_TagParallelFor.H:53
Definition AMReX_Array4.H:61
Definition AMReX_Dim3.H:12
Definition AMReX_TagParallelFor.H:135
TagVector(TagVector &&other) noexcept
Definition AMReX_TagParallelFor.H:161
~TagVector()
Definition AMReX_TagParallelFor.H:153
char * h_buffer
Definition AMReX_TagParallelFor.H:137
TagVector(const TagVector &other)=delete
TagType * d_tags
Definition AMReX_TagParallelFor.H:139
bool defined
Definition AMReX_TagParallelFor.H:144
int * d_nwarps
Definition AMReX_TagParallelFor.H:140
TagVector(Vector< TagType > const &tags)
Definition AMReX_TagParallelFor.H:149
bool is_defined() const
Definition AMReX_TagParallelFor.H:204
int ntotwarps
Definition AMReX_TagParallelFor.H:142
char * d_buffer
Definition AMReX_TagParallelFor.H:138
int ntags
Definition AMReX_TagParallelFor.H:141
TagVector & operator=(const TagVector &other)=delete
int nblocks
Definition AMReX_TagParallelFor.H:143
static constexpr int nthreads
Definition AMReX_TagParallelFor.H:145
void undefine()
Definition AMReX_TagParallelFor.H:267
TagVector()=default
void define(Vector< TagType > const &tags)
Definition AMReX_TagParallelFor.H:206
Definition AMReX_TagParallelFor.H:96
T * p
Definition AMReX_TagParallelFor.H:97
Long m_size
Definition AMReX_TagParallelFor.H:98
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size() const noexcept
Definition AMReX_TagParallelFor.H:101