Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_TagParallelFor.H
Go to the documentation of this file.
1#ifndef AMREX_TAG_PARALLELFOR_H_
2#define AMREX_TAG_PARALLELFOR_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Arena.H>
6#include <AMReX_Array4.H>
7#include <AMReX_Box.H>
8#include <AMReX_GpuLaunch.H>
9#include <AMReX_Vector.H>
10#include <limits>
11#include <utility>
12
13namespace amrex {
14
15template <class T>
20
22 Box const& box () const noexcept { return dbox; }
23};
24
25template <class T0, class T1=T0>
28 int dindex;
31 Dim3 offset; // sbox.smallEnd() - dbox.smallEnd()
32
34 Box const& box () const noexcept { return dbox; }
35};
36
37template <class T0, class T1=T0>
43 Dim3 offset; // sbox.smallEnd() - dbox.smallEnd()
44
46 Box const& box () const noexcept { return dbox; }
47};
48
49template <class T>
50struct Array4Tag {
52
54 Box box () const noexcept { return Box(dfab); }
55};
56
57template <class T>
61
63 Box const& box () const noexcept { return dbox; }
64};
65
66template <class T>
70 T val;
71
73 Box const& box () const noexcept { return dbox; }
74};
75
76template <class T>
81
83 Box const& box() const noexcept { return bx; }
84};
85
86template <class T>
91
93 Box const& box() const noexcept { return bx; }
94};
95
96template <class T>
97struct VectorTag {
98 T* p;
100
102 Long size () const noexcept { return m_size; }
103};
104
105template <class T>
106struct CommRecvBufTag { // for unpacking recv buffer
108 std::ptrdiff_t poff;
110
112 Box const& box () const noexcept { return bx; }
113};
114
115template <class T>
116struct CommSendBufTag { // for packing send buffer
118 std::ptrdiff_t poff;
120
122 Box const& box () const noexcept { return bx; }
123};
124
126namespace detail {
127
128 template <typename T>
129 requires (std::same_as<std::decay_t<decltype(std::declval<T>().box())>, Box>)
130 Long get_tag_size (T const& tag) noexcept
131 {
132 AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits<int>::max()));
133 return static_cast<int>(tag.box().numPts());
134 }
135
136 template <typename T>
137 requires (std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >)
138 Long get_tag_size (T const& tag) noexcept
139 {
140 AMREX_ASSERT(tag.size() < Long(std::numeric_limits<int>::max()));
141 return tag.size();
142 }
143
144 template <typename T>
145 requires (std::same_as<std::decay_t<decltype(std::declval<T>().box())>, Box>)
146 constexpr bool is_box_tag (T const&) { return true; }
147
148 template <typename T>
149 requires (std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >)
150 constexpr bool is_box_tag (T const&) { return false; }
151
152}
154
155template <class TagType>
156struct TagVector {
157
158 char* h_buffer = nullptr;
159 char* d_buffer = nullptr;
160 TagType* d_tags = nullptr;
161 int* d_nwarps = nullptr;
162 int ntags = 0;
163 int ntotwarps = 0;
164 int nblocks = 0;
165 bool defined = false;
166 static constexpr int nthreads = 256;
167
168 TagVector () = default;
169
171 define(tags);
172 }
173
175 if (defined) {
176 undefine();
177 }
178 }
179
180 TagVector (const TagVector& other) = delete;
181 TagVector& operator= (const TagVector& other) = delete;
182 TagVector (TagVector&& other) noexcept
183 : h_buffer{other.h_buffer},
184 d_buffer{other.d_buffer},
185 d_tags{other.d_tags},
186 d_nwarps{other.d_nwarps},
187 ntags{other.ntags},
188 ntotwarps{other.ntotwarps},
189 nblocks{other.nblocks},
190 defined{other.defined}
191 {
192 other.h_buffer = nullptr;
193 other.d_buffer = nullptr;
194 other.d_tags = nullptr;
195 other.d_nwarps = nullptr;
196 other.ntags = 0;
197 other.ntotwarps = 0;
198 other.nblocks = 0;
199 other.defined = false;
200 }
201 TagVector& operator= (TagVector&& other) noexcept {
202 if (this == &other) {
203 return *this;
204 }
205 undefine();
206 h_buffer = other.h_buffer;
207 other.h_buffer = nullptr;
208 d_buffer = other.d_buffer;
209 other.d_buffer = nullptr;
210 d_tags = other.d_tags;
211 other.d_tags = nullptr;
212 d_nwarps = other.d_nwarps;
213 other.d_nwarps = nullptr;
214 ntags = other.ntags;
215 other.ntags = 0;
216 ntotwarps = other.ntotwarps;
217 other.ntotwarps = 0;
218 nblocks = other.nblocks;
219 other.nblocks = 0;
220 defined = other.defined;
221 other.defined = false;
222 return *this;
223 }
224
225 [[nodiscard]] bool is_defined () const { return defined; }
226
227 void define (Vector<TagType> const& tags) {
228 if (defined) {
229 undefine();
230 }
231
232 ntags = tags.size();
233 if (ntags == 0) {
234 defined = true;
235 return;
236 }
237
238#ifdef AMREX_USE_GPU
239 Long l_ntotwarps = 0;
240 ntotwarps = 0;
241 Vector<int> nwarps;
242 nwarps.reserve(ntags+1);
243 for (int i = 0; i < ntags; ++i)
244 {
245 auto& tag = tags[i];
246 nwarps.push_back(ntotwarps);
247 auto nw = (detail::get_tag_size(tag) + Gpu::Device::warp_size-1) /
249 l_ntotwarps += nw;
250 ntotwarps += static_cast<int>(nw);
251 }
252 nwarps.push_back(ntotwarps);
253
254 std::size_t sizeof_tags = ntags*sizeof(TagType);
255 std::size_t offset_nwarps = Arena::align(sizeof_tags);
256 std::size_t sizeof_nwarps = (ntags+1)*sizeof(int);
257 std::size_t total_buf_size = offset_nwarps + sizeof_nwarps;
258
259 h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size);
260 d_buffer = (char*)The_Arena()->alloc(total_buf_size);
261
262 std::memcpy(h_buffer, tags.data(), sizeof_tags);
263 std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps);
264 Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size);
265
266 d_tags = reinterpret_cast<TagType*>(d_buffer);
267 d_nwarps = reinterpret_cast<int*>(d_buffer+offset_nwarps);
268
269 constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size;
270 nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block;
271
272 defined = true;
273
274 amrex::ignore_unused(l_ntotwarps);
275 AMREX_ALWAYS_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits<int>::max()));
276#else
277 std::size_t sizeof_tags = ntags*sizeof(TagType);
278 h_buffer = (char*)The_Pinned_Arena()->alloc(sizeof_tags);
279
280 std::memcpy(h_buffer, tags.data(), sizeof_tags);
281
282 d_tags = reinterpret_cast<TagType*>(h_buffer);
283
284 defined = true;
285#endif
286 }
287
288 void undefine () {
289 if (defined) {
293 h_buffer = nullptr;
294 d_buffer = nullptr;
295 d_tags = nullptr;
296 d_nwarps = nullptr;
297 ntags = 0;
298 ntotwarps = 0;
299 nblocks = 0;
300 defined = false;
301 }
302 }
303};
304
306namespace detail {
307
308#ifdef AMREX_USE_GPU
309
310template <typename T, typename F>
311requires (std::same_as<std::decay_t<decltype(std::declval<T>().box())>, Box>)
313void tagparfor_call_f (int icell, T const& tag, F&& f) noexcept
314{
315 int ncells = tag.box().numPts();
316 const auto len = amrex::length(tag.box());
317 const auto lo = amrex::lbound(tag.box());
318 int k = icell / (len.x*len.y);
319 int j = (icell - k*(len.x*len.y)) / len.x;
320 int i = (icell - k*(len.x*len.y)) - j*len.x;
321 i += lo.x;
322 j += lo.y;
323 k += lo.z;
324 f(icell, ncells, i, j, k, tag);
325}
326
327template <typename T, typename F>
328requires (std::is_integral_v<std::decay_t<decltype(std::declval<T>().size())> >)
330void tagparfor_call_f (int i, T const& tag, F&& f) noexcept
331{
332 int N = tag.size();
333 f(i, N, tag);
334}
335
336template <class TagType, class F>
337void
338ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
339{
340 AMREX_ALWAYS_ASSERT(tv.is_defined());
341
342 if (tv.ntags == 0 || tv.ntotwarps == 0) { return; }
343
344 const auto d_tags = tv.d_tags;
345 const auto d_nwarps = tv.d_nwarps;
346 const auto ntags = tv.ntags;
347 const auto ntotwarps = tv.ntotwarps;
348 constexpr auto nthreads = TagVector<TagType>::nthreads;
349
350 amrex::launch<nthreads>(tv.nblocks, Gpu::gpuStream(),
351#ifdef AMREX_USE_SYCL
352 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
353 [[sycl::reqd_work_group_size(nthreads)]]
354 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
355#else
356 [=] AMREX_GPU_DEVICE () noexcept
357#endif
358 {
359#ifdef AMREX_USE_SYCL
360 std::size_t g_tid = item.get_global_id(0);
361#else
362 auto g_tid = std::size_t(blockDim.x)*blockIdx.x + threadIdx.x;
363#endif
364 auto g_wid = int(g_tid / Gpu::Device::warp_size);
365 if (g_wid >= ntotwarps) { return; }
366
367 int tag_id = amrex::bisect(d_nwarps, 0, ntags, g_wid);
368
369 int b_wid = g_wid - d_nwarps[tag_id]; // b_wid'th warp on this box
370#ifdef AMREX_USE_SYCL
371 int lane = item.get_local_id(0) % Gpu::Device::warp_size;
372#else
373 int lane = threadIdx.x % Gpu::Device::warp_size;
374#endif
375 int icell = b_wid*Gpu::Device::warp_size + lane;
376
377 tagparfor_call_f(icell, d_tags[tag_id], f);
378 });
379}
380
381#else // ifdef AMREX_USE_GPU
382
383template <class TagType, class F>
384void
385ParallelFor_doit (TagVector<TagType> const& tv, F const& f)
386{
387 // Note: this CPU version may not have optimal performance:
388 // The loop over ncomp is the innermost instead of the outermost
389 // There is no load-balancing or splitting of tags
390 AMREX_ALWAYS_ASSERT(tv.is_defined());
391
392 constexpr bool tag_type = is_box_tag(TagType{});
393
394 if (tv.ntags == 0) { return; }
395
396 const auto d_tags = tv.d_tags;
397 const auto ntags = tv.ntags;
398
399#ifdef AMREX_USE_OMP
400#pragma omp parallel for
401#endif
402 for (int itag = 0; itag < ntags; ++itag) {
403
404 const auto& t = d_tags[itag];
405
406 if constexpr (tag_type) {
407 const auto lo = amrex::lbound(t.box());
408 const auto hi = amrex::ubound(t.box());
409
410 for (int k = lo.z; k <= hi.z; ++k) {
411 for (int j = lo.y; j <= hi.y; ++j) {
413 for (int i = lo.x; i <= hi.x; ++i) {
414 f(0, 1, i, j, k, t);
415 }
416 }
417 }
418 } else {
419 const auto size = t.size();
420
422 for (int i = 0; i < size; ++i) {
423 f(i, size, t);
424 }
425 }
426 }
427}
428
429#endif
430
431}
433
434template <class TagType, class F>
435requires (std::same_as<std::decay_t<decltype(std::declval<TagType>().box())>, Box>)
436void ParallelFor (TagVector<TagType> const& tv, int ncomp, F const& f)
437{
438 detail::ParallelFor_doit(tv,
439 [=] AMREX_GPU_DEVICE (
440 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
441 {
442 if (icell < ncells) {
443 for (int n = 0; n < ncomp; ++n) {
444 f(i,j,k,n,tag);
445 }
446 }
447 });
448}
449
450template <class TagType, class F>
451requires (std::same_as<std::decay_t<decltype(std::declval<TagType>().box())>, Box>)
452void ParallelFor (TagVector<TagType> const& tv, F const& f)
453{
454 detail::ParallelFor_doit(tv,
455 [=] AMREX_GPU_DEVICE (
456 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
457 {
458 if (icell < ncells) {
459 f(i,j,k,tag);
460 }
461 });
462}
463
464template <class TagType, class F>
465requires (std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >)
466void ParallelFor (TagVector<TagType> const& tv, F const& f)
467{
468 detail::ParallelFor_doit(tv,
469 [=] AMREX_GPU_DEVICE (
470 int icell, int ncells, TagType const& tag) noexcept
471 {
472 if (icell < ncells) {
473 f(icell,tag);
474 }
475 });
476}
477
478template <class TagType, class F>
479requires (std::same_as<std::decay_t<decltype(std::declval<TagType>().box())>, Box>)
480void ParallelFor (Vector<TagType> const& tags, int ncomp, F && f)
481{
482 TagVector<TagType> tv{tags};
483 ParallelFor(tv, ncomp, std::forward<F>(f));
484}
485
486template <class TagType, class F>
487requires (std::same_as<std::decay_t<decltype(std::declval<TagType>().box())>, Box>)
488void ParallelFor (Vector<TagType> const& tags, F && f)
489{
490 TagVector<TagType> tv{tags};
491 ParallelFor(tv, std::forward<F>(f));
492}
493
494template <class TagType, class F>
495requires (std::is_integral_v<std::decay_t<decltype(std::declval<TagType>().size())> >)
496void ParallelFor (Vector<TagType> const& tags, F && f)
497{
498 TagVector<TagType> tv{tags};
499 ParallelFor(tv, std::forward<F>(f));
500}
501
502}
503
504#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
static std::size_t align(std::size_t sz)
Given a minimum required arena size of sz bytes, this returns the next largest arena size that will a...
Definition AMReX_Arena.cpp:167
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
Encapsulation of the Orientation of the Faces of a Box.
Definition AMReX_Orientation.H:29
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:29
Long size() const noexcept
Definition AMReX_Vector.H:54
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1359
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1373
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:421
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
__host__ __device__ T bisect(T lo, T hi, F f, T tol=1e-12, int max_iter=100)
Find a root of a scalar function on a bracketing interval using bisection.
Definition AMReX_Algorithm.H:151
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_TagParallelFor.H:87
Box bx
Definition AMReX_TagParallelFor.H:89
Array4< T > fab
Definition AMReX_TagParallelFor.H:88
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:93
Dim3 offset
Definition AMReX_TagParallelFor.H:90
Definition AMReX_TagParallelFor.H:77
Array4< T > fab
Definition AMReX_TagParallelFor.H:78
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:83
Box bx
Definition AMReX_TagParallelFor.H:79
Orientation face
Definition AMReX_TagParallelFor.H:80
Definition AMReX_TagParallelFor.H:58
Array4< T > dfab
Definition AMReX_TagParallelFor.H:59
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:63
Box dbox
Definition AMReX_TagParallelFor.H:60
Definition AMReX_TagParallelFor.H:67
T val
Definition AMReX_TagParallelFor.H:70
Array4< T > dfab
Definition AMReX_TagParallelFor.H:68
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:73
Box dbox
Definition AMReX_TagParallelFor.H:69
Definition AMReX_TagParallelFor.H:26
Array4< T1 const > sfab
Definition AMReX_TagParallelFor.H:29
Dim3 offset
Definition AMReX_TagParallelFor.H:31
int dindex
Definition AMReX_TagParallelFor.H:28
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:34
Array4< T0 > dfab
Definition AMReX_TagParallelFor.H:27
Box dbox
Definition AMReX_TagParallelFor.H:30
Definition AMReX_TagParallelFor.H:38
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:46
Dim3 offset
Definition AMReX_TagParallelFor.H:43
Box dbox
Definition AMReX_TagParallelFor.H:42
Array4< T0 > dfab
Definition AMReX_TagParallelFor.H:39
Array4< int > mask
Definition AMReX_TagParallelFor.H:41
Array4< T1 const > sfab
Definition AMReX_TagParallelFor.H:40
Definition AMReX_TagParallelFor.H:16
Array4< T > dfab
Definition AMReX_TagParallelFor.H:17
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:22
Box dbox
Definition AMReX_TagParallelFor.H:19
Array4< T const > sfab
Definition AMReX_TagParallelFor.H:18
Definition AMReX_TagParallelFor.H:50
Array4< T > dfab
Definition AMReX_TagParallelFor.H:51
__host__ __device__ Box box() const noexcept
Definition AMReX_TagParallelFor.H:54
A multidimensional array accessor.
Definition AMReX_Array4.H:285
Definition AMReX_TagParallelFor.H:106
Box bx
Definition AMReX_TagParallelFor.H:109
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:112
std::ptrdiff_t poff
Definition AMReX_TagParallelFor.H:108
Array4< T > dfab
Definition AMReX_TagParallelFor.H:107
Definition AMReX_TagParallelFor.H:116
std::ptrdiff_t poff
Definition AMReX_TagParallelFor.H:118
__host__ __device__ Box const & box() const noexcept
Definition AMReX_TagParallelFor.H:122
Array4< T const > sfab
Definition AMReX_TagParallelFor.H:117
Box bx
Definition AMReX_TagParallelFor.H:119
Definition AMReX_Dim3.H:13
Definition AMReX_TagParallelFor.H:156
TagVector(TagVector &&other) noexcept
Definition AMReX_TagParallelFor.H:182
~TagVector()
Definition AMReX_TagParallelFor.H:174
char * h_buffer
Definition AMReX_TagParallelFor.H:158
TagVector(const TagVector &other)=delete
TagType * d_tags
Definition AMReX_TagParallelFor.H:160
bool defined
Definition AMReX_TagParallelFor.H:165
int * d_nwarps
Definition AMReX_TagParallelFor.H:161
TagVector(Vector< TagType > const &tags)
Definition AMReX_TagParallelFor.H:170
bool is_defined() const
Definition AMReX_TagParallelFor.H:225
int ntotwarps
Definition AMReX_TagParallelFor.H:163
char * d_buffer
Definition AMReX_TagParallelFor.H:159
int ntags
Definition AMReX_TagParallelFor.H:162
TagVector & operator=(const TagVector &other)=delete
int nblocks
Definition AMReX_TagParallelFor.H:164
static constexpr int nthreads
Definition AMReX_TagParallelFor.H:166
void undefine()
Definition AMReX_TagParallelFor.H:288
TagVector()=default
void define(Vector< TagType > const &tags)
Definition AMReX_TagParallelFor.H:227
Definition AMReX_TagParallelFor.H:97
T * p
Definition AMReX_TagParallelFor.H:98
Long m_size
Definition AMReX_TagParallelFor.H:99
__host__ __device__ Long size() const noexcept
Definition AMReX_TagParallelFor.H:102