Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_FBI.H
Go to the documentation of this file.
1#ifndef AMREX_FBI_H_
2#define AMREX_FBI_H_
3
4namespace amrex {
5
6template <class FAB>
7struct FabCopyTag {
8 FAB const* sfab;
10 IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
11};
12
14 char const* p;
16};
17
19namespace detail {
20
21#ifdef AMREX_USE_GPU
22
23template <class T0, class T1>
24struct CellStore
25{
27 operator() (T0* d, T1 s) const noexcept
28 {
29 *d = static_cast<T0>(s);
30 }
31};
32
33template <class T0, class T1>
34struct CellAdd
35{
37 operator() (T0* d, T1 s) const noexcept
38 {
39 *d += static_cast<T0>(s);
40 }
41};
42
43template <class T0, class T1>
44struct CellAtomicAdd
45{
46 template<class U0=T0>
49 operator() (U0* d, T1 s) const noexcept
50 {
51 Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
52 }
53};
54
55template <class T0, class T1, class F>
56void
57fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
58 F && f)
59{
60 TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};
61
62 detail::ParallelFor_doit(tv,
64 int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const& tag) noexcept
65 {
66 if (icell < ncells) {
67 for (int n = 0; n < ncomp; ++n) {
68 f(&(tag.dfab(i,j,k,n+dcomp)),
69 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
70 }
71 }
72 });
73}
74
75template <class TagType, class F>
76void
77fab_to_fab_store (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
78{
80 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
81 {
82 int m = Gpu::Atomic::Add(&(tag.mask(i,j,k)), 1);
83 if (m == 0) {
84 for (int n = 0; n < ncomp; ++n) {
85 f(&(tag.dfab(i,j,k,n+dcomp)),
86 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
87 }
88 }
89 });
90}
91
92template <class TagType, class F>
93void
94fab_to_fab_other (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
95{
97 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
98 {
99 int* m = &(tag.mask(i,j,k));
100 bool my_turn = false;
101 do {
102#if defined(AMREX_USE_SYCL)
103 my_turn = (Gpu::Atomic::Exch(m, 1) == 0);
104#else
105 my_turn = (Gpu::Atomic::CAS(m, 0, 1) == 0);
106#endif
107 if (my_turn) {
108#if defined(AMREX_USE_SYCL)
109 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
110#else
111 __threadfence();
112#endif
113 for (int n = 0; n < ncomp; ++n) {
114 f(&(tag.dfab(i,j,k,n+dcomp)),
115 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
116 }
117#if defined(AMREX_USE_SYCL)
118 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
119#else
120 __threadfence(); // It appears that we need this fence too if a GPU is shared.
121#endif
122 Gpu::Atomic::Exch(m, 0);
123 }
124 else {
125#if defined(AMREX_USE_CUDA)
126
127#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
128#if defined(_WIN32)
129 volatile int tmp; // prevent optimization
130 for (int c = 0; c < 2; ++c) {
131 ++tmp;
132 }
133#else
134 for (int c = 0; c < 2; ++c) {
135 __asm__ volatile(""); // prevent optimization
136 }
137#endif
138#else
139 __nanosleep(1);
140#endif
141
142#elif defined(AMREX_USE_HIP)
143
144 __builtin_amdgcn_s_sleep(1);
145
146#elif defined(AMREX_USE_SYCL)
147
148 for (int c = 0; c < 2; ++c) {
149 __asm__ volatile(""); // prevent optimization
150 }
151
152#endif
153 }
154 } while (!my_turn);
155 });
156}
157
158template <class T0, class T1, class F>
159void
160fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
161 int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
162{
163 using TagType = Array4MaskCopyTag<T0, T1>;
164 Vector<TagType> tags;
165 const int N = copy_tags.size();
166 tags.reserve(N);
167 for (int i = 0; i < N; ++i) {
168 tags.push_back(TagType{.dfab = copy_tags[i].dfab, .sfab = copy_tags[i].sfab,
169 .mask = masks[i].dfab, .dbox = copy_tags[i].dbox,
170 .offset = copy_tags[i].offset});
171 }
172
173 if constexpr (std::is_same_v<F, CellStore<T0,T1>>)
174 {
175 fab_to_fab_store(tags, scomp, dcomp, ncomp, std::forward<F>(f));
176 }
177 else
178 {
179 fab_to_fab_other(tags, scomp, dcomp, ncomp, std::forward<F>(f));
180 }
181 // Note that Tag ParalleFor has a stream sync call in the end.
182}
183
184template <typename T0, typename T1>
186void
187fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
188 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
189{
190 fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
191}
192
193template <typename T0, typename T1>
195void
196fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
197 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
198{
199 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
200}
201
202template <typename T0, typename T1>
204void
205fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
206 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
207{
208 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
209}
210
211template <typename T0, typename T1>
213void
214fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
215 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
216{
217 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
218}
219
220template <typename T0, typename T1, class F>
221void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int scomp,
222 int dcomp, int ncomp, F const& f)
223{
224 if (a_tags.empty()) { return; }
225
226 using TagType = Array4CopyTag<T0,T1>;
227
228 struct TiledTag {
229 int tag_index;
230 std::pair<int,Box> dindex_tilebox;
231 bool operator< (TiledTag const& rhs) const noexcept {
232 return this->dindex_tilebox < rhs.dindex_tilebox;
233 }
234 bool operator!= (TiledTag const& rhs) const noexcept {
235 return this->dindex_tilebox != rhs.dindex_tilebox;
236 }
237 };
238 Vector<TiledTag> tiled_tags;
239
240 auto const ixtype = a_tags[0].dbox.ixType();
241
242 constexpr int tile_size = 64;
243 for (int itag = 0; itag < a_tags.size(); ++itag) {
244 auto const& tag = a_tags[itag];
245 auto const& dlo = tag.dbox.smallEnd();
246 auto const& dhi = tag.dbox.bigEnd();
247 IntVect tlo(AMREX_D_DECL(amrex::coarsen<tile_size>(dlo[0]),
248 amrex::coarsen<tile_size>(dlo[1]),
249 amrex::coarsen<tile_size>(dlo[2])));
250 IntVect thi(AMREX_D_DECL(amrex::coarsen<tile_size>(dhi[0]),
251 amrex::coarsen<tile_size>(dhi[1]),
252 amrex::coarsen<tile_size>(dhi[2])));
253#if (AMREX_SPACEDIM == 3)
254 for (int kt = tlo[2]; kt <= thi[2]; ++kt)
255#endif
256 {
257#if (AMREX_SPACEDIM >= 2)
258 for (int jt = tlo[1]; jt <= thi[1]; ++jt)
259#endif
260 {
261 for (int it = tlo[0]; it <= thi[0]; ++it)
262 {
263 IntVect lo(AMREX_D_DECL(it*tile_size,
264 jt*tile_size,
265 kt*tile_size));
266 tiled_tags.push_back(TiledTag{
267 .tag_index = itag,
268 .dindex_tilebox = std::make_pair(tag.dindex, Box(lo, lo+(tile_size-1), ixtype))
269 });
270 }
271 }
272 }
273 }
274
275 std::sort(tiled_tags.begin(), tiled_tags.end());
276
277 Gpu::HostVector<unsigned int> h_ntags;
278 Gpu::HostVector<TagType> h_tags;
279 h_tags.reserve(tiled_tags.size());
280
281 for (unsigned int itag = 0; itag < tiled_tags.size(); ++itag) {
282 if (itag == 0) {
283 h_ntags.push_back(0);
284 } else if (tiled_tags[itag-1] != tiled_tags[itag]) {
285 h_ntags.push_back(itag);
286 }
287 auto const& ttag = tiled_tags[itag];
288 auto const& btag = a_tags[ttag.tag_index];
289 h_tags.push_back(TagType{.dfab = btag.dfab, .dindex = btag.dindex, .sfab = btag.sfab,
290 .dbox = btag.dbox & ttag.dindex_tilebox.second,
291 .offset = btag.offset});
292 }
293 h_ntags.push_back((unsigned int)tiled_tags.size());
294
295 Gpu::DeviceVector<TagType> d_tags(h_tags.size());
296 Gpu::DeviceVector<unsigned int> d_ntags(h_ntags.size());
297 Gpu::copyAsync(Gpu::hostToDevice,h_tags.begin(),h_tags.end(),d_tags.begin());
298 Gpu::copyAsync(Gpu::hostToDevice,h_ntags.begin(),h_ntags.end(),d_ntags.begin());
299 auto const* ptag = d_tags.data();
300 auto const* pntags = d_ntags.data();
301 auto const nblocks = int(h_ntags.size()-1);
302 constexpr auto nthreads = 256;
303 amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
304#ifdef AMREX_USE_SYCL
305 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
306 [[sycl::reqd_work_group_size(nthreads)]]
307#else
308 [=] AMREX_GPU_DEVICE () noexcept
309#endif
310 {
311#ifdef AMREX_USE_SYCL
312 Dim1 blockIdx{item.get_group_linear_id()};
313 Dim1 threadIdx{item.get_local_linear_id()};
314#endif
315
316 for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
317 auto const tag = ptag[itag];
318 auto ncells = int(tag.dbox.numPts());
319 const auto len = amrex::length(tag.dbox);
320 const auto lo = amrex::lbound(tag.dbox);
321 for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
322 int k = icell / (len.x*len.y);
323 int j = (icell - k*(len.x*len.y)) / len.x;
324 int i = (icell - k*(len.x*len.y)) - j*len.x;
325 i += lo.x;
326 j += lo.y;
327 k += lo.z;
328 for (int n = 0; n < ncomp; ++n) {
329 f(tag.dfab.ptr(i,j,k,n+dcomp),
330 tag.sfab(i + tag.offset.x,
331 j + tag.offset.y,
332 k + tag.offset.z, n+scomp));
333 }
334 }
335
336 if (itag+1 < pntags[blockIdx.x+1]) {
337#ifdef AMREX_USE_SYCL
338 sycl::group_barrier(item.get_group());
339#else
340 __syncthreads();
341#endif
342 }
343 }
344 });
346}
347
348template <typename B, typename V, typename TT>
350void unpack_recv_buffer_gpu_atomic_add (char* pbuffer, TagVector<TT> const& tv,
351 int dcomp, int ncomp)
352{
353 detail::ParallelFor_doit(tv,
354 [=] AMREX_GPU_DEVICE (
355 int icell, int ncells, int i, int j, int k, TT const& tag) noexcept
356 {
357 if (icell < ncells) {
358 Array4<B const> sfab{(B const*)(pbuffer+tag.poff),
359 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
360 for (int n = 0; n < ncomp; ++n) {
361 Gpu::Atomic::AddNoRet(tag.dfab.ptr(i,j,k,n+dcomp),
362 (V)sfab(i,j,k,n));
363 }
364 }
365 });
366}
367
368template <typename B, typename V, typename TT>
370void unpack_recv_buffer_gpu_atomic_add (char* pbuffer, TagVector<TT> const& tv,
371 int dcomp, int ncomp)
372{
373 amrex::ignore_unused(pbuffer, tv, dcomp, ncomp);
374 amrex::Abort("unpack_recv_buffer_gpu: should NOT get here");
375}
376
377#endif /* AMREX_USE_GPU */
378
379}
381
382template <class FAB>
383void
384FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
385{
386 auto const& LocTags = *(TheFB.m_LocTags);
387 auto N_locs = static_cast<int>(LocTags.size());
388 if (N_locs == 0) { return; }
389 bool is_thread_safe = TheFB.m_threadsafe_loc;
390 if (is_thread_safe)
391 {
392#ifdef AMREX_USE_OMP
393#pragma omp parallel for
394#endif
395 for (int i = 0; i < N_locs; ++i)
396 {
397 const CopyComTag& tag = LocTags[i];
398
399 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
400 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
401
402 const FAB* sfab = &(get(tag.srcIndex));
403 FAB* dfab = &(get(tag.dstIndex));
404 dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
405 }
406 }
407 else
408 {
410 for (int i = 0; i < N_locs; ++i)
411 {
412 const CopyComTag& tag = LocTags[i];
413
414 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
415 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
416
417 loc_copy_tags[tag.dstIndex].push_back
418 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
419 }
420#ifdef AMREX_USE_OMP
421#pragma omp parallel
422#endif
423 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
424 {
425 const auto& tags = loc_copy_tags[mfi];
426 auto dfab = this->array(mfi);
427 for (auto const & tag : tags)
428 {
429 auto const sfab = tag.sfab->array();
430 const auto offset = tag.offset.dim3();
431 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
432 [=] (int i, int j, int k, int n) noexcept
433 {
434 dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
435 });
436 }
437 }
438 }
439}
440
441template <class FAB>
442void
443FabArray<FAB>::FB_local_add_cpu (const FB& TheFB, int scomp, int ncomp)
444{
445 auto const& LocTags = *(TheFB.m_LocTags);
446 auto N_locs = static_cast<int>(LocTags.size());
447 if (N_locs == 0) { return; }
448
450 // We must make a temporary copy of the data first to avoid race condition.
451 std::vector<FAB> src_fabs(N_locs);
452 for (int itag = 0; itag < N_locs; ++itag) {
453 const CopyComTag& tag = LocTags[itag];
454 src_fabs[itag].resize(tag.sbox,ncomp);
455 loc_copy_tags[tag.dstIndex].push_back
456 (FabCopyTag<FAB>{&(src_fabs[itag]),
457 tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
458 }
459
460#ifdef AMREX_USE_OMP
461#pragma omp parallel for
462#endif
463 for (int itag = 0; itag < N_locs; ++itag) {
464 const CopyComTag& tag = LocTags[itag];
465 src_fabs[itag].template copy<RunOn::Host>(this->operator[](tag.srcIndex), scomp, 0, ncomp);
466 }
467
468#ifdef AMREX_USE_OMP
469#pragma omp parallel
470#endif
471 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
472 {
473 const auto& tags = loc_copy_tags[mfi];
474 const auto& dfab = this->array(mfi);
475 for (auto const & tag : tags)
476 {
477 auto const sfab = tag.sfab->array();
478 const auto offset = tag.offset.dim3();
479 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
480 [&] (int i, int j, int k, int n) noexcept
481 {
482 dfab(i,j,k,n+scomp) += sfab(i+offset.x,j+offset.y,k+offset.z,n);
483 });
484 }
485 }
486}
487
488#ifdef AMREX_USE_GPU
489
490template <class FAB>
493{
494 auto const& LocTags = *(TheFB.m_LocTags);
495 int N_locs = LocTags.size();
496
497 using TagType = Array4CopyTag<value_type>;
498
500 if (auto it = m_fb_local_copy_handler.find(TheFB.m_id);
501 it != m_fb_local_copy_handler.end())
502 {
503 tv = it->second.get();
504 } else {
505 Vector<TagType> loc_copy_tags;
506 loc_copy_tags.reserve(N_locs);
507
508 for (int i = 0; i < N_locs; ++i)
509 {
510 const CopyComTag& tag = LocTags[i];
511
512 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
513 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
514
515 int li = this->localindex(tag.dstIndex);
516 loc_copy_tags.push_back
517 (TagType{.dfab = this->atLocalIdx(li).array(),
518 .dindex = tag.dstIndex,
519 .sfab = this->fabPtr(tag.srcIndex)->const_array(),
520 .dbox = tag.dbox,
521 .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
522 }
523
524 auto utv = std::make_unique<TagVector<TagType>>(loc_copy_tags);
525 tv = utv.get();
526 m_fb_local_copy_handler[TheFB.m_id] = std::move(utv);
527 }
528
529 return tv;
530}
531
532template <class FAB>
533void
534FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
535{
536 auto const& LocTags = *(TheFB.m_LocTags);
537 int N_locs = LocTags.size();
538 if (N_locs == 0) { return; }
539 bool is_thread_safe = TheFB.m_threadsafe_loc;
540
541 using TagType = Array4CopyTag<value_type>;
542
543 if (is_thread_safe || amrex::IsStoreAtomic<value_type>::value)
544 {
545 auto* tv = FB_get_local_copy_tag_vector(TheFB);
546
547 detail::ParallelFor_doit(*tv,
548 [=] AMREX_GPU_DEVICE (
549 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
550 {
551 if (icell < ncells) {
552 for (int n = 0; n < ncomp; ++n) {
553 tag.dfab(i,j,k,n+scomp) = tag.sfab(i+tag.offset.x,
554 j+tag.offset.y,
555 k+tag.offset.z,n+scomp);
556 }
557 }
558 });
559 }
560 else
561 {
562 Vector<TagType> loc_copy_tags;
563 loc_copy_tags.reserve(N_locs);
564
565 Vector<BaseFab<int>> maskfabs(this->local_size());
566 Vector<Array4Tag<int> > masks_unique;
567 masks_unique.reserve(this->local_size());
568 Vector<Array4Tag<int> > masks;
569 masks.reserve(N_locs);
570
571 for (int i = 0; i < N_locs; ++i)
572 {
573 const CopyComTag& tag = LocTags[i];
574
575 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
576 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
577
578 int li = this->localindex(tag.dstIndex);
579 loc_copy_tags.push_back
580 (TagType{.dfab = this->atLocalIdx(li).array(),
581 .dindex = tag.dstIndex,
582 .sfab = this->fabPtr(tag.srcIndex)->const_array(),
583 .dbox = tag.dbox,
584 .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
585
586 if (!maskfabs[li].isAllocated()) {
587 maskfabs[li].resize(this->atLocalIdx(li).box());
588 masks_unique.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
589 }
590 masks.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
591 }
592
593 amrex::ParallelFor(masks_unique,
594 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
595 {
596 msk.dfab(i,j,k) = 0;
597 });
598
599 detail::fab_to_fab_atomic_cpy<value_type, value_type>(
600 loc_copy_tags, scomp, scomp, ncomp, masks);
601 }
602}
603
604template <class FAB>
605void
606FabArray<FAB>::FB_local_add_gpu (const FB& TheFB, int scomp, int ncomp, bool deterministic)
607{
608 auto const& LocTags = *(TheFB.m_LocTags);
609 int N_locs = LocTags.size();
610 if (N_locs == 0) { return; }
611
612 using TagType = Array4CopyTag<value_type>;
613
614 Vector<TagType> loc_copy_tags_1, loc_copy_tags_2;
615 loc_copy_tags_1.reserve(N_locs);
616 loc_copy_tags_2.reserve(N_locs);
617
618 Vector<FAB> src_fabs(N_locs);
619 for (int itag = 0; itag < N_locs; ++itag) {
620 const CopyComTag& tag = LocTags[itag];
621 src_fabs[itag].resize(tag.sbox,ncomp);
622 loc_copy_tags_1.push_back(
623 TagType{.dfab = src_fabs[itag].array(), .dindex = -1,
624 .sfab = this->const_array(tag.srcIndex,scomp), .dbox = tag.sbox,
625 .offset = Dim3{.x = 0, .y = 0, .z = 0}});
626 loc_copy_tags_2.push_back(
627 TagType{.dfab = this->array(tag.dstIndex,scomp), .dindex = tag.dstIndex,
628 .sfab = src_fabs[itag].const_array(), .dbox = tag.dbox,
629 .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
630 }
631
632 // Note that we have shifted the starting component to zero in the code above.
633
634 // TODO: We could try to cache the tags like in FillBoundary.
635
636 detail::fab_to_fab(loc_copy_tags_1, 0, 0, ncomp,
637 detail::CellStore<value_type, value_type>{});
638 if (deterministic || ! amrex::HasAtomicAdd<value_type>::value) {
639 detail::deterministic_fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
640 detail::CellAdd<value_type,value_type>{});
641 } else {
643 detail::fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
644 detail::CellAtomicAdd<value_type, value_type>{});
645 }
646 ((void)0);
647 }
648
649 // Note that fab_to_fab is synchronous.
650}
651
652template <class FAB>
653void
655 const CommMetaData& thecmd, int scomp, int ncomp)
656{
657 auto const& LocTags = *(thecmd.m_LocTags);
658 int N_locs = LocTags.size();
659 if (N_locs == 0) { return; }
660 bool is_thread_safe = thecmd.m_threadsafe_loc;
661
662 using TagType = Array4BoxTag<value_type>;
663 Vector<TagType> loc_setval_tags;
664 loc_setval_tags.reserve(N_locs);
665
667
668 for (int i = 0; i < N_locs; ++i)
669 {
670 const CopyComTag& tag = LocTags[i];
671 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
672 loc_setval_tags.push_back(TagType{.dfab = this->array(tag.dstIndex), .dbox = tag.dbox});
673 }
674
675 amrex::ParallelFor(loc_setval_tags, ncomp,
676 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
677 {
678 tag.dfab(i,j,k,n+scomp) = x;
679 });
680}
681
682template <class FAB>
683void
685 const CommMetaData& thecmd, int scomp, int ncomp)
686{
687 auto const& RcvTags = *(thecmd.m_RcvTags);
688 bool is_thread_safe = thecmd.m_threadsafe_rcv;
689
690 using TagType = Array4BoxTag<value_type>;
691 Vector<TagType> rcv_setval_tags;
692
693 for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
694 for (auto const& tag: it->second) {
695 rcv_setval_tags.push_back(TagType{.dfab = this->array(tag.dstIndex), .dbox = tag.dbox});
696 }
697 }
698
699 if (rcv_setval_tags.empty()) { return; }
700
702
703 amrex::ParallelFor(rcv_setval_tags, ncomp,
704 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
705 {
706 tag.dfab(i,j,k,n+scomp) = x;
707 });
708}
709
710#if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
711template <class FAB>
712void
713FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
714{
715 const int N_locs = (*TheFB.m_LocTags).size();
717 for (int i = 0; i < N_locs; ++i)
718 {
719 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
720
721 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
722 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
723
724 loc_copy_tags[tag.dstIndex].push_back
725 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
726 }
727
728 // Create Graph if one is needed.
729 if ( !(TheFB.m_localCopy.ready()) )
730 {
731 const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);
732
733 int idx = 0;
734 // Record the graph.
735 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
736 {
737 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
738 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
739 (TheFB).m_localCopy.getDevicePtr(0),
740 std::size_t(sizeof(CopyMemory)*N_locs) );
741
742 const auto& tags = loc_copy_tags[mfi];
743 for (auto const & tag : tags)
744 {
745 const auto offset = tag.offset.dim3();
746 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
747 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
748 {
749 // Build the Array4's.
750 auto const dst = cmem->getDst<value_type>();
751 auto const src = cmem->getSrc<value_type>();
752 for (int n = 0; n < cmem->ncomp; ++n) {
753 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
754 }
755 });
756 }
757
758 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
759 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
760 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
761 }
762 }
763
764 // Setup Launch Parameters
765 // This is perfectly threadable, right?
766 // Additional optimization -> Check to see whether values need to be reset?
767 // Can then remove this setup and memcpy from CudaGraph::executeGraph.
768 int idx = 0;
769 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
770 {
771 auto const dst_array = this->array(mfi);
772 const auto& tags = loc_copy_tags[mfi];
773 for (auto const & tag : tags)
774 {
775 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
776 dst_array,
777 scomp, ncomp));
778 }
779 }
780
781 // Launch Graph
782 TheFB.m_localCopy.executeGraph();
783}
784
785#ifdef AMREX_USE_MPI
786template <class FAB>
787void
788FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
789{
790 const int N_locs = TheFB.m_LocTags->size();
791
792 int launches = 0; // Used for graphs only.
793 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
794 for (int i = 0; i < N_locs; ++i)
795 {
796 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
797
798 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
799 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));
800
801 if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
802 {
803 loc_copy_tags[tag.dstIndex].push_back
804 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
805 launches++;
806 }
807 }
808
809 FillBoundary_test();
810
811 if ( !(TheFB.m_localCopy.ready()) )
812 {
813 const_cast<FB&>(TheFB).m_localCopy.resize(launches);
814
815 int idx = 0;
816 int cuda_stream = 0;
817 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
818 {
819 const auto& tags = loc_copy_tags[mfi];
820 for (int t = 0; t<tags.size(); ++t)
821 {
822 Gpu::Device::setStreamIndex(cuda_stream++);
823 amrex::Gpu::Device::startGraphRecording( (idx == 0),
824 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
825 (TheFB).m_localCopy.getDevicePtr(0),
826 std::size_t(sizeof(CopyMemory)*launches) );
827
828 const auto& tag = tags[t];
829 const Dim3 offset = tag.offset.dim3();
830
831 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
832 AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
833 {
834 auto const dst = cmem->getDst<value_type>();
835 auto const src = cmem->getSrc<value_type>();
836 for (int n = 0; n < cmem->ncomp; ++n) {
837 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
838 }
839 });
840
841 bool last_iter = idx == launches;
842 cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
843 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
844 }
845 }
846 }
847
848 // Setup Launch Parameters
849 // This is perfectly threadable, right?
850 int idx = 0;
851 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
852 {
853 const auto& dst_array = this->array(mfi);
854 const auto& tags = loc_copy_tags[mfi];
855 for (auto const & tag : tags)
856 {
857 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
858 dst_array,
859 scomp, ncomp));
860 }
861 }
862
863 // Launch Graph without synch. Local work is entirely independent.
864 TheFB.m_localCopy.executeGraph(false);
865}
866#endif /* AMREX_USE_MPI */
867
868#endif /* __CUDACC__ */
869
870#endif /* AMREX_USE_GPU */
871
872#ifdef AMREX_USE_MPI
873
874#ifdef AMREX_USE_GPU
875
876#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
877
878template <class FAB>
879void
880FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
881 Vector<char*>& send_data,
882 Vector<std::size_t> const& send_size,
883 Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
884{
885 const int N_snds = send_data.size();
886 if (N_snds == 0) { return; }
887
888 if ( !(TheFB.m_copyToBuffer.ready()) )
889 {
890 // Set size of CudaGraph buffer.
891 // Is the conditional ever expected false?
892 int launches = 0;
893 for (int send = 0; send < N_snds; ++send) {
894 if (send_size[send] > 0) {
895 launches += send_cctc[send]->size();
896 }
897 }
898 const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);
899
900 // Record the graph.
901 int idx = 0;
902 for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
903 sit.isValid(); ++sit)
904 {
905 amrex::Gpu::Device::startGraphRecording( (sit() == 0),
906 const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
907 (TheFB).m_copyToBuffer.getDevicePtr(0),
908 std::size_t(sizeof(CopyMemory)*launches) );
909
910 const int j = sit();
911 if (send_size[j] > 0)
912 {
913 auto const& cctc = *send_cctc[j];
914 for (auto const& tag : cctc)
915 {
916 const Box& bx = tag.sbox;
917 CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
918 AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
919 {
920 auto const pfab = cmem->getDst<value_type>();
921 auto const sfab = cmem->getSrc<value_type>();
922 for (int n = 0; n < cmem->ncomp; ++n)
923 {
924 pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
925 }
926 });
927 }
928 }
929
930 bool last_iter = sit() == (N_snds-1);
931 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
932 if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
933 }
934 }
935
936 // Setup Launch Parameters
937 int idx = 0;
938 for (int send = 0; send < N_snds; ++send)
939 {
940 const int j = send;
941 if (send_size[j] > 0)
942 {
943 char* dptr = send_data[j];
944 auto const& cctc = *send_cctc[j];
945 for (auto const& tag : cctc)
946 {
947 const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
948 amrex::makeArray4((value_type*)(dptr),
949 tag.sbox,
950 ncomp),
951 scomp, ncomp));
952
953 dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
954 }
955 amrex::ignore_unused(send_size);
956 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
957 }
958 }
959
960 // Launch Graph synched, so copyToBuffer is complete prior to posting sends.
961 TheFB.m_copyToBuffer.executeGraph();
962}
963
964template <class FAB>
965void
966FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
967 Vector<char*> const& recv_data,
968 Vector<std::size_t> const& recv_size,
969 Vector<CopyComTagsContainer const*> const& recv_cctc,
970 bool /*is_thread_safe*/)
971{
972 const int N_rcvs = recv_cctc.size();
973 if (N_rcvs == 0) { return; }
974
975 int launches = 0;
976 LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
977 for (int k = 0; k < N_rcvs; ++k)
978 {
979 if (recv_size[k] > 0)
980 {
981 const char* dptr = recv_data[k];
982 auto const& cctc = *recv_cctc[k];
983 for (auto const& tag : cctc)
984 {
985 recv_copy_tags[tag.dstIndex].push_back(VoidCopyTag{.p = dptr, .dbox = tag.dbox});
986 dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
987 launches++;
988 }
989 amrex::ignore_unused(recv_size);
990 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
991 }
992 }
993
994 if ( !(TheFB.m_copyFromBuffer.ready()) )
995 {
996 const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);
997
998 int idx = 0;
999 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
1000 {
1001 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
1002 const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
1003 (TheFB).m_copyFromBuffer.getDevicePtr(0),
1004 std::size_t(sizeof(CopyMemory)*launches) );
1005
1006 const auto& tags = recv_copy_tags[mfi];
1007 for (auto const & tag : tags)
1008 {
1009 CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
1010 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
1011 {
1012 auto const pfab = cmem->getSrc<value_type>();
1013 auto const dfab = cmem->getDst<value_type>();
1014 for (int n = 0; n < cmem->ncomp; ++n)
1015 {
1016 dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
1017 }
1018 });
1019 }
1020
1021 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
1022 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
1023 if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
1024 }
1025 }
1026
1027 // Setup graph.
1028 int idx = 0;
1029 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
1030 {
1031 auto dst_array = this->array(mfi);
1032 const auto & tags = recv_copy_tags[mfi];
1033 for (auto const & tag : tags)
1034 {
1035 const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
1036 tag.dbox,
1037 ncomp),
1038 dst_array,
1039 dcomp, ncomp));
1040 }
1041 }
1042
1043 // Launch Graph - synced because next action is freeing recv buffer.
1044 TheFB.m_copyFromBuffer.executeGraph();
1045}
1046
1047#endif /* __CUDACC__ */
1048
1049template <class FAB>
1050template <typename BUF>
1051auto
1053 Vector<std::size_t> const& send_size,
1054 Vector<CopyComTagsContainer const*> const& send_cctc,
1055 int ncomp, std::uint64_t id) const
1057{
1058 using TagType = CommSendBufTag<value_type>;
1059
1060 auto kit = std::find_if(send_cctc.begin(), send_cctc.end(),
1061 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1062 if (kit == send_cctc.end()) {
1063 return nullptr;
1064 }
1065
1066 auto get_tags = [&] () -> Vector<TagType>
1067 {
1068 Vector<TagType> snd_copy_tags;
1069 char* pbuf = send_data[0];
1070 const int N_snds = send_data.size();
1071 for (int j = 0; j < N_snds; ++j)
1072 {
1073 if (send_size[j] > 0)
1074 {
1075 char* dptr = send_data[j];
1076 auto const& cctc = *send_cctc[j];
1077 for (auto const& tag : cctc)
1078 {
1079 snd_copy_tags.emplace_back
1080 (TagType{.sfab = this->const_array(tag.srcIndex), .poff = dptr-pbuf, .bx = tag.sbox});
1081 dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
1082 }
1083 }
1084 }
1085 return snd_copy_tags;
1086 };
1087
1089 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1090
1091 if (auto it = m_send_copy_handler.find(key); it != m_send_copy_handler.end()) {
1092 tv = it->second.get();
1093 } else {
1094 if (m_send_copy_handler.size() > 32) {
1095 // Just in case. If this is used in ParallelCopy, it's possible
1096 // that the sending FabArray is the same, but the receiving
1097 // FabArray is different every time. Then the size of this map
1098 // could increase indefinitely.
1099 m_send_copy_handler.clear();
1100 }
1101 auto snd_copy_tags = get_tags();
1102 auto utv = std::make_unique<TagVector<TagType>>(snd_copy_tags);
1103 tv = utv.get();
1104 m_send_copy_handler[key] = std::move(utv);
1105 }
1106
1107 return tv;
1108}
1109
1110template <class FAB>
1111template <typename BUF>
1112void
1113FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
1114 Vector<char*> const& send_data,
1115 Vector<std::size_t> const& send_size,
1116 Vector<CopyComTagsContainer const*> const& send_cctc,
1117 std::uint64_t id)
1118{
1119 const int N_snds = send_data.size();
1120 if (N_snds == 0) { return; }
1121
1122 using TagType = CommSendBufTag<value_type>;
1123
1124 auto* tv = src.template get_send_copy_tag_vector<BUF>
1125 (send_data, send_size, send_cctc, ncomp, id);
1126 if (tv == nullptr) { return; }
1127
1128 char* pbuffer = send_data[0];
1129
1130 detail::ParallelFor_doit(*tv,
1131 [=] AMREX_GPU_DEVICE (
1132 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1133 {
1134 if (icell < ncells) {
1135 Array4<BUF> dfab{(BUF*)(pbuffer+tag.poff),
1136 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1137 for (int n = 0; n < ncomp; ++n) {
1138 dfab(i,j,k,n) = (BUF)tag.sfab(i,j,k,n+scomp);
1139 }
1140 }
1141 });
1142
1143 Gpu::streamSynchronize();
1144}
1145
1146template <class FAB>
1147template <typename BUF>
1148auto
1150 Vector<std::size_t> const& recv_size,
1151 Vector<CopyComTagsContainer const*> const& recv_cctc,
1152 int ncomp, std::uint64_t id)
1154{
1155 using TagType = CommRecvBufTag<value_type>;
1156
1157 auto kit = std::find_if(recv_cctc.begin(), recv_cctc.end(),
1158 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1159 if (kit == recv_cctc.end()) {
1160 return nullptr;
1161 }
1162
1163 auto get_tags = [&] () -> Vector<TagType>
1164 {
1165 Vector<TagType> recv_copy_tags;
1166 char* pbuf = recv_data[0];
1167 const int N_rcvs = recv_cctc.size();
1168 for (int k = 0; k < N_rcvs; ++k)
1169 {
1170 if (recv_size[k] > 0)
1171 {
1172 char* dptr = recv_data[k];
1173 auto const& cctc = *recv_cctc[k];
1174 for (auto const& tag : cctc)
1175 {
1176 const int li = this->localindex(tag.dstIndex);
1177 recv_copy_tags.emplace_back
1178 (TagType{.dfab = this->atLocalIdx(li).array(), .poff = dptr-pbuf, .bx = tag.dbox});
1179 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1180 }
1181 }
1182 }
1183 return recv_copy_tags;
1184 };
1185
1187 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1188
1189 if (auto it = m_recv_copy_handler.find(key); it != m_recv_copy_handler.end()) {
1190 tv = it->second.get();
1191 } else {
1192 if (m_recv_copy_handler.size() > 32) {
1193 // Just in case. If this is used in ParallelCopy, it's possible
1194 // that the receiving FabArray is the same, but the sending
1195 // FabArray is different every time. Then the size of this map
1196 // could increase indefinitely.
1197 m_recv_copy_handler.clear();
1198 }
1199 auto recv_copy_tags = get_tags();
1200 auto utv = std::make_unique<TagVector<TagType>>(recv_copy_tags);
1201 tv = utv.get();
1202 m_recv_copy_handler[key] = std::move(utv);
1203 }
1204
1205 return tv;
1206}
1207
1208template <class FAB>
1209template <typename BUF>
1210void
1212 Vector<char*> const& recv_data,
1213 Vector<std::size_t> const& recv_size,
1214 Vector<CopyComTagsContainer const*> const& recv_cctc,
1215 CpOp op, bool is_thread_safe, std::uint64_t id,
1216 bool deterministic)
1217{
1218 const int N_rcvs = recv_cctc.size();
1219 if (N_rcvs == 0) { return; }
1220
1221 bool use_mask = false;
1222 if (!is_thread_safe)
1223 {
1224 if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
1225 (op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))
1226 {
1227 use_mask = true;
1228 }
1229 }
1230
1231 if (deterministic)
1232 {
1233 AMREX_ALWAYS_ASSERT(op == FabArrayBase::ADD); // Only ADD for now
1234 using TagType = Array4CopyTag<value_type,BUF>;
1235 Vector<TagType> tags;
1236 tags.reserve(N_rcvs);
1237 for (int k = 0; k < N_rcvs; ++k) {
1238 if (recv_size[k] > 0) {
1239 char const* dptr = recv_data[k];
1240 auto const& cctc = *recv_cctc[k];
1241 for (auto const& tag : cctc) {
1242 tags.emplace_back(
1243 TagType{.dfab = dst.array(tag.dstIndex), .dindex = tag.dstIndex,
1244 .sfab = Array4<BUF const>((BUF const*)dptr,
1245 amrex::begin(tag.dbox),
1246 amrex::end(tag.dbox), ncomp),
1247 .dbox = tag.dbox,
1248 .offset = Dim3{.x = 0, .y = 0, .z = 0}});
1249 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1250 }
1251 }
1252 }
1254 detail::deterministic_fab_to_fab<value_type,BUF>
1255 (tags, 0, dcomp, ncomp, detail::CellAdd<value_type,BUF>{});
1256 } else {
1257 amrex::Abort("SumBoundary requires operator+=");
1258 }
1259 }
1260 else if (!use_mask)
1261 {
1262 using TagType = CommRecvBufTag<value_type>;
1263 auto* tv = dst.template get_recv_copy_tag_vector<BUF>
1264 (recv_data, recv_size, recv_cctc, ncomp, id);
1265 if (tv == nullptr) { return; }
1266
1267 char* pbuffer = recv_data[0];
1268
1269 if (op == FabArrayBase::COPY)
1270 {
1271 detail::ParallelFor_doit(*tv,
1272 [=] AMREX_GPU_DEVICE (
1273 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1274 {
1275 if (icell < ncells) {
1276 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1277 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1278 for (int n = 0; n < ncomp; ++n) {
1279 tag.dfab(i,j,k,n+dcomp) = (value_type)sfab(i,j,k,n);
1280 }
1281 }
1282 });
1283 }
1284 else
1285 {
1286 if (is_thread_safe) {
1287 detail::ParallelFor_doit(*tv,
1288 [=] AMREX_GPU_DEVICE (
1289 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1290 {
1291 if (icell < ncells) {
1292 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1293 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1294 for (int n = 0; n < ncomp; ++n) {
1295 tag.dfab(i,j,k,n+dcomp) += (value_type)sfab(i,j,k,n);
1296 }
1297 }
1298 });
1299 } else {
1300 detail::unpack_recv_buffer_gpu_atomic_add<BUF, value_type>
1301 (pbuffer, *tv, dcomp, ncomp);
1302 }
1303 }
1304 Gpu::streamSynchronize();
1305 }
1306 else
1307 {
1308 char* pbuffer = recv_data[0];
1309
1310 using TagType = Array4CopyTag<value_type, BUF>;
1311 Vector<TagType> recv_copy_tags;
1312 recv_copy_tags.reserve(N_rcvs);
1313
1314 Vector<BaseFab<int> > maskfabs(dst.local_size());
1315 Vector<Array4Tag<int> > masks_unique;
1316 masks_unique.reserve(dst.local_size());
1317 Vector<Array4Tag<int> > masks;
1318
1319 for (int k = 0; k < N_rcvs; ++k)
1320 {
1321 if (recv_size[k] > 0)
1322 {
1323 std::size_t offset = recv_data[k]-recv_data[0];
1324 const char* dptr = pbuffer + offset;
1325 auto const& cctc = *recv_cctc[k];
1326 for (auto const& tag : cctc)
1327 {
1328 const int li = dst.localindex(tag.dstIndex);
1329 recv_copy_tags.emplace_back(TagType{
1330 .dfab = dst.atLocalIdx(li).array(), .dindex = tag.dstIndex,
1331 .sfab = amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
1332 .dbox = tag.dbox,
1333 .offset = Dim3{.x = 0, .y = 0, .z = 0}
1334 });
1335 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1336
1337 if (!maskfabs[li].isAllocated()) {
1338 maskfabs[li].resize(dst.atLocalIdx(li).box());
1339 masks_unique.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
1340 }
1341 masks.emplace_back(Array4Tag<int>{.dfab = maskfabs[li].array()});
1342 }
1343 BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
1344 }
1345 }
1346
1347 amrex::ParallelFor(masks_unique,
1348 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
1349 {
1350 msk.dfab(i,j,k) = 0;
1351 });
1352
1353 if (op == FabArrayBase::COPY)
1354 {
1355 detail::fab_to_fab_atomic_cpy<value_type, BUF>(
1356 recv_copy_tags, 0, dcomp, ncomp, masks);
1357 }
1358 else
1359 {
1360 detail::fab_to_fab_atomic_add<value_type, BUF>(
1361 recv_copy_tags, 0, dcomp, ncomp, masks);
1362 }
1363
1364 // There is Gpu::streamSynchronize in fab_to_fab.
1365 }
1366}
1368#endif /* AMREX_USE_GPU */
1369
1370template <class FAB>
1371template <typename BUF>
1372void
1373FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
1374 Vector<char*> const& send_data,
1375 Vector<std::size_t> const& send_size,
1376 Vector<CopyComTagsContainer const*> const& send_cctc)
1377{
1378 amrex::ignore_unused(send_size);
1379
1380 auto const N_snds = static_cast<int>(send_data.size());
1381 if (N_snds == 0) { return; }
1382
1383#ifdef AMREX_USE_OMP
1384#pragma omp parallel for
1385#endif
1386 for (int j = 0; j < N_snds; ++j)
1387 {
1388 if (send_size[j] > 0)
1389 {
1390 char* dptr = send_data[j];
1391 auto const& cctc = *send_cctc[j];
1392 for (auto const& tag : cctc)
1393 {
1394 const Box& bx = tag.sbox;
1395 auto const sfab = src.array(tag.srcIndex);
1396 auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
1397 amrex::LoopConcurrentOnCpu( bx, ncomp,
1398 [=] (int ii, int jj, int kk, int n) noexcept
1399 {
1400 pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
1401 });
1402 dptr += (bx.numPts() * ncomp * sizeof(BUF));
1403 }
1404 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
1405 }
1406 }
1407}
1408
1409template <class FAB>
1410template <typename BUF>
1411void
1413 Vector<char*> const& recv_data,
1414 Vector<std::size_t> const& recv_size,
1415 Vector<CopyComTagsContainer const*> const& recv_cctc,
1416 CpOp op, bool is_thread_safe)
1417{
1418 amrex::ignore_unused(recv_size);
1419
1420 auto const N_rcvs = static_cast<int>(recv_cctc.size());
1421 if (N_rcvs == 0) { return; }
1422
1423 if (is_thread_safe)
1424 {
1425#ifdef AMREX_USE_OMP
1426#pragma omp parallel for
1427#endif
1428 for (int k = 0; k < N_rcvs; ++k)
1429 {
1430 if (recv_size[k] > 0)
1431 {
1432 const char* dptr = recv_data[k];
1433 auto const& cctc = *recv_cctc[k];
1434 for (auto const& tag : cctc)
1435 {
1436 const Box& bx = tag.dbox;
1437 FAB& dfab = dst[tag.dstIndex];
1438 if (op == FabArrayBase::COPY)
1439 {
1440 dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
1441 }
1442 else
1443 {
1444 dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
1445 }
1446 dptr += bx.numPts() * ncomp * sizeof(BUF);
1447 }
1448 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1449 }
1450 }
1452 else
1453 {
1454 LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
1455 recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
1456 for (int k = 0; k < N_rcvs; ++k)
1458 if (recv_size[k] > 0)
1459 {
1460 const char* dptr = recv_data[k];
1461 auto const& cctc = *recv_cctc[k];
1462 for (auto const& tag : cctc)
1463 {
1464 recv_copy_tags[tag.dstIndex].push_back(VoidCopyTag{.p = dptr, .dbox = tag.dbox});
1465 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1466 }
1467 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1468 }
1469 }
1470
1471#ifdef AMREX_USE_OMP
1472#pragma omp parallel
1473#endif
1474 for (MFIter mfi(dst); mfi.isValid(); ++mfi)
1475 {
1476 const auto& tags = recv_copy_tags[mfi];
1477 auto dfab = dst.array(mfi);
1478 for (auto const & tag : tags)
1479 {
1480 auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
1481 if (op == FabArrayBase::COPY)
1482 {
1483 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1484 [=] (int i, int j, int k, int n) noexcept
1485 {
1486 dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1487 });
1488 }
1489 else
1490 {
1491 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1492 [=] (int i, int j, int k, int n) noexcept
1493 {
1494 dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
1495 });
1496 }
1497 }
1498 }
1499 }
1500}
1501
1502#endif /* AMREX_USE_MPI */
1503
1504}
1505
1506#endif
#define BL_ASSERT(EX)
Definition AMReX_BLassert.H:39
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HOST_DEVICE_FOR_3D(...)
Definition AMReX_GpuLaunchMacrosC.nolint.H:106
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:112
CopyComTag::CopyComTagsContainer CopyComTagsContainer
Definition AMReX_FabArrayBase.H:220
int localindex(int K) const noexcept
Return local index in the vector of FABs.
Definition AMReX_FabArrayBase.H:119
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition AMReX_FabArrayBase.H:131
int local_size() const noexcept
Return the number of local FABs in the FabArray.
Definition AMReX_FabArrayBase.H:113
CpOp
parallel copy or add
Definition AMReX_FabArrayBase.H:394
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition AMReX_FabArrayBase.H:95
An Array of FortranArrayBox(FAB)-like Objects.
Definition AMReX_FabArray.H:344
typename std::conditional_t< IsBaseFab< FAB >::value, FAB, FABType >::value_type value_type
Definition AMReX_FabArray.H:355
void CMD_remote_setVal_gpu(value_type x, const CommMetaData &thecmd, int scomp, int ncomp)
Definition AMReX_FBI.H:684
void FB_local_add_cpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:443
void FB_local_add_gpu(const FB &TheFB, int scomp, int ncomp, bool deterministic)
Definition AMReX_FBI.H:606
void FB_local_copy_gpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:534
void CMD_local_setVal_gpu(value_type x, const CommMetaData &thecmd, int scomp, int ncomp)
Definition AMReX_FBI.H:654
Array4< typename FabArray< FAB >::value_type const > array(const MFIter &mfi) const noexcept
Definition AMReX_FabArray.H:561
void FB_local_copy_cpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:384
TagVector< Array4CopyTag< value_type > > const * FB_get_local_copy_tag_vector(const FB &TheFB)
Definition AMReX_FBI.H:491
FAB & atLocalIdx(int L) noexcept
Return a reference to the FAB associated with local index L.
Definition AMReX_FabArray.H:532
a one-thingy-per-box distributed object
Definition AMReX_LayoutData.H:13
void define(const BoxArray &a_grids, const DistributionMapping &a_dm)
Definition AMReX_LayoutData.H:25
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:88
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:172
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:29
Long size() const noexcept
Definition AMReX_Vector.H:54
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1373
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
__host__ __device__ AMREX_FORCE_INLINE T Exch(T *address, T val) noexcept
Definition AMReX_GpuAtomic.H:487
__host__ __device__ AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition AMReX_GpuAtomic.H:513
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:283
__host__ __device__ AMREX_FORCE_INLINE T Add(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:200
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
__host__ __device__ Array4< T > makeArray4(T *p, Box const &bx, int ncomp) noexcept
Definition AMReX_BaseFab.H:92
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2867
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2018
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
const int[]
Definition AMReX_BLProfiler.cpp:1664
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2028
BoxArray const & boxArray(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2862
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1334
Definition AMReX_TagParallelFor.H:58
Definition AMReX_TagParallelFor.H:26
Definition AMReX_TagParallelFor.H:50
Array4< T > dfab
Definition AMReX_TagParallelFor.H:51
A multidimensional array accessor.
Definition AMReX_Array4.H:285
Definition AMReX_TagParallelFor.H:106
Definition AMReX_TagParallelFor.H:116
Definition AMReX_Dim3.H:13
int x
Definition AMReX_Dim3.H:13
Definition AMReX_FabArrayBase.H:472
bool m_threadsafe_loc
Definition AMReX_FabArrayBase.H:474
bool m_threadsafe_rcv
Definition AMReX_FabArrayBase.H:475
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition AMReX_FabArrayBase.H:478
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition AMReX_FabArrayBase.H:476
Used by a bunch of routines when communicating via MPI.
Definition AMReX_FabArrayBase.H:195
Box sbox
Definition AMReX_FabArrayBase.H:197
int srcIndex
Definition AMReX_FabArrayBase.H:199
Box dbox
Definition AMReX_FabArrayBase.H:196
int dstIndex
Definition AMReX_FabArrayBase.H:198
FillBoundary.
Definition AMReX_FabArrayBase.H:488
Definition AMReX_FBI.H:7
IntVect offset
Definition AMReX_FBI.H:10
Box dbox
Definition AMReX_FBI.H:9
FAB const * sfab
Definition AMReX_FBI.H:8
Definition AMReX_TypeTraits.H:51
Definition AMReX_TypeTraits.H:61
Definition AMReX_TypeTraits.H:277
Definition AMReX_TagParallelFor.H:156
Definition AMReX_FBI.H:13
Box dbox
Definition AMReX_FBI.H:15
char const * p
Definition AMReX_FBI.H:14