Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_FBI.H
Go to the documentation of this file.
1#ifndef AMREX_FBI_H_
2#define AMREX_FBI_H_
3
4namespace amrex {
5
6template <class FAB>
7struct FabCopyTag {
8 FAB const* sfab;
10 IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
11};
12
14 char const* p;
16};
17
19namespace detail {
20
21#ifdef AMREX_USE_GPU
22
23template <class T0, class T1>
24struct CellStore
25{
27 operator() (T0* d, T1 s) const noexcept
28 {
29 *d = static_cast<T0>(s);
30 }
31};
32
33template <class T0, class T1>
34struct CellAdd
35{
37 operator() (T0* d, T1 s) const noexcept
38 {
39 *d += static_cast<T0>(s);
40 }
41};
42
43template <class T0, class T1>
44struct CellAtomicAdd
45{
46 template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,int> = 0>
48 operator() (U0* d, T1 s) const noexcept
49 {
50 Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
51 }
52};
53
54template <class T0, class T1, class F>
55void
56fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
57 F && f)
58{
59 TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};
60
61 detail::ParallelFor_doit(tv,
63 int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const& tag) noexcept
64 {
65 if (icell < ncells) {
66 for (int n = 0; n < ncomp; ++n) {
67 f(&(tag.dfab(i,j,k,n+dcomp)),
68 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
69 }
70 }
71 });
72}
73
74template <class TagType, class F>
75void
76fab_to_fab_store (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
77{
79 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
80 {
81 int m = Gpu::Atomic::Add(&(tag.mask(i,j,k)), 1);
82 if (m == 0) {
83 for (int n = 0; n < ncomp; ++n) {
84 f(&(tag.dfab(i,j,k,n+dcomp)),
85 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
86 }
87 }
88 });
89}
90
91template <class TagType, class F>
92void
93fab_to_fab_other (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
94{
96 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
97 {
98 int* m = &(tag.mask(i,j,k));
99 bool my_turn = false;
100 do {
101#if defined(AMREX_USE_SYCL)
102 my_turn = (Gpu::Atomic::Exch(m, 1) == 0);
103#else
104 my_turn = (Gpu::Atomic::CAS(m, 0, 1) == 0);
105#endif
106 if (my_turn) {
107#if defined(AMREX_USE_SYCL)
108 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
109#else
110 __threadfence();
111#endif
112 for (int n = 0; n < ncomp; ++n) {
113 f(&(tag.dfab(i,j,k,n+dcomp)),
114 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
115 }
116#if defined(AMREX_USE_SYCL)
117 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
118#else
119 __threadfence(); // It appears that we need this fence too if a GPU is shared.
120#endif
121 Gpu::Atomic::Exch(m, 0);
122 }
123 else {
124#if defined(AMREX_USE_CUDA)
125
126#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
127#if defined(_WIN32)
128 volatile int tmp; // prevent optimization
129 for (int c = 0; c < 2; ++c) {
130 ++tmp;
131 }
132#else
133 for (int c = 0; c < 2; ++c) {
134 __asm__ volatile(""); // prevent optimization
135 }
136#endif
137#else
138 __nanosleep(1);
139#endif
140
141#elif defined(AMREX_USE_HIP)
142
143 __builtin_amdgcn_s_sleep(1);
144
145#elif defined(AMREX_USE_SYCL)
146
147 for (int c = 0; c < 2; ++c) {
148 __asm__ volatile(""); // prevent optimization
149 }
150
151#endif
152 }
153 } while (!my_turn);
154 });
155}
156
157template <class T0, class T1, class F>
158void
159fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
160 int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
161{
162 using TagType = Array4MaskCopyTag<T0, T1>;
163 Vector<TagType> tags;
164 const int N = copy_tags.size();
165 tags.reserve(N);
166 for (int i = 0; i < N; ++i) {
167 tags.push_back(TagType{copy_tags[i].dfab, copy_tags[i].sfab, masks[i].dfab,
168 copy_tags[i].dbox, copy_tags[i].offset});
169 }
170
171 if constexpr (std::is_same_v<F, CellStore<T0,T1>>)
172 {
173 fab_to_fab_store(tags, scomp, dcomp, ncomp, std::forward<F>(f));
174 }
175 else
176 {
177 fab_to_fab_other(tags, scomp, dcomp, ncomp, std::forward<F>(f));
178 }
179 // Note that Tag ParalleFor has a stream sync call in the end.
180}
181
182template <typename T0, typename T1,
183 std::enable_if_t<amrex::IsStoreAtomic<T0>::value,int> = 0>
184void
185fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
186 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
187{
188 fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
189}
190
191template <typename T0, typename T1,
192 std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,int> = 0>
193void
194fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
195 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
196{
197 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
198}
199
200template <typename T0, typename T1,
201 std::enable_if_t<amrex::HasAtomicAdd<T0>::value,int> = 0>
202void
203fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
204 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
205{
206 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
207}
208
209template <typename T0, typename T1,
210 std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,int> = 0>
211void
212fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
213 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
214{
215 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
216}
217
218template <typename T0, typename T1, class F>
219void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int scomp,
220 int dcomp, int ncomp, F const& f)
221{
222 if (a_tags.empty()) { return; }
223
224 using TagType = Array4CopyTag<T0,T1>;
225
226 struct TiledTag {
227 int tag_index;
228 std::pair<int,Box> dindex_tilebox;
229 bool operator< (TiledTag const& rhs) const noexcept {
230 return this->dindex_tilebox < rhs.dindex_tilebox;
231 }
232 bool operator!= (TiledTag const& rhs) const noexcept {
233 return this->dindex_tilebox != rhs.dindex_tilebox;
234 }
235 };
236 Vector<TiledTag> tiled_tags;
237
238 auto const ixtype = a_tags[0].dbox.ixType();
239
240 constexpr int tile_size = 64;
241 for (int itag = 0; itag < a_tags.size(); ++itag) {
242 auto const& tag = a_tags[itag];
243 auto const& dlo = tag.dbox.smallEnd();
244 auto const& dhi = tag.dbox.bigEnd();
245 IntVect tlo(AMREX_D_DECL(amrex::coarsen<tile_size>(dlo[0]),
246 amrex::coarsen<tile_size>(dlo[1]),
247 amrex::coarsen<tile_size>(dlo[2])));
248 IntVect thi(AMREX_D_DECL(amrex::coarsen<tile_size>(dhi[0]),
249 amrex::coarsen<tile_size>(dhi[1]),
250 amrex::coarsen<tile_size>(dhi[2])));
251#if (AMREX_SPACEDIM == 3)
252 for (int kt = tlo[2]; kt <= thi[2]; ++kt)
253#endif
254 {
255#if (AMREX_SPACEDIM >= 2)
256 for (int jt = tlo[1]; jt <= thi[1]; ++jt)
257#endif
258 {
259 for (int it = tlo[0]; it <= thi[0]; ++it)
260 {
261 IntVect lo(AMREX_D_DECL(it*tile_size,
262 jt*tile_size,
263 kt*tile_size));
264 tiled_tags.push_back(TiledTag{itag, std::make_pair
265 (tag.dindex, Box(lo, lo+(tile_size-1), ixtype))});
266 }
267 }
268 }
269 }
270
271 std::sort(tiled_tags.begin(), tiled_tags.end());
272
273 Gpu::HostVector<unsigned int> h_ntags;
274 Gpu::HostVector<TagType> h_tags;
275 h_tags.reserve(tiled_tags.size());
276
277 for (unsigned int itag = 0; itag < tiled_tags.size(); ++itag) {
278 if (itag == 0) {
279 h_ntags.push_back(0);
280 } else if (tiled_tags[itag-1] != tiled_tags[itag]) {
281 h_ntags.push_back(itag);
282 }
283 auto const& ttag = tiled_tags[itag];
284 auto const& btag = a_tags[ttag.tag_index];
285 h_tags.push_back({btag.dfab, btag.dindex, btag.sfab,
286 btag.dbox & ttag.dindex_tilebox.second, btag.offset});
287 }
288 h_ntags.push_back((unsigned int)tiled_tags.size());
289
290 Gpu::DeviceVector<TagType> d_tags(h_tags.size());
291 Gpu::DeviceVector<unsigned int> d_ntags(h_ntags.size());
292 Gpu::copyAsync(Gpu::hostToDevice,h_tags.begin(),h_tags.end(),d_tags.begin());
293 Gpu::copyAsync(Gpu::hostToDevice,h_ntags.begin(),h_ntags.end(),d_ntags.begin());
294 auto const* ptag = d_tags.data();
295 auto const* pntags = d_ntags.data();
296 auto const nblocks = int(h_ntags.size()-1);
297 constexpr auto nthreads = 256;
298 amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
299#ifdef AMREX_USE_SYCL
300 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
301 [[sycl::reqd_work_group_size(nthreads)]]
302#else
303 [=] AMREX_GPU_DEVICE () noexcept
304#endif
305 {
306#ifdef AMREX_USE_SYCL
307 Dim1 blockIdx{item.get_group_linear_id()};
308 Dim1 threadIdx{item.get_local_linear_id()};
309#endif
310
311 for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
312 auto const tag = ptag[itag];
313 auto ncells = int(tag.dbox.numPts());
314 const auto len = amrex::length(tag.dbox);
315 const auto lo = amrex::lbound(tag.dbox);
316 for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
317 int k = icell / (len.x*len.y);
318 int j = (icell - k*(len.x*len.y)) / len.x;
319 int i = (icell - k*(len.x*len.y)) - j*len.x;
320 i += lo.x;
321 j += lo.y;
322 k += lo.z;
323 for (int n = 0; n < ncomp; ++n) {
324 f(tag.dfab.ptr(i,j,k,n+dcomp),
325 tag.sfab(i + tag.offset.x,
326 j + tag.offset.y,
327 k + tag.offset.z, n+scomp));
328 }
329 }
330
331 if (itag+1 < pntags[blockIdx.x+1]) {
332#ifdef AMREX_USE_SYCL
333 sycl::group_barrier(item.get_group());
334#else
335 __syncthreads();
336#endif
337 }
338 }
339 });
341}
342
343template <typename B, typename V, typename TT,
344 std::enable_if_t<amrex::HasAtomicAdd<V>::value,int> FOO = 0>
345void unpack_recv_buffer_gpu_atomic_add (char* pbuffer, TagVector<TT> const& tv,
346 int dcomp, int ncomp)
347{
348 detail::ParallelFor_doit(tv,
349 [=] AMREX_GPU_DEVICE (
350 int icell, int ncells, int i, int j, int k, TT const& tag) noexcept
351 {
352 if (icell < ncells) {
353 Array4<B const> sfab{(B const*)(pbuffer+tag.poff),
354 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
355 for (int n = 0; n < ncomp; ++n) {
356 Gpu::Atomic::AddNoRet(tag.dfab.ptr(i,j,k,n+dcomp),
357 (V)sfab(i,j,k,n));
358 }
359 }
360 });
361}
362
363template <typename B, typename V, typename TT,
364 std::enable_if_t<!amrex::HasAtomicAdd<V>::value,int> FOO = 0>
365void unpack_recv_buffer_gpu_atomic_add (char* pbuffer, TagVector<TT> const& tv,
366 int dcomp, int ncomp)
367{
368 amrex::ignore_unused(pbuffer, tv, dcomp, ncomp);
369 amrex::Abort("unpack_recv_buffer_gpu: should NOT get here");
370}
371
372#endif /* AMREX_USE_GPU */
373
374}
376
377template <class FAB>
378void
379FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
380{
381 auto const& LocTags = *(TheFB.m_LocTags);
382 auto N_locs = static_cast<int>(LocTags.size());
383 if (N_locs == 0) { return; }
384 bool is_thread_safe = TheFB.m_threadsafe_loc;
385 if (is_thread_safe)
386 {
387#ifdef AMREX_USE_OMP
388#pragma omp parallel for
389#endif
390 for (int i = 0; i < N_locs; ++i)
391 {
392 const CopyComTag& tag = LocTags[i];
393
394 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
395 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
396
397 const FAB* sfab = &(get(tag.srcIndex));
398 FAB* dfab = &(get(tag.dstIndex));
399 dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
400 }
401 }
402 else
403 {
405 for (int i = 0; i < N_locs; ++i)
406 {
407 const CopyComTag& tag = LocTags[i];
408
409 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
410 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
411
412 loc_copy_tags[tag.dstIndex].push_back
413 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
414 }
415#ifdef AMREX_USE_OMP
416#pragma omp parallel
417#endif
418 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
419 {
420 const auto& tags = loc_copy_tags[mfi];
421 auto dfab = this->array(mfi);
422 for (auto const & tag : tags)
423 {
424 auto const sfab = tag.sfab->array();
425 const auto offset = tag.offset.dim3();
426 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
427 [=] (int i, int j, int k, int n) noexcept
428 {
429 dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
430 });
431 }
432 }
433 }
434}
435
436template <class FAB>
437void
438FabArray<FAB>::FB_local_add_cpu (const FB& TheFB, int scomp, int ncomp)
439{
440 auto const& LocTags = *(TheFB.m_LocTags);
441 auto N_locs = static_cast<int>(LocTags.size());
442 if (N_locs == 0) { return; }
443
445 // We must make a temporary copy of the data first to avoid race condition.
446 std::vector<FAB> src_fabs(N_locs);
447 for (int itag = 0; itag < N_locs; ++itag) {
448 const CopyComTag& tag = LocTags[itag];
449 src_fabs[itag].resize(tag.sbox,ncomp);
450 loc_copy_tags[tag.dstIndex].push_back
451 (FabCopyTag<FAB>{&(src_fabs[itag]),
452 tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
453 }
454
455#ifdef AMREX_USE_OMP
456#pragma omp parallel for
457#endif
458 for (int itag = 0; itag < N_locs; ++itag) {
459 const CopyComTag& tag = LocTags[itag];
460 src_fabs[itag].template copy<RunOn::Host>(this->operator[](tag.srcIndex), scomp, 0, ncomp);
461 }
462
463#ifdef AMREX_USE_OMP
464#pragma omp parallel
465#endif
466 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
467 {
468 const auto& tags = loc_copy_tags[mfi];
469 const auto& dfab = this->array(mfi);
470 for (auto const & tag : tags)
471 {
472 auto const sfab = tag.sfab->array();
473 const auto offset = tag.offset.dim3();
474 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
475 [&] (int i, int j, int k, int n) noexcept
476 {
477 dfab(i,j,k,n+scomp) += sfab(i+offset.x,j+offset.y,k+offset.z,n);
478 });
479 }
480 }
481}
482
483#ifdef AMREX_USE_GPU
484
485template <class FAB>
488{
489 auto const& LocTags = *(TheFB.m_LocTags);
490 int N_locs = LocTags.size();
491
492 using TagType = Array4CopyTag<value_type>;
493
495 if (auto it = m_fb_local_copy_handler.find(TheFB.m_id);
496 it != m_fb_local_copy_handler.end())
497 {
498 tv = it->second.get();
499 } else {
500 Vector<TagType> loc_copy_tags;
501 loc_copy_tags.reserve(N_locs);
502
503 for (int i = 0; i < N_locs; ++i)
504 {
505 const CopyComTag& tag = LocTags[i];
506
507 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
508 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
509
510 int li = this->localindex(tag.dstIndex);
511 loc_copy_tags.push_back
512 ({this->atLocalIdx(li).array(), tag.dstIndex,
513 this->fabPtr(tag.srcIndex)->const_array(),
514 tag.dbox,
515 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
516 }
517
518 auto utv = std::make_unique<TagVector<TagType>>(loc_copy_tags);
519 tv = utv.get();
520 m_fb_local_copy_handler[TheFB.m_id] = std::move(utv);
521 }
522
523 return tv;
524}
525
526template <class FAB>
527void
528FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
529{
530 auto const& LocTags = *(TheFB.m_LocTags);
531 int N_locs = LocTags.size();
532 if (N_locs == 0) { return; }
533 bool is_thread_safe = TheFB.m_threadsafe_loc;
534
535 using TagType = Array4CopyTag<value_type>;
536
537 if (is_thread_safe || amrex::IsStoreAtomic<value_type>::value)
538 {
539 auto* tv = FB_get_local_copy_tag_vector(TheFB);
540
541 detail::ParallelFor_doit(*tv,
542 [=] AMREX_GPU_DEVICE (
543 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
544 {
545 if (icell < ncells) {
546 for (int n = 0; n < ncomp; ++n) {
547 tag.dfab(i,j,k,n+scomp) = tag.sfab(i+tag.offset.x,
548 j+tag.offset.y,
549 k+tag.offset.z,n+scomp);
550 }
551 }
552 });
553 }
554 else
555 {
556 Vector<TagType> loc_copy_tags;
557 loc_copy_tags.reserve(N_locs);
558
559 Vector<BaseFab<int>> maskfabs(this->local_size());
560 Vector<Array4Tag<int> > masks_unique;
561 masks_unique.reserve(this->local_size());
562 Vector<Array4Tag<int> > masks;
563 masks.reserve(N_locs);
564
565 for (int i = 0; i < N_locs; ++i)
566 {
567 const CopyComTag& tag = LocTags[i];
568
569 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
570 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
571
572 int li = this->localindex(tag.dstIndex);
573 loc_copy_tags.push_back
574 ({this->atLocalIdx(li).array(), tag.dstIndex,
575 this->fabPtr(tag.srcIndex)->const_array(),
576 tag.dbox,
577 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
578
579 if (!maskfabs[li].isAllocated()) {
580 maskfabs[li].resize(this->atLocalIdx(li).box());
581 masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
582 }
583 masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
584 }
585
586 amrex::ParallelFor(masks_unique,
587 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
588 {
589 msk.dfab(i,j,k) = 0;
590 });
591
592 detail::fab_to_fab_atomic_cpy<value_type, value_type>(
593 loc_copy_tags, scomp, scomp, ncomp, masks);
594 }
595}
596
597template <class FAB>
598void
599FabArray<FAB>::FB_local_add_gpu (const FB& TheFB, int scomp, int ncomp, bool deterministic)
600{
601 auto const& LocTags = *(TheFB.m_LocTags);
602 int N_locs = LocTags.size();
603 if (N_locs == 0) { return; }
604
605 using TagType = Array4CopyTag<value_type>;
606
607 Vector<TagType> loc_copy_tags_1, loc_copy_tags_2;
608 loc_copy_tags_1.reserve(N_locs);
609 loc_copy_tags_2.reserve(N_locs);
610
611 Vector<FAB> src_fabs(N_locs);
612 for (int itag = 0; itag < N_locs; ++itag) {
613 const CopyComTag& tag = LocTags[itag];
614 src_fabs[itag].resize(tag.sbox,ncomp);
615 loc_copy_tags_1.push_back(
616 TagType{src_fabs[itag].array(), -1,
617 this->const_array(tag.srcIndex,scomp), tag.sbox,
618 Dim3{0,0,0}});
619 loc_copy_tags_2.push_back(
620 TagType{this->array(tag.dstIndex,scomp), tag.dstIndex,
621 src_fabs[itag].const_array(), tag.dbox,
622 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
623 }
624
625 // Note that we have shifted the starting component to zero in the code above.
626
627 // TODO: We could try to cache the tags like in FillBoundary.
628
629 detail::fab_to_fab(loc_copy_tags_1, 0, 0, ncomp,
630 detail::CellStore<value_type, value_type>{});
631 if (deterministic || ! amrex::HasAtomicAdd<value_type>::value) {
632 detail::deterministic_fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
633 detail::CellAdd<value_type,value_type>{});
634 } else {
636 detail::fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
637 detail::CellAtomicAdd<value_type, value_type>{});
638 }
639 ((void)0);
640 }
641
642 // Note that fab_to_fab is synchronous.
643}
644
645template <class FAB>
646void
648 const CommMetaData& thecmd, int scomp, int ncomp)
649{
650 auto const& LocTags = *(thecmd.m_LocTags);
651 int N_locs = LocTags.size();
652 if (N_locs == 0) { return; }
653 bool is_thread_safe = thecmd.m_threadsafe_loc;
654
655 using TagType = Array4BoxTag<value_type>;
656 Vector<TagType> loc_setval_tags;
657 loc_setval_tags.reserve(N_locs);
658
660
661 for (int i = 0; i < N_locs; ++i)
662 {
663 const CopyComTag& tag = LocTags[i];
664 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
665 loc_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
666 }
667
668 amrex::ParallelFor(loc_setval_tags, ncomp,
669 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
670 {
671 tag.dfab(i,j,k,n+scomp) = x;
672 });
673}
674
675template <class FAB>
676void
678 const CommMetaData& thecmd, int scomp, int ncomp)
679{
680 auto const& RcvTags = *(thecmd.m_RcvTags);
681 bool is_thread_safe = thecmd.m_threadsafe_rcv;
682
683 using TagType = Array4BoxTag<value_type>;
684 Vector<TagType> rcv_setval_tags;
685
686 for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
687 for (auto const& tag: it->second) {
688 rcv_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
689 }
690 }
691
692 if (rcv_setval_tags.empty()) { return; }
693
695
696 amrex::ParallelFor(rcv_setval_tags, ncomp,
697 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
698 {
699 tag.dfab(i,j,k,n+scomp) = x;
700 });
701}
702
703#if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
704template <class FAB>
705void
706FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
707{
708 const int N_locs = (*TheFB.m_LocTags).size();
710 for (int i = 0; i < N_locs; ++i)
711 {
712 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
713
714 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
715 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
716
717 loc_copy_tags[tag.dstIndex].push_back
718 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
719 }
720
721 // Create Graph if one is needed.
722 if ( !(TheFB.m_localCopy.ready()) )
723 {
724 const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);
725
726 int idx = 0;
727 // Record the graph.
728 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
729 {
730 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
731 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
732 (TheFB).m_localCopy.getDevicePtr(0),
733 std::size_t(sizeof(CopyMemory)*N_locs) );
734
735 const auto& tags = loc_copy_tags[mfi];
736 for (auto const & tag : tags)
737 {
738 const auto offset = tag.offset.dim3();
739 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
740 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
741 {
742 // Build the Array4's.
743 auto const dst = cmem->getDst<value_type>();
744 auto const src = cmem->getSrc<value_type>();
745 for (int n = 0; n < cmem->ncomp; ++n) {
746 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
747 }
748 });
749 }
750
751 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
752 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
753 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
754 }
755 }
756
757 // Setup Launch Parameters
758 // This is perfectly threadable, right?
759 // Additional optimization -> Check to see whether values need to be reset?
760 // Can then remove this setup and memcpy from CudaGraph::executeGraph.
761 int idx = 0;
762 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
763 {
764 auto const dst_array = this->array(mfi);
765 const auto& tags = loc_copy_tags[mfi];
766 for (auto const & tag : tags)
767 {
768 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
769 dst_array,
770 scomp, ncomp));
771 }
772 }
773
774 // Launch Graph
775 TheFB.m_localCopy.executeGraph();
776}
777
778#ifdef AMREX_USE_MPI
779template <class FAB>
780void
781FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
782{
783 const int N_locs = TheFB.m_LocTags->size();
784
785 int launches = 0; // Used for graphs only.
786 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
787 for (int i = 0; i < N_locs; ++i)
788 {
789 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
790
791 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
792 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));
793
794 if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
795 {
796 loc_copy_tags[tag.dstIndex].push_back
797 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
798 launches++;
799 }
800 }
801
802 FillBoundary_test();
803
804 if ( !(TheFB.m_localCopy.ready()) )
805 {
806 const_cast<FB&>(TheFB).m_localCopy.resize(launches);
807
808 int idx = 0;
809 int cuda_stream = 0;
810 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
811 {
812 const auto& tags = loc_copy_tags[mfi];
813 for (int t = 0; t<tags.size(); ++t)
814 {
815 Gpu::Device::setStreamIndex(cuda_stream++);
816 amrex::Gpu::Device::startGraphRecording( (idx == 0),
817 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
818 (TheFB).m_localCopy.getDevicePtr(0),
819 std::size_t(sizeof(CopyMemory)*launches) );
820
821 const auto& tag = tags[t];
822 const Dim3 offset = tag.offset.dim3();
823
824 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
825 AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
826 {
827 auto const dst = cmem->getDst<value_type>();
828 auto const src = cmem->getSrc<value_type>();
829 for (int n = 0; n < cmem->ncomp; ++n) {
830 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
831 }
832 });
833
834 bool last_iter = idx == launches;
835 cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
836 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
837 }
838 }
839 }
840
841 // Setup Launch Parameters
842 // This is perfectly threadable, right?
843 int idx = 0;
844 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
845 {
846 const auto& dst_array = this->array(mfi);
847 const auto& tags = loc_copy_tags[mfi];
848 for (auto const & tag : tags)
849 {
850 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
851 dst_array,
852 scomp, ncomp));
853 }
854 }
855
856 // Launch Graph without synch. Local work is entirely independent.
857 TheFB.m_localCopy.executeGraph(false);
858}
859#endif /* AMREX_USE_MPI */
860
861#endif /* __CUDACC__ */
862
863#endif /* AMREX_USE_GPU */
864
865#ifdef AMREX_USE_MPI
866
867#ifdef AMREX_USE_GPU
868
869#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
870
871template <class FAB>
872void
873FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
874 Vector<char*>& send_data,
875 Vector<std::size_t> const& send_size,
876 Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
877{
// Pack the data destined for other processes into the contiguous send
// buffers using a recorded CUDA graph cached on TheFB.m_copyToBuffer.
// On the first call the copy kernels are recorded into a graph; later
// calls only refresh the per-launch CopyMemory parameters and replay it.
878 const int N_snds = send_data.size();
879 if (N_snds == 0) { return; }
880
// Record the graph once; m_copyToBuffer.ready() is false until setGraph().
881 if ( !(TheFB.m_copyToBuffer.ready()) )
882 {
883 // Set size of CudaGraph buffer.
884 // Is the conditional ever expected false?
// One launch per copy tag of every non-empty send.
885 int launches = 0;
886 for (int send = 0; send < N_snds; ++send) {
887 if (send_size[send] > 0) {
888 launches += send_cctc[send]->size();
889 }
890 }
// TheFB is logically mutable cache state, hence the const_casts below.
891 const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);
892
893 // Record the graph.
894 int idx = 0;
895 for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
896 sit.isValid(); ++sit)
897 {
// Recording starts on the first stream iteration and spans all of them.
898 amrex::Gpu::Device::startGraphRecording( (sit() == 0),
899 const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
900 (TheFB).m_copyToBuffer.getDevicePtr(0),
901 std::size_t(sizeof(CopyMemory)*launches) );
902
903 const int j = sit();
904 if (send_size[j] > 0)
905 {
906 auto const& cctc = *send_cctc[j];
907 for (auto const& tag : cctc)
908 {
909 const Box& bx = tag.sbox;
// The kernel reads src/dst/scomp/ncomp from its CopyMemory slot at
// replay time, so the same graph can be reused with new parameters.
910 CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
911 AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
912 {
913 auto const pfab = cmem->getDst<value_type>();
914 auto const sfab = cmem->getSrc<value_type>();
915 for (int n = 0; n < cmem->ncomp; ++n)
916 {
917 pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
918 }
919 });
920 }
921 }
922
923 bool last_iter = sit() == (N_snds-1);
924 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
925 if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
926 }
927 }
928
929 // Setup Launch Parameters
// Point each recorded launch at this call's buffer slices and components.
930 int idx = 0;
931 for (int send = 0; send < N_snds; ++send)
932 {
933 const int j = send;
934 if (send_size[j] > 0)
935 {
936 char* dptr = send_data[j];
937 auto const& cctc = *send_cctc[j];
938 for (auto const& tag : cctc)
939 {
940 const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
941 amrex::makeArray4((value_type*)(dptr),
942 tag.sbox,
943 ncomp),
944 scomp, ncomp));
945
// Advance past this tag's slot; tags are packed back to back.
946 dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
947 }
948 amrex::ignore_unused(send_size);
949 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
950 }
951 }
952
953 // Launch Graph synched, so copyToBuffer is complete prior to posting sends.
954 TheFB.m_copyToBuffer.executeGraph();
955}
956
957template <class FAB>
958void
959FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
960 Vector<char*> const& recv_data,
961 Vector<std::size_t> const& recv_size,
962 Vector<CopyComTagsContainer const*> const& recv_cctc,
963 bool /*is_thread_safe*/)
964{
// Unpack received ghost data from the contiguous receive buffers into the
// destination FABs via a recorded CUDA graph cached on m_copyFromBuffer.
// First call records the kernels; later calls refresh the CopyMemory
// parameters and replay the graph.
965 const int N_rcvs = recv_cctc.size();
966 if (N_rcvs == 0) { return; }
967
// Gather, per local destination FAB, the (buffer pointer, dest box) pairs.
968 int launches = 0;
969 LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
970 for (int k = 0; k < N_rcvs; ++k)
971 {
972 if (recv_size[k] > 0)
973 {
974 const char* dptr = recv_data[k];
975 auto const& cctc = *recv_cctc[k];
976 for (auto const& tag : cctc)
977 {
978 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
// Tags are packed back to back in the receive buffer.
979 dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
980 launches++;
981 }
982 amrex::ignore_unused(recv_size);
983 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
984 }
985 }
986
// Record the graph once; ready() is false until setGraph() below.
987 if ( !(TheFB.m_copyFromBuffer.ready()) )
988 {
989 const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);
990
991 int idx = 0;
992 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
993 {
// Recording starts on the first local FAB and spans the whole MFIter loop.
994 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
995 const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
996 (TheFB).m_copyFromBuffer.getDevicePtr(0),
997 std::size_t(sizeof(CopyMemory)*launches) );
998
999 const auto& tags = recv_copy_tags[mfi];
1000 for (auto const & tag : tags)
1001 {
// Kernel reads src/dst/scomp/ncomp from its CopyMemory slot at replay time.
1002 CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
1003 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
1004 {
1005 auto const pfab = cmem->getSrc<value_type>();
1006 auto const dfab = cmem->getDst<value_type>();
1007 for (int n = 0; n < cmem->ncomp; ++n)
1008 {
1009 dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
1010 }
1011 });
1012 }
1013
1014 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
1015 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
1016 if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
1017 }
1018 }
1019
1020 // Setup graph.
// Point each recorded launch at this call's buffer slots and components.
1021 int idx = 0;
1022 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
1023 {
1024 auto dst_array = this->array(mfi);
1025 const auto & tags = recv_copy_tags[mfi];
1026 for (auto const & tag : tags)
1027 {
1028 const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
1029 tag.dbox,
1030 ncomp),
1031 dst_array,
1032 dcomp, ncomp));
1033 }
1034 }
1035
1036 // Launch Graph - synced because next action is freeing recv buffer.
1037 TheFB.m_copyFromBuffer.executeGraph();
1038}
1039
1040#endif /* __CUDACC__ */
1041
1042template <class FAB>
1043template <typename BUF>
1044auto
1046 Vector<std::size_t> const& send_size,
1047 Vector<CopyComTagsContainer const*> const& send_cctc,
1048 int ncomp, std::uint64_t id) const
1050{
1051 using TagType = CommSendBufTag<value_type>;
1052
1053 auto kit = std::find_if(send_cctc.begin(), send_cctc.end(),
1054 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1055 if (kit == send_cctc.end()) {
1056 return nullptr;
1057 }
1058
1059 auto get_tags = [&] () -> Vector<TagType>
1060 {
1061 Vector<TagType> snd_copy_tags;
1062 char* pbuf = send_data[0];
1063 const int N_snds = send_data.size();
1064 for (int j = 0; j < N_snds; ++j)
1065 {
1066 if (send_size[j] > 0)
1067 {
1068 char* dptr = send_data[j];
1069 auto const& cctc = *send_cctc[j];
1070 for (auto const& tag : cctc)
1071 {
1072 snd_copy_tags.emplace_back
1073 (TagType{this->const_array(tag.srcIndex), dptr-pbuf, tag.sbox});
1074 dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
1075 }
1076 }
1077 }
1078 return snd_copy_tags;
1079 };
1080
1082 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1083
1084 if (auto it = m_send_copy_handler.find(key); it != m_send_copy_handler.end()) {
1085 tv = it->second.get();
1086 } else {
1087 if (m_send_copy_handler.size() > 32) {
1088 // Just in case. If this is used in ParallelCopy, it's possible
1089 // that the sending FabArray is the same, but the receiving
1090 // FabArray is different every time. Then the size of this map
1091 // could increase indefinitely.
1092 m_send_copy_handler.clear();
1093 }
1094 auto snd_copy_tags = get_tags();
1095 auto utv = std::make_unique<TagVector<TagType>>(snd_copy_tags);
1096 tv = utv.get();
1097 m_send_copy_handler[key] = std::move(utv);
1098 }
1099
1100 return tv;
1101}
1102
1103template <class FAB>
1104template <typename BUF>
1105void
1106FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
1107 Vector<char*> const& send_data,
1108 Vector<std::size_t> const& send_size,
1109 Vector<CopyComTagsContainer const*> const& send_cctc,
1110 std::uint64_t id)
1111{
1112 const int N_snds = send_data.size();
1113 if (N_snds == 0) { return; }
1114
1115 using TagType = CommSendBufTag<value_type>;
1116
1117 auto* tv = src.template get_send_copy_tag_vector<BUF>
1118 (send_data, send_size, send_cctc, ncomp, id);
1119 if (tv == nullptr) { return; }
1120
1121 char* pbuffer = send_data[0];
1122
1123 detail::ParallelFor_doit(*tv,
1124 [=] AMREX_GPU_DEVICE (
1125 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1126 {
1127 if (icell < ncells) {
1128 Array4<BUF> dfab{(BUF*)(pbuffer+tag.poff),
1129 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1130 for (int n = 0; n < ncomp; ++n) {
1131 dfab(i,j,k,n) = (BUF)tag.sfab(i,j,k,n+scomp);
1132 }
1133 }
1134 });
1135
1136 Gpu::streamSynchronize();
1137}
1138
1139template <class FAB>
1140template <typename BUF>
1141auto
1143 Vector<std::size_t> const& recv_size,
1144 Vector<CopyComTagsContainer const*> const& recv_cctc,
1145 int ncomp, std::uint64_t id)
1147{
1148 using TagType = CommRecvBufTag<value_type>;
1149
1150 auto kit = std::find_if(recv_cctc.begin(), recv_cctc.end(),
1151 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1152 if (kit == recv_cctc.end()) {
1153 return nullptr;
1154 }
1155
1156 auto get_tags = [&] () -> Vector<TagType>
1157 {
1158 Vector<TagType> recv_copy_tags;
1159 char* pbuf = recv_data[0];
1160 const int N_rcvs = recv_cctc.size();
1161 for (int k = 0; k < N_rcvs; ++k)
1162 {
1163 if (recv_size[k] > 0)
1164 {
1165 char* dptr = recv_data[k];
1166 auto const& cctc = *recv_cctc[k];
1167 for (auto const& tag : cctc)
1168 {
1169 const int li = this->localindex(tag.dstIndex);
1170 recv_copy_tags.emplace_back
1171 (TagType{this->atLocalIdx(li).array(), dptr-pbuf, tag.dbox});
1172 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1173 }
1174 }
1175 }
1176 return recv_copy_tags;
1177 };
1178
1180 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1181
1182 if (auto it = m_recv_copy_handler.find(key); it != m_recv_copy_handler.end()) {
1183 tv = it->second.get();
1184 } else {
1185 if (m_recv_copy_handler.size() > 32) {
1186 // Just in case. If this is used in ParallelCopy, it's possible
1187 // that the receiving FabArray is the same, but the sending
1188 // FabArray is different every time. Then the size of this map
1189 // could increase indefinitely.
1190 m_recv_copy_handler.clear();
1191 }
1192 auto recv_copy_tags = get_tags();
1193 auto utv = std::make_unique<TagVector<TagType>>(recv_copy_tags);
1194 tv = utv.get();
1195 m_recv_copy_handler[key] = std::move(utv);
1196 }
1197
1198 return tv;
1199}
1200
// Unpack the MPI receive buffer into the destination FabArray with GPU
// kernels.  Four strategies are visible below:
//   * deterministic ADD via detail::deterministic_fab_to_fab (presumably
//     for bitwise-reproducible sums -- only CpOp::ADD is asserted),
//   * plain copy/add driven by a cached TagVector when no mask is needed,
//   * an atomic-add helper for overlapping destinations when the element
//     type has atomic add,
//   * a mask-based atomic path when the element type supports neither
//     atomic store nor atomic add.
template <class FAB>
template <typename BUF>
void
// NOTE(review): the extraction this chunk came from dropped the declaration
// line here (original line 1204).  By symmetry with unpack_recv_buffer_cpu
// it should read
//   FabArray<FAB>::unpack_recv_buffer_gpu (FabArray<FAB>& dst, int dcomp, int ncomp,
// -- confirm against the upstream AMReX_FBI.H.
                                       Vector<char*> const& recv_data,
                                       Vector<std::size_t> const& recv_size,
                                       Vector<CopyComTagsContainer const*> const& recv_cctc,
                                       CpOp op, bool is_thread_safe, std::uint64_t id,
                                       bool deterministic)
{
    const int N_rcvs = recv_cctc.size();
    if (N_rcvs == 0) { return; }

    // The mask path is needed when destinations may overlap (not thread
    // safe) AND the element type cannot be stored/added atomically.
    bool use_mask = false;
    if (!is_thread_safe)
    {
        if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
            (op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))
        {
            use_mask = true;
        }
    }

    if (deterministic)
    {
        AMREX_ALWAYS_ASSERT(op == FabArrayBase::ADD); // Only ADD for now
        using TagType = Array4CopyTag<value_type,BUF>;
        Vector<TagType> tags;
        tags.reserve(N_rcvs); // reserves one slot per sender, not per tag
        for (int k = 0; k < N_rcvs; ++k) {
            if (recv_size[k] > 0) {
                char const* dptr = recv_data[k];
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc) {
                    // Source side of the tag is an Array4 view directly into
                    // the raw receive buffer, typed as BUF.
                    tags.emplace_back(
                        TagType{dst.array(tag.dstIndex), tag.dstIndex,
                                Array4<BUF const>((BUF const*)dptr,
                                                  amrex::begin(tag.dbox),
                                                  amrex::end(tag.dbox), ncomp),
                                tag.dbox, Dim3{0,0,0}});
                    dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
                }
            }
        }
        // NOTE(review): original line 1245 was dropped by the extraction.
        // It presumably opens an "if constexpr (...)" that guards the
        // CellAdd path for value_types providing operator+= (cf. the Abort
        // message in the else branch) -- confirm upstream; as shown, this
        // region is syntactically incomplete.
            detail::deterministic_fab_to_fab<value_type,BUF>
                (tags, 0, dcomp, ncomp, detail::CellAdd<value_type,BUF>{});
        } else {
            amrex::Abort("SumBoundary requires operator+=");
        }
    }
    else if (!use_mask)
    {
        // Fast path: one fused kernel over the cached TagVector.
        using TagType = CommRecvBufTag<value_type>;
        auto* tv = dst.template get_recv_copy_tag_vector<BUF>
            (recv_data, recv_size, recv_cctc, ncomp, id);
        if (tv == nullptr) { return; }

        char* pbuffer = recv_data[0]; // tag offsets are relative to this

        if (op == FabArrayBase::COPY)
        {
            detail::ParallelFor_doit(*tv,
            [=] AMREX_GPU_DEVICE (
                int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
            {
                if (icell < ncells) {
                    // View the tag's slot in the receive buffer as an Array4.
                    Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
                                           amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
                    for (int n = 0; n < ncomp; ++n) {
                        tag.dfab(i,j,k,n+dcomp) = (value_type)sfab(i,j,k,n);
                    }
                }
            });
        }
        else
        {
            if (is_thread_safe) {
                // No overlap: plain += is safe without atomics.
                detail::ParallelFor_doit(*tv,
                [=] AMREX_GPU_DEVICE (
                    int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
                {
                    if (icell < ncells) {
                        Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
                                               amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
                        for (int n = 0; n < ncomp; ++n) {
                            tag.dfab(i,j,k,n+dcomp) += (value_type)sfab(i,j,k,n);
                        }
                    }
                });
            } else {
                // Overlapping destinations, but value_type has atomic add.
                detail::unpack_recv_buffer_gpu_atomic_add<BUF, value_type>
                    (pbuffer, *tv, dcomp, ncomp);
            }
        }
        // Block before the caller frees/reuses the receive buffer.
        Gpu::streamSynchronize();
    }
    else
    {
        // Mask path: per-cell integer masks serialize writers to the same
        // cell because the element type has no usable atomic operation.
        char* pbuffer = recv_data[0];

        using TagType = Array4CopyTag<value_type, BUF>;
        Vector<TagType> recv_copy_tags;
        recv_copy_tags.reserve(N_rcvs);

        // One mask FAB per local destination FAB, allocated on first use.
        Vector<BaseFab<int> > maskfabs(dst.local_size());
        Vector<Array4Tag<int> > masks_unique; // one entry per allocated mask
        masks_unique.reserve(dst.local_size());
        Vector<Array4Tag<int> > masks;        // one entry per copy tag

        for (int k = 0; k < N_rcvs; ++k)
        {
            if (recv_size[k] > 0)
            {
                std::size_t offset = recv_data[k]-recv_data[0];
                const char* dptr = pbuffer + offset;
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc)
                {
                    const int li = dst.localindex(tag.dstIndex);
                    recv_copy_tags.emplace_back(TagType{
                        dst.atLocalIdx(li).array(), tag.dstIndex,
                        amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
                        tag.dbox,
                        Dim3{0,0,0}
                        });
                    dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);

                    if (!maskfabs[li].isAllocated()) {
                        maskfabs[li].resize(dst.atLocalIdx(li).box());
                        masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
                    }
                    masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
                }
                BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
            }
        }

        // Zero every mask once before the masked copy/add kernels run.
        amrex::ParallelFor(masks_unique,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
        {
            msk.dfab(i,j,k) = 0;
        });
        if (op == FabArrayBase::COPY)
        {
            detail::fab_to_fab_atomic_cpy<value_type, BUF>(
                recv_copy_tags, 0, dcomp, ncomp, masks);
        }
        else
        {
            detail::fab_to_fab_atomic_add<value_type, BUF>(
                recv_copy_tags, 0, dcomp, ncomp, masks);
        }

        // There is Gpu::streamSynchronize in fab_to_fab.
        // NOTE(review): the closing brace of this else-block (original line
        // 1357) was dropped by the extraction; as shown, this function body
        // is missing one '}'.
}
1360#endif /* AMREX_USE_GPU */
1361
1362template <class FAB>
1363template <typename BUF>
1364void
1365FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
1366 Vector<char*> const& send_data,
1367 Vector<std::size_t> const& send_size,
1368 Vector<CopyComTagsContainer const*> const& send_cctc)
1369{
1370 amrex::ignore_unused(send_size);
1371
1372 auto const N_snds = static_cast<int>(send_data.size());
1373 if (N_snds == 0) { return; }
1374
1375#ifdef AMREX_USE_OMP
1376#pragma omp parallel for
1377#endif
1378 for (int j = 0; j < N_snds; ++j)
1379 {
1380 if (send_size[j] > 0)
1381 {
1382 char* dptr = send_data[j];
1383 auto const& cctc = *send_cctc[j];
1384 for (auto const& tag : cctc)
1385 {
1386 const Box& bx = tag.sbox;
1387 auto const sfab = src.array(tag.srcIndex);
1388 auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
1389 amrex::LoopConcurrentOnCpu( bx, ncomp,
1390 [=] (int ii, int jj, int kk, int n) noexcept
1391 {
1392 pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
1393 });
1394 dptr += (bx.numPts() * ncomp * sizeof(BUF));
1395 }
1396 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
1397 }
1398 }
1399}
1400
1401template <class FAB>
1402template <typename BUF>
1403void
1405 Vector<char*> const& recv_data,
1406 Vector<std::size_t> const& recv_size,
1407 Vector<CopyComTagsContainer const*> const& recv_cctc,
1408 CpOp op, bool is_thread_safe)
1409{
1410 amrex::ignore_unused(recv_size);
1411
1412 auto const N_rcvs = static_cast<int>(recv_cctc.size());
1413 if (N_rcvs == 0) { return; }
1414
1415 if (is_thread_safe)
1416 {
1417#ifdef AMREX_USE_OMP
1418#pragma omp parallel for
1419#endif
1420 for (int k = 0; k < N_rcvs; ++k)
1421 {
1422 if (recv_size[k] > 0)
1423 {
1424 const char* dptr = recv_data[k];
1425 auto const& cctc = *recv_cctc[k];
1426 for (auto const& tag : cctc)
1427 {
1428 const Box& bx = tag.dbox;
1429 FAB& dfab = dst[tag.dstIndex];
1430 if (op == FabArrayBase::COPY)
1432 dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
1433 }
1434 else
1435 {
1436 dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
1437 }
1438 dptr += bx.numPts() * ncomp * sizeof(BUF);
1439 }
1440 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1441 }
1442 }
1443 }
1444 else
1445 {
1446 LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
1447 recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
1448 for (int k = 0; k < N_rcvs; ++k)
1449 {
1450 if (recv_size[k] > 0)
1451 {
1452 const char* dptr = recv_data[k];
1453 auto const& cctc = *recv_cctc[k];
1454 for (auto const& tag : cctc)
1455 {
1456 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
1457 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1458 }
1459 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1460 }
1461 }
1462
1463#ifdef AMREX_USE_OMP
1464#pragma omp parallel
1465#endif
1466 for (MFIter mfi(dst); mfi.isValid(); ++mfi)
1467 {
1468 const auto& tags = recv_copy_tags[mfi];
1469 auto dfab = dst.array(mfi);
1470 for (auto const & tag : tags)
1471 {
1472 auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
1473 if (op == FabArrayBase::COPY)
1474 {
1475 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1476 [=] (int i, int j, int k, int n) noexcept
1477 {
1478 dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1479 });
1480 }
1481 else
1482 {
1483 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1484 [=] (int i, int j, int k, int n) noexcept
1485 {
1486 dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
1487 });
1488 }
1489 }
1490 }
1491 }
1492}
1493
1494#endif /* AMREX_USE_MPI */
1495
1496}
1497
1498#endif
#define BL_ASSERT(EX)
Definition AMReX_BLassert.H:39
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HOST_DEVICE_FOR_3D(...)
Definition AMReX_GpuLaunchMacrosC.nolint.H:106
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:356
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:111
CopyComTag::CopyComTagsContainer CopyComTagsContainer
Definition AMReX_FabArrayBase.H:220
int localindex(int K) const noexcept
Return local index in the vector of FABs.
Definition AMReX_FabArrayBase.H:119
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition AMReX_FabArrayBase.H:131
int local_size() const noexcept
Return the number of local FABs in the FabArray.
Definition AMReX_FabArrayBase.H:113
CpOp
parallel copy or add
Definition AMReX_FabArrayBase.H:394
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition AMReX_FabArrayBase.H:95
An Array of FortranArrayBox(FAB)-like Objects.
Definition AMReX_FabArray.H:349
typename std::conditional_t< IsBaseFab< FAB >::value, FAB, FABType >::value_type value_type
Definition AMReX_FabArray.H:360
void CMD_remote_setVal_gpu(value_type x, const CommMetaData &thecmd, int scomp, int ncomp)
Definition AMReX_FBI.H:677
void FB_local_add_cpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:438
void FB_local_add_gpu(const FB &TheFB, int scomp, int ncomp, bool deterministic)
Definition AMReX_FBI.H:599
Array4< typename FabArray< FAB >::value_type const > array(const MFIter &mfi) const noexcept
Definition AMReX_FabArray.H:565
void FB_local_copy_gpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:528
void CMD_local_setVal_gpu(value_type x, const CommMetaData &thecmd, int scomp, int ncomp)
Definition AMReX_FBI.H:647
void FB_local_copy_cpu(const FB &TheFB, int scomp, int ncomp)
Definition AMReX_FBI.H:379
TagVector< Array4CopyTag< value_type > > const * FB_get_local_copy_tag_vector(const FB &TheFB)
Definition AMReX_FBI.H:486
FAB & atLocalIdx(int L) noexcept
Return a reference to the FAB associated with local index L.
Definition AMReX_FabArray.H:535
a one-thingy-per-box distributed object
Definition AMReX_LayoutData.H:13
Iterator for looping ever tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:85
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:169
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Long size() const noexcept
Definition AMReX_Vector.H:53
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
__host__ __device__ AMREX_FORCE_INLINE T Exch(T *address, T val) noexcept
Definition AMReX_GpuAtomic.H:488
__host__ __device__ AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition AMReX_GpuAtomic.H:514
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:284
__host__ __device__ AMREX_FORCE_INLINE T Add(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:201
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ Array4< T > makeArray4(T *p, Box const &bx, int ncomp) noexcept
Definition AMReX_BaseFab.H:92
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2866
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2006
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
const int[]
Definition AMReX_BLProfiler.cpp:1664
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2015
BoxArray const & boxArray(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2861
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1230
Definition AMReX_TagParallelFor.H:58
Definition AMReX_TagParallelFor.H:26
Definition AMReX_TagParallelFor.H:50
Array4< T > dfab
Definition AMReX_TagParallelFor.H:51
A multidimensional array accessor.
Definition AMReX_Array4.H:283
Definition AMReX_TagParallelFor.H:106
Definition AMReX_TagParallelFor.H:116
Definition AMReX_Dim3.H:12
Definition AMReX_FabArrayBase.H:472
bool m_threadsafe_loc
Definition AMReX_FabArrayBase.H:474
bool m_threadsafe_rcv
Definition AMReX_FabArrayBase.H:475
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition AMReX_FabArrayBase.H:478
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition AMReX_FabArrayBase.H:476
Used by a bunch of routines when communicating via MPI.
Definition AMReX_FabArrayBase.H:195
Box sbox
Definition AMReX_FabArrayBase.H:197
int srcIndex
Definition AMReX_FabArrayBase.H:199
Box dbox
Definition AMReX_FabArrayBase.H:196
int dstIndex
Definition AMReX_FabArrayBase.H:198
FillBoundary.
Definition AMReX_FabArrayBase.H:488
Definition AMReX_FBI.H:7
IntVect offset
Definition AMReX_FBI.H:10
Box dbox
Definition AMReX_FBI.H:9
FAB const * sfab
Definition AMReX_FBI.H:8
Definition AMReX_TypeTraits.H:56
Definition AMReX_TypeTraits.H:66
Definition AMReX_TypeTraits.H:297
Definition AMReX_TagParallelFor.H:158
Definition AMReX_FBI.H:13
Box dbox
Definition AMReX_FBI.H:15
char const * p
Definition AMReX_FBI.H:14