Block-Structured AMR Software Framework
 
Loading...
Searching...
No Matches
AMReX_FBI.H
#ifndef AMREX_FBI_H_
#define AMREX_FBI_H_

namespace amrex {

template <class FAB>
struct FabCopyTag {
    FAB const* sfab;
    Box dbox;
    IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
};

struct VoidCopyTag {
    char const* p;
    Box dbox;
};

namespace detail {

#ifdef AMREX_USE_GPU

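// Cell-wise combine functors used by the copy kernels below: plain store,
// plain add, and atomic add (the latter only enabled for types that support
// atomicAdd).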
template <class T0, class T1>
struct CellStore
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
    operator() (T0* d, T1 s) const noexcept
    {
        *d = static_cast<T0>(s);
    }
};

template <class T0, class T1>
struct CellAdd
{
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
    operator() (T0* d, T1 s) const noexcept
    {
        *d += static_cast<T0>(s);
    }
};

template <class T0, class T1>
struct CellAtomicAdd
{
    template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
    operator() (U0* d, T1 s) const noexcept
    {
        Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
    }
};

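// Flatten all copy tags into one TagVector and launch a single fused kernel
// that applies the combine functor f to every cell of every tag.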
template <class T0, class T1, class F>
void
fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
            F && f)
{
    TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};

    detail::ParallelFor_doit(tv,
    [=] AMREX_GPU_DEVICE (
        int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const& tag) noexcept
    {
        if (icell < ncells) {
            for (int n = 0; n < ncomp; ++n) {
                f(&(tag.dfab(i,j,k,n+dcomp)),
                  tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
            }
        }
    });
}

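// Store path for non-thread-safe tags: the per-cell mask counts arrivals, and
// only the first thread to reach a cell writes it, so overlapping tags cannot
// race on the destination.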
template <class TagType, class F>
void
fab_to_fab_store (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&& f)
{
    amrex::ParallelFor(tags,
    [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
    {
        int m = Gpu::Atomic::Add(&(tag.mask(i,j,k)), 1);
        if (m == 0) {
            for (int n = 0; n < ncomp; ++n) {
                f(&(tag.dfab(i,j,k,n+dcomp)),
                  tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
            }
        }
    });
}

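// General update path for non-thread-safe tags: each cell's mask word acts as
// a spin lock.  A thread acquires it with an atomic exchange/CAS, applies f
// under device-wide memory fences, releases it, and otherwise backs off with
// an architecture-specific short sleep.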
template <class TagType, class F>
void
fab_to_fab_other (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&& f)
{
    amrex::ParallelFor(tags,
    [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
    {
        int* m = &(tag.mask(i,j,k));
        bool my_turn = false;
        do {
#if defined(AMREX_USE_SYCL)
            my_turn = (Gpu::Atomic::Exch(m, 1) == 0);
#else
            my_turn = (Gpu::Atomic::CAS(m, 0, 1) == 0);
#endif
            if (my_turn) {
#if defined(AMREX_USE_SYCL)
                sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
#else
                __threadfence();
#endif
                for (int n = 0; n < ncomp; ++n) {
                    f(&(tag.dfab(i,j,k,n+dcomp)),
                      tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
                }
#if defined(AMREX_USE_SYCL)
                sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
#else
                __threadfence(); // It appears that we need this fence too if a GPU is shared.
#endif
                Gpu::Atomic::Exch(m, 0);
            }
            else {
#if defined(AMREX_USE_CUDA)

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
#if defined(_WIN32)
                volatile int tmp = 0; // prevent optimization
                for (int c = 0; c < 2; ++c) {
                    ++tmp;
                }
#else
                for (int c = 0; c < 2; ++c) {
                    __asm__ volatile(""); // prevent optimization
                }
#endif
#else
                __nanosleep(1);
#endif

#elif defined(AMREX_USE_HIP)

                __builtin_amdgcn_s_sleep(1);

#elif defined(AMREX_USE_SYCL)

                for (int c = 0; c < 2; ++c) {
                    __asm__ volatile(""); // prevent optimization
                }

#endif
            }
        } while (!my_turn);
    });
}

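// Masked overload: pair each copy tag with its destination mask, then use the
// first-writer-wins path for CellStore and the spin-lock path otherwise.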
template <class T0, class T1, class F>
void
fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
            int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
{
    using TagType = Array4MaskCopyTag<T0, T1>;
    Vector<TagType> tags;
    const int N = copy_tags.size();
    tags.reserve(N);
    for (int i = 0; i < N; ++i) {
        tags.push_back(TagType{copy_tags[i].dfab, copy_tags[i].sfab, masks[i].dfab,
                               copy_tags[i].dbox, copy_tags[i].offset});
    }

    if constexpr (std::is_same_v<F, CellStore<T0,T1>>)
    {
        fab_to_fab_store(tags, scomp, dcomp, ncomp, std::forward<F>(f));
    }
    else
    {
        fab_to_fab_other(tags, scomp, dcomp, ncomp, std::forward<F>(f));
    }
    // Note that Tag ParallelFor has a stream sync call in the end.
}

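// Trait-dispatched entry points: if the destination type supports an atomic
// store (or atomic add), no mask is needed; otherwise fall back to the masked
// overload above.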
template <typename T0, typename T1,
          std::enable_if_t<amrex::IsStoreAtomic<T0>::value,int> = 0>
void
fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
                       int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
{
    fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
}

template <typename T0, typename T1,
          std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,int> = 0>
void
fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
                       int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
{
    fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
}

template <typename T0, typename T1,
          std::enable_if_t<amrex::HasAtomicAdd<T0>::value,int> = 0>
void
fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
                       int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
{
    fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
}

template <typename T0, typename T1,
          std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,int> = 0>
void
fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
                       int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
{
    fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
}

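// Deterministic updates: destination boxes are decomposed into tiles of
// tile_size cells per dimension, the tiles are sorted by (destination fab,
// tile box), and one GPU block applies all tags touching a tile in that fixed
// order, with a block-wide barrier between tags.  The result is therefore
// independent of thread scheduling, at the cost of some parallelism.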
template <typename T0, typename T1, class F>
void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int scomp,
                               int dcomp, int ncomp, F const& f)
{
    if (a_tags.empty()) { return; }

    using TagType = Array4CopyTag<T0,T1>;

    struct TiledTag {
        int tag_index;
        std::pair<int,Box> dindex_tilebox;
        bool operator< (TiledTag const& rhs) const noexcept {
            return this->dindex_tilebox < rhs.dindex_tilebox;
        }
        bool operator!= (TiledTag const& rhs) const noexcept {
            return this->dindex_tilebox != rhs.dindex_tilebox;
        }
    };
    Vector<TiledTag> tiled_tags;

    auto const ixtype = a_tags[0].dbox.ixType();

    constexpr int tile_size = 64;
    for (int itag = 0; itag < a_tags.size(); ++itag) {
        auto const& tag = a_tags[itag];
        auto const& dlo = tag.dbox.smallEnd();
        auto const& dhi = tag.dbox.bigEnd();
        IntVect tlo(AMREX_D_DECL(amrex::coarsen<tile_size>(dlo[0]),
                                 amrex::coarsen<tile_size>(dlo[1]),
                                 amrex::coarsen<tile_size>(dlo[2])));
        IntVect thi(AMREX_D_DECL(amrex::coarsen<tile_size>(dhi[0]),
                                 amrex::coarsen<tile_size>(dhi[1]),
                                 amrex::coarsen<tile_size>(dhi[2])));
#if (AMREX_SPACEDIM == 3)
        for (int kt = tlo[2]; kt <= thi[2]; ++kt)
#endif
        {
#if (AMREX_SPACEDIM >= 2)
            for (int jt = tlo[1]; jt <= thi[1]; ++jt)
#endif
            {
                for (int it = tlo[0]; it <= thi[0]; ++it)
                {
                    IntVect lo(AMREX_D_DECL(it*tile_size,
                                            jt*tile_size,
                                            kt*tile_size));
                    tiled_tags.push_back(TiledTag{itag, std::make_pair
                                                  (tag.dindex, Box(lo, lo+(tile_size-1), ixtype))});
                }
            }
        }
    }

    std::sort(tiled_tags.begin(), tiled_tags.end());

    Gpu::HostVector<unsigned int> h_ntags;
    Gpu::HostVector<TagType> h_tags;
    h_tags.reserve(tiled_tags.size());

    for (unsigned int itag = 0; itag < tiled_tags.size(); ++itag) {
        if (itag == 0) {
            h_ntags.push_back(0);
        } else if (tiled_tags[itag-1] != tiled_tags[itag]) {
            h_ntags.push_back(itag);
        }
        auto const& ttag = tiled_tags[itag];
        auto const& btag = a_tags[ttag.tag_index];
        h_tags.push_back({btag.dfab, btag.dindex, btag.sfab,
                          btag.dbox & ttag.dindex_tilebox.second, btag.offset});
    }
    h_ntags.push_back((unsigned int)tiled_tags.size());

    Gpu::DeviceVector<TagType> d_tags(h_tags.size());
    Gpu::DeviceVector<unsigned int> d_ntags(h_ntags.size());
    Gpu::copyAsync(Gpu::hostToDevice,h_tags.begin(),h_tags.end(),d_tags.begin());
    Gpu::copyAsync(Gpu::hostToDevice,h_ntags.begin(),h_ntags.end(),d_ntags.begin());
    auto const* ptag = d_tags.data();
    auto const* pntags = d_ntags.data();
    auto const nblocks = int(h_ntags.size()-1);
    constexpr auto nthreads = 256;
    amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
#ifdef AMREX_USE_SYCL
    [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
        [[sycl::reqd_work_group_size(nthreads)]]
#else
    [=] AMREX_GPU_DEVICE () noexcept
#endif
    {
#ifdef AMREX_USE_SYCL
        Dim1 blockIdx{item.get_group_linear_id()};
        Dim1 threadIdx{item.get_local_linear_id()};
#endif

        for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
            auto const tag = ptag[itag];
            auto ncells = int(tag.dbox.numPts());
            const auto len = amrex::length(tag.dbox);
            const auto lo = amrex::lbound(tag.dbox);
            for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
                int k = icell / (len.x*len.y);
                int j = (icell - k*(len.x*len.y)) / len.x;
                int i = (icell - k*(len.x*len.y)) - j*len.x;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                for (int n = 0; n < ncomp; ++n) {
                    f(tag.dfab.ptr(i,j,k,n+dcomp),
                      tag.sfab(i + tag.offset.x,
                               j + tag.offset.y,
                               k + tag.offset.z, n+scomp));
                }
            }

            if (itag+1 < pntags[blockIdx.x+1]) {
#ifdef AMREX_USE_SYCL
                sycl::group_barrier(item.get_group());
#else
                __syncthreads();
#endif
            }
        }
    });
    Gpu::streamSynchronize();
}

#endif /* AMREX_USE_GPU */

} // namespace detail

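// Local (same-process) FillBoundary copy on the host.  If the tags are thread
// safe, copy them in a parallel loop; otherwise group the tags per destination
// fab so that each fab is written by a single thread.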
template <class FAB>
void
FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    auto N_locs = static_cast<int>(LocTags.size());
    if (N_locs == 0) { return; }
    bool is_thread_safe = TheFB.m_threadsafe_loc;
    if (is_thread_safe)
    {
#ifdef AMREX_USE_OMP
#pragma omp parallel for
#endif
        for (int i = 0; i < N_locs; ++i)
        {
            const CopyComTag& tag = LocTags[i];

            BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
            BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

            const FAB* sfab = &(get(tag.srcIndex));
            FAB* dfab = &(get(tag.dstIndex));
            dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
        }
    }
    else
    {
        LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
        for (int i = 0; i < N_locs; ++i)
        {
            const CopyComTag& tag = LocTags[i];

            BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
            BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

            loc_copy_tags[tag.dstIndex].push_back
                ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
        }
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        for (MFIter mfi(*this); mfi.isValid(); ++mfi)
        {
            const auto& tags = loc_copy_tags[mfi];
            auto dfab = this->array(mfi);
            for (auto const & tag : tags)
            {
                auto const sfab = tag.sfab->array();
                const auto offset = tag.offset.dim3();
                amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
                [=] (int i, int j, int k, int n) noexcept
                {
                    dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
                });
            }
        }
    }
}

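// Local add on the host.  The sources are staged into per-tag temporaries
// first so that reads and the subsequent += never race on overlapping fabs.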
template <class FAB>
void
FabArray<FAB>::FB_local_add_cpu (const FB& TheFB, int scomp, int ncomp)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    auto N_locs = static_cast<int>(LocTags.size());
    if (N_locs == 0) { return; }

    LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
    // We must make a temporary copy of the data first to avoid a race condition.
    std::vector<FAB> src_fabs(N_locs);
    for (int itag = 0; itag < N_locs; ++itag) {
        const CopyComTag& tag = LocTags[itag];
        src_fabs[itag].resize(tag.sbox,ncomp);
        loc_copy_tags[tag.dstIndex].push_back
            (FabCopyTag<FAB>{&(src_fabs[itag]),
                             tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
    }

#ifdef AMREX_USE_OMP
#pragma omp parallel for
#endif
    for (int itag = 0; itag < N_locs; ++itag) {
        const CopyComTag& tag = LocTags[itag];
        src_fabs[itag].template copy<RunOn::Host>(this->operator[](tag.srcIndex), scomp, 0, ncomp);
    }

#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
    {
        const auto& tags = loc_copy_tags[mfi];
        const auto& dfab = this->array(mfi);
        for (auto const & tag : tags)
        {
            auto const sfab = tag.sfab->array();
            const auto offset = tag.offset.dim3();
            amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
            [&] (int i, int j, int k, int n) noexcept
            {
                dfab(i,j,k,n+scomp) += sfab(i+offset.x,j+offset.y,k+offset.z,n);
            });
        }
    }
}

#ifdef AMREX_USE_GPU

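// Build, or fetch from a per-FB cache, the flattened TagVector describing the
// local copies, so that repeated FillBoundary calls with the same pattern do
// not rebuild and re-upload the tags.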
template <class FAB>
TagVector<Array4CopyTag<typename FabArray<FAB>::value_type>> const*
FabArray<FAB>::FB_get_local_copy_tag_vector (const FB& TheFB)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    int N_locs = LocTags.size();

    using TagType = Array4CopyTag<value_type>;

    TagVector<TagType> const* tv = nullptr;
    if (auto it = m_fb_local_copy_handler.find(TheFB.m_id);
        it != m_fb_local_copy_handler.end())
    {
        tv = it->second.get();
    } else {
        Vector<TagType> loc_copy_tags;
        loc_copy_tags.reserve(N_locs);

        for (int i = 0; i < N_locs; ++i)
        {
            const CopyComTag& tag = LocTags[i];

            BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
            BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

            int li = this->localindex(tag.dstIndex);
            loc_copy_tags.push_back
                ({this->atLocalIdx(li).array(), tag.dstIndex,
                  this->fabPtr(tag.srcIndex)->const_array(),
                  tag.dbox,
                  (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
        }

        auto utv = std::make_unique<TagVector<TagType>>(loc_copy_tags);
        tv = utv.get();
        m_fb_local_copy_handler[TheFB.m_id] = std::move(utv);
    }

    return tv;
}

template <class FAB>
void
FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    int N_locs = LocTags.size();
    if (N_locs == 0) { return; }
    bool is_thread_safe = TheFB.m_threadsafe_loc;

    using TagType = Array4CopyTag<value_type>;

    if (is_thread_safe || amrex::IsStoreAtomic<value_type>::value)
    {
        auto* tv = FB_get_local_copy_tag_vector(TheFB);

        detail::ParallelFor_doit(*tv,
        [=] AMREX_GPU_DEVICE (
            int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
        {
            if (icell < ncells) {
                for (int n = 0; n < ncomp; ++n) {
                    tag.dfab(i,j,k,n+scomp) = tag.sfab(i+tag.offset.x,
                                                       j+tag.offset.y,
                                                       k+tag.offset.z,n+scomp);
                }
            }
        });
    }
    else
    {
        Vector<TagType> loc_copy_tags;
        loc_copy_tags.reserve(N_locs);

        Vector<BaseFab<int>> maskfabs(this->local_size());
        Vector<Array4Tag<int> > masks_unique;
        masks_unique.reserve(this->local_size());
        Vector<Array4Tag<int> > masks;
        masks.reserve(N_locs);

        for (int i = 0; i < N_locs; ++i)
        {
            const CopyComTag& tag = LocTags[i];

            BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
            BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

            int li = this->localindex(tag.dstIndex);
            loc_copy_tags.push_back
                ({this->atLocalIdx(li).array(), tag.dstIndex,
                  this->fabPtr(tag.srcIndex)->const_array(),
                  tag.dbox,
                  (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});

            if (!maskfabs[li].isAllocated()) {
                maskfabs[li].resize(this->atLocalIdx(li).box());
                masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
            }
            masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
        }

        amrex::ParallelFor(masks_unique,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
        {
            msk.dfab(i,j,k) = 0;
        });

        detail::fab_to_fab_atomic_cpy<value_type, value_type>(
            loc_copy_tags, scomp, scomp, ncomp, masks);
    }
}

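// Local add on the GPU.  As in the CPU version, sources are staged into
// temporaries; the temporaries are then added into the destination either in
// a deterministic order or with atomic adds.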
template <class FAB>
void
FabArray<FAB>::FB_local_add_gpu (const FB& TheFB, int scomp, int ncomp, bool deterministic)
{
    auto const& LocTags = *(TheFB.m_LocTags);
    int N_locs = LocTags.size();
    if (N_locs == 0) { return; }

    using TagType = Array4CopyTag<value_type>;

    Vector<TagType> loc_copy_tags_1, loc_copy_tags_2;
    loc_copy_tags_1.reserve(N_locs);
    loc_copy_tags_2.reserve(N_locs);

    Vector<FAB> src_fabs(N_locs);
    for (int itag = 0; itag < N_locs; ++itag) {
        const CopyComTag& tag = LocTags[itag];
        src_fabs[itag].resize(tag.sbox,ncomp);
        loc_copy_tags_1.push_back(
            TagType{src_fabs[itag].array(), -1,
                    this->const_array(tag.srcIndex,scomp), tag.sbox,
                    Dim3{0,0,0}});
        loc_copy_tags_2.push_back(
            TagType{this->array(tag.dstIndex,scomp), tag.dstIndex,
                    src_fabs[itag].const_array(), tag.dbox,
                    (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
    }

    // Note that we have shifted the starting component to zero in the code above.

    // TODO: We could try to cache the tags like in FillBoundary.

    detail::fab_to_fab(loc_copy_tags_1, 0, 0, ncomp,
                       detail::CellStore<value_type, value_type>{});
    if (deterministic || ! amrex::HasAtomicAdd<value_type>::value) {
        detail::deterministic_fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
                                         detail::CellAdd<value_type,value_type>{});
    } else {
        if constexpr (amrex::HasAtomicAdd<value_type>::value) {
            detail::fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
                               detail::CellAtomicAdd<value_type, value_type>{});
        }
        ((void)0); // no-op; the branch above is always taken at run time
    }

    // Note that fab_to_fab is synchronous.
}

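// Fill the destination boxes of the local (and below, the remote receive)
// communication tags with a constant value.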
template <class FAB>
void
FabArray<FAB>::CMD_local_setVal_gpu (typename FabArray<FAB>::value_type x,
                                     const CommMetaData& thecmd, int scomp, int ncomp)
{
    auto const& LocTags = *(thecmd.m_LocTags);
    int N_locs = LocTags.size();
    if (N_locs == 0) { return; }
    bool is_thread_safe = thecmd.m_threadsafe_loc;

    using TagType = Array4BoxTag<value_type>;
    Vector<TagType> loc_setval_tags;
    loc_setval_tags.reserve(N_locs);

    AMREX_ALWAYS_ASSERT(amrex::IsStoreAtomic<value_type>::value || is_thread_safe);

    for (int i = 0; i < N_locs; ++i)
    {
        const CopyComTag& tag = LocTags[i];
        BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
        loc_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
    }

    amrex::ParallelFor(loc_setval_tags, ncomp,
    [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
    {
        tag.dfab(i,j,k,n+scomp) = x;
    });
}

template <class FAB>
void
FabArray<FAB>::CMD_remote_setVal_gpu (typename FabArray<FAB>::value_type x,
                                      const CommMetaData& thecmd, int scomp, int ncomp)
{
    auto const& RcvTags = *(thecmd.m_RcvTags);
    bool is_thread_safe = thecmd.m_threadsafe_rcv;

    using TagType = Array4BoxTag<value_type>;
    Vector<TagType> rcv_setval_tags;

    for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
        for (auto const& tag: it->second) {
            rcv_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
        }
    }

    if (rcv_setval_tags.empty()) { return; }

    AMREX_ALWAYS_ASSERT(amrex::IsStoreAtomic<value_type>::value || is_thread_safe);

    amrex::ParallelFor(rcv_setval_tags, ncomp,
    [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
    {
        tag.dfab(i,j,k,n+scomp) = x;
    });
}

#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
template <class FAB>
void
FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
{
    const int N_locs = (*TheFB.m_LocTags).size();
    LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
    for (int i = 0; i < N_locs; ++i)
    {
        const CopyComTag& tag = (*TheFB.m_LocTags)[i];

        BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
        BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());

        loc_copy_tags[tag.dstIndex].push_back
            ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
    }

    // Create the graph if one is needed.
    if ( !(TheFB.m_localCopy.ready()) )
    {
        const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);

        int idx = 0;
        // Record the graph.
        for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
        {
            amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
                                                     const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
                                                     (TheFB).m_localCopy.getDevicePtr(0),
                                                     std::size_t(sizeof(CopyMemory)*N_locs) );

            const auto& tags = loc_copy_tags[mfi];
            for (auto const & tag : tags)
            {
                const auto offset = tag.offset.dim3();
                CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
                AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
                {
                    // Build the Array4's.
                    auto const dst = cmem->getDst<value_type>();
                    auto const src = cmem->getSrc<value_type>();
                    for (int n = 0; n < cmem->ncomp; ++n) {
                        dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
                    }
                });
            }

            bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
            cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
            if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
        }
    }

    // Setup launch parameters.
    // This is perfectly threadable, right?
    // Additional optimization -> Check to see whether values need to be reset?
    // Can then remove this setup and memcpy from CudaGraph::executeGraph.
    int idx = 0;
    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
    {
        auto const dst_array = this->array(mfi);
        const auto& tags = loc_copy_tags[mfi];
        for (auto const & tag : tags)
        {
            const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
                                                                               dst_array,
                                                                               scomp, ncomp));
        }
    }

    // Launch the graph.
    TheFB.m_localCopy.executeGraph();
}

#ifdef AMREX_USE_MPI
template <class FAB>
void
FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
{
    const int N_locs = TheFB.m_LocTags->size();

    int launches = 0; // Used for graphs only.
    LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
    for (int i = 0; i < N_locs; ++i)
    {
        const CopyComTag& tag = (*TheFB.m_LocTags)[i];

        BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
        BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));

        if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
        {
            loc_copy_tags[tag.dstIndex].push_back
                ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
            launches++;
        }
    }

    FillBoundary_test();

    if ( !(TheFB.m_localCopy.ready()) )
    {
        const_cast<FB&>(TheFB).m_localCopy.resize(launches);

        int idx = 0;
        int cuda_stream = 0;
        for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
        {
            const auto& tags = loc_copy_tags[mfi];
            for (int t = 0; t < tags.size(); ++t)
            {
                Gpu::Device::setStreamIndex(cuda_stream++);
                amrex::Gpu::Device::startGraphRecording( (idx == 0),
                                                         const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
                                                         (TheFB).m_localCopy.getDevicePtr(0),
                                                         std::size_t(sizeof(CopyMemory)*launches) );

                const auto& tag = tags[t];
                const Dim3 offset = tag.offset.dim3();

                CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
                AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
                {
                    auto const dst = cmem->getDst<value_type>();
                    auto const src = cmem->getSrc<value_type>();
                    for (int n = 0; n < cmem->ncomp; ++n) {
                        dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
                    }
                });

                bool last_iter = idx == launches;
                cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
                if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
            }
        }
    }

    // Setup launch parameters.
    // This is perfectly threadable, right?
    int idx = 0;
    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
    {
        const auto& dst_array = this->array(mfi);
        const auto& tags = loc_copy_tags[mfi];
        for (auto const & tag : tags)
        {
            const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
                                                                               dst_array,
                                                                               scomp, ncomp));
        }
    }

    // Launch the graph without sync. Local work is entirely independent.
    TheFB.m_localCopy.executeGraph(false);
}
#endif /* AMREX_USE_MPI */

#endif /* __CUDACC__ */

#endif /* AMREX_USE_GPU */

#ifdef AMREX_USE_MPI

#ifdef AMREX_USE_GPU

#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)

template <class FAB>
void
FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
                                               Vector<char*>& send_data,
                                               Vector<std::size_t> const& send_size,
                                               Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
{
    const int N_snds = send_data.size();
    if (N_snds == 0) { return; }

    if ( !(TheFB.m_copyToBuffer.ready()) )
    {
        // Set size of CudaGraph buffer.
        // Is the conditional ever expected false?
        int launches = 0;
        for (int send = 0; send < N_snds; ++send) {
            if (send_size[send] > 0) {
                launches += send_cctc[send]->size();
            }
        }
        const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);

        // Record the graph.
        int idx = 0;
        for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
             sit.isValid(); ++sit)
        {
            amrex::Gpu::Device::startGraphRecording( (sit() == 0),
                                                     const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
                                                     (TheFB).m_copyToBuffer.getDevicePtr(0),
                                                     std::size_t(sizeof(CopyMemory)*launches) );

            const int j = sit();
            if (send_size[j] > 0)
            {
                auto const& cctc = *send_cctc[j];
                for (auto const& tag : cctc)
                {
                    const Box& bx = tag.sbox;
                    CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
                    AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
                    {
                        auto const pfab = cmem->getDst<value_type>();
                        auto const sfab = cmem->getSrc<value_type>();
                        for (int n = 0; n < cmem->ncomp; ++n)
                        {
                            pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
                        }
                    });
                }
            }

            bool last_iter = sit() == (N_snds-1);
            cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
            if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
        }
    }

    // Setup launch parameters.
    int idx = 0;
    for (int send = 0; send < N_snds; ++send)
    {
        const int j = send;
        if (send_size[j] > 0)
        {
            char* dptr = send_data[j];
            auto const& cctc = *send_cctc[j];
            for (auto const& tag : cctc)
            {
                const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
                                                                                      amrex::makeArray4((value_type*)(dptr),
                                                                                                        tag.sbox,
                                                                                                        ncomp),
                                                                                      scomp, ncomp));

                dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
            }
            amrex::ignore_unused(send_size);
            BL_ASSERT(dptr <= send_data[j] + send_size[j]);
        }
    }

    // Launch the graph synchronously, so copyToBuffer is complete prior to posting sends.
    TheFB.m_copyToBuffer.executeGraph();
}

template <class FAB>
void
FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
                                                 Vector<char*> const& recv_data,
                                                 Vector<std::size_t> const& recv_size,
                                                 Vector<CopyComTagsContainer const*> const& recv_cctc,
                                                 bool /*is_thread_safe*/)
{
    const int N_rcvs = recv_cctc.size();
    if (N_rcvs == 0) { return; }

    int launches = 0;
    LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
    for (int k = 0; k < N_rcvs; ++k)
    {
        if (recv_size[k] > 0)
        {
            const char* dptr = recv_data[k];
            auto const& cctc = *recv_cctc[k];
            for (auto const& tag : cctc)
            {
                recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
                dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
                launches++;
            }
            amrex::ignore_unused(recv_size);
            BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
        }
    }

    if ( !(TheFB.m_copyFromBuffer.ready()) )
    {
        const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);

        int idx = 0;
        for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
        {
            amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
                                                     const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
                                                     (TheFB).m_copyFromBuffer.getDevicePtr(0),
                                                     std::size_t(sizeof(CopyMemory)*launches) );

            const auto& tags = recv_copy_tags[mfi];
            for (auto const & tag : tags)
            {
                CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
                AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
                {
                    auto const pfab = cmem->getSrc<value_type>();
                    auto const dfab = cmem->getDst<value_type>();
                    for (int n = 0; n < cmem->ncomp; ++n)
                    {
                        dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
                    }
                });
            }

            bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
            cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
            if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
        }
    }

    // Setup the graph.
    int idx = 0;
    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
    {
        auto dst_array = this->array(mfi);
        const auto & tags = recv_copy_tags[mfi];
        for (auto const & tag : tags)
        {
            const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
                                                                                                      tag.dbox,
                                                                                                      ncomp),
                                                                                    dst_array,
                                                                                    dcomp, ncomp));
        }
    }

    // Launch the graph - synced because the next action is freeing the recv buffer.
    TheFB.m_copyFromBuffer.executeGraph();
}

#endif /* __CUDACC__ */

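// Build, or fetch from a small cache keyed by (communication id, buffer type,
// component count), the TagVector that maps each send-buffer segment to the
// box it is packed from.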
template <class FAB>
template <typename BUF>
auto
FabArray<FAB>::get_send_copy_tag_vector (Vector<char*> const& send_data,
                                         Vector<std::size_t> const& send_size,
                                         Vector<CopyComTagsContainer const*> const& send_cctc,
                                         int ncomp, std::uint64_t id) const
    -> TagVector<CommSendBufTag<value_type>> const*
{
    using TagType = CommSendBufTag<value_type>;

    auto kit = std::find_if(send_cctc.begin(), send_cctc.end(),
                            [] (CopyComTagsContainer const* p) { return p != nullptr; });
    if (kit == send_cctc.end()) {
        return nullptr;
    }

    auto get_tags = [&] () -> Vector<TagType>
    {
        Vector<TagType> snd_copy_tags;
        char* pbuf = send_data[0];
        const int N_snds = send_data.size();
        for (int j = 0; j < N_snds; ++j)
        {
            if (send_size[j] > 0)
            {
                char* dptr = send_data[j];
                auto const& cctc = *send_cctc[j];
                for (auto const& tag : cctc)
                {
                    snd_copy_tags.emplace_back
                        (TagType{this->const_array(tag.srcIndex), dptr-pbuf, tag.sbox});
                    dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
                }
            }
        }
        return snd_copy_tags;
    };

    TagVector<TagType> const* tv = nullptr;
    std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};

    if (auto it = m_send_copy_handler.find(key); it != m_send_copy_handler.end()) {
        tv = it->second.get();
    } else {
        if (m_send_copy_handler.size() > 32) {
            // Just in case. If this is used in ParallelCopy, it's possible
            // that the sending FabArray is the same, but the receiving
            // FabArray is different every time. Then the size of this map
            // could increase indefinitely.
            m_send_copy_handler.clear();
        }
        auto snd_copy_tags = get_tags();
        auto utv = std::make_unique<TagVector<TagType>>(snd_copy_tags);
        tv = utv.get();
        m_send_copy_handler[key] = std::move(utv);
    }

    return tv;
}

template <class FAB>
template <typename BUF>
void
FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
                                     Vector<char*> const& send_data,
                                     Vector<std::size_t> const& send_size,
                                     Vector<CopyComTagsContainer const*> const& send_cctc,
                                     std::uint64_t id)
{
    const int N_snds = send_data.size();
    if (N_snds == 0) { return; }

    using TagType = CommSendBufTag<value_type>;

    auto* tv = src.template get_send_copy_tag_vector<BUF>
        (send_data, send_size, send_cctc, ncomp, id);
    if (tv == nullptr) { return; }

    char* pbuffer = send_data[0];

    detail::ParallelFor_doit(*tv,
    [=] AMREX_GPU_DEVICE (
        int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
    {
        if (icell < ncells) {
            Array4<BUF> dfab{(BUF*)(pbuffer+tag.poff),
                             amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
            for (int n = 0; n < ncomp; ++n) {
                dfab(i,j,k,n) = (BUF)tag.sfab(i,j,k,n+scomp);
            }
        }
    });

    Gpu::streamSynchronize();
}

template <class FAB>
template <typename BUF>
auto
FabArray<FAB>::get_recv_copy_tag_vector (Vector<char*> const& recv_data,
                                         Vector<std::size_t> const& recv_size,
                                         Vector<CopyComTagsContainer const*> const& recv_cctc,
                                         int ncomp, std::uint64_t id)
    -> TagVector<CommRecvBufTag<value_type>> const*
{
    using TagType = CommRecvBufTag<value_type>;

    auto kit = std::find_if(recv_cctc.begin(), recv_cctc.end(),
                            [] (CopyComTagsContainer const* p) { return p != nullptr; });
    if (kit == recv_cctc.end()) {
        return nullptr;
    }

    auto get_tags = [&] () -> Vector<TagType>
    {
        Vector<TagType> recv_copy_tags;
        char* pbuf = recv_data[0];
        const int N_rcvs = recv_cctc.size();
        for (int k = 0; k < N_rcvs; ++k)
        {
            if (recv_size[k] > 0)
            {
                char* dptr = recv_data[k];
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc)
                {
                    const int li = this->localindex(tag.dstIndex);
                    recv_copy_tags.emplace_back
                        (TagType{this->atLocalIdx(li).array(), dptr-pbuf, tag.dbox});
                    dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
                }
            }
        }
        return recv_copy_tags;
    };

    TagVector<TagType> const* tv = nullptr;
    std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};

    if (auto it = m_recv_copy_handler.find(key); it != m_recv_copy_handler.end()) {
        tv = it->second.get();
    } else {
        if (m_recv_copy_handler.size() > 32) {
            // Just in case. If this is used in ParallelCopy, it's possible
            // that the receiving FabArray is the same, but the sending
            // FabArray is different every time. Then the size of this map
            // could increase indefinitely.
            m_recv_copy_handler.clear();
        }
        auto recv_copy_tags = get_tags();
        auto utv = std::make_unique<TagVector<TagType>>(recv_copy_tags);
        tv = utv.get();
        m_recv_copy_handler[key] = std::move(utv);
    }

    return tv;
}

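// Unpack the MPI receive buffer into dst.  Four paths: deterministic tiled
// add, a cached-tag path when plain stores/adds are safe, an atomic-add path,
// and a mask-based path when neither store nor add is atomic for value_type.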
template <class FAB>
template <typename BUF>
void
FabArray<FAB>::unpack_recv_buffer_gpu (FabArray<FAB>& dst, int dcomp, int ncomp,
                                       Vector<char*> const& recv_data,
                                       Vector<std::size_t> const& recv_size,
                                       Vector<CopyComTagsContainer const*> const& recv_cctc,
                                       CpOp op, bool is_thread_safe, std::uint64_t id,
                                       bool deterministic)
{
    const int N_rcvs = recv_cctc.size();
    if (N_rcvs == 0) { return; }

    bool use_mask = false;
    if (!is_thread_safe)
    {
        if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
            (op == FabArrayBase::ADD  && !amrex::HasAtomicAdd <value_type>::value))
        {
            use_mask = true;
        }
    }

    if (deterministic)
    {
        AMREX_ALWAYS_ASSERT(op == FabArrayBase::ADD); // Only ADD for now
        using TagType = Array4CopyTag<value_type,BUF>;
        Vector<TagType> tags;
        tags.reserve(N_rcvs);
        for (int k = 0; k < N_rcvs; ++k) {
            if (recv_size[k] > 0) {
                char const* dptr = recv_data[k];
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc) {
                    tags.emplace_back(
                        TagType{dst.array(tag.dstIndex), tag.dstIndex,
                                Array4<BUF const>((BUF const*)dptr,
                                                  amrex::begin(tag.dbox),
                                                  amrex::end(tag.dbox), ncomp),
                                tag.dbox, Dim3{0,0,0}});
                    dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
                }
            }
        }
        if constexpr (std::is_convertible_v<BUF,value_type>) { // value_type must support accumulation
            detail::deterministic_fab_to_fab<value_type,BUF>
                (tags, 0, dcomp, ncomp, detail::CellAdd<value_type,BUF>{});
        } else {
            amrex::Abort("SumBoundary requires operator+=");
        }
    }
    else if (!use_mask)
    {
        using TagType = CommRecvBufTag<value_type>;
        auto* tv = dst.template get_recv_copy_tag_vector<BUF>
            (recv_data, recv_size, recv_cctc, ncomp, id);
        if (tv == nullptr) { return; }

        char* pbuffer = recv_data[0];

        if (op == FabArrayBase::COPY)
        {
            detail::ParallelFor_doit(*tv,
            [=] AMREX_GPU_DEVICE (
                int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
            {
                if (icell < ncells) {
                    Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
                                           amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
                    for (int n = 0; n < ncomp; ++n) {
                        tag.dfab(i,j,k,n+dcomp) = (value_type)sfab(i,j,k,n);
                    }
                }
            });
        }
        else
        {
            if (is_thread_safe) {
                detail::ParallelFor_doit(*tv,
                [=] AMREX_GPU_DEVICE (
                    int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
                {
                    if (icell < ncells) {
                        Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
                                               amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
                        for (int n = 0; n < ncomp; ++n) {
                            tag.dfab(i,j,k,n+dcomp) += (value_type)sfab(i,j,k,n);
                        }
                    }
                });
            } else {
                if constexpr (amrex::HasAtomicAdd<value_type>::value) {
                    detail::ParallelFor_doit(*tv,
                    [=] AMREX_GPU_DEVICE (
                        int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
                    {
                        if (icell < ncells) {
                            Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
                                                   amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
                            for (int n = 0; n < ncomp; ++n) {
                                Gpu::Atomic::AddNoRet(tag.dfab.ptr(i,j,k,n+dcomp),
                                                      (value_type)sfab(i,j,k,n));
                            }
                        }
                    });
                } else {
                    amrex::Abort("unpack_recv_buffer_gpu: should NOT get here");
                }
            }
        }
        Gpu::streamSynchronize();
    }
    else
    {
        char* pbuffer = recv_data[0];

        using TagType = Array4CopyTag<value_type, BUF>;
        Vector<TagType> recv_copy_tags;
        recv_copy_tags.reserve(N_rcvs);

        Vector<BaseFab<int> > maskfabs(dst.local_size());
        Vector<Array4Tag<int> > masks_unique;
        masks_unique.reserve(dst.local_size());
        Vector<Array4Tag<int> > masks;

        for (int k = 0; k < N_rcvs; ++k)
        {
            if (recv_size[k] > 0)
            {
                std::size_t offset = recv_data[k]-recv_data[0];
                const char* dptr = pbuffer + offset;
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc)
                {
                    const int li = dst.localindex(tag.dstIndex);
                    recv_copy_tags.emplace_back(TagType{
                        dst.atLocalIdx(li).array(), tag.dstIndex,
                        amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
                        tag.dbox,
                        Dim3{0,0,0}
                    });
                    dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);

                    if (!maskfabs[li].isAllocated()) {
                        maskfabs[li].resize(dst.atLocalIdx(li).box());
                        masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
                    }
                    masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
                }
                BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
            }
        }

        amrex::ParallelFor(masks_unique,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
        {
            msk.dfab(i,j,k) = 0;
        });

        if (op == FabArrayBase::COPY)
        {
            detail::fab_to_fab_atomic_cpy<value_type, BUF>(
                recv_copy_tags, 0, dcomp, ncomp, masks);
        }
        else
        {
            detail::fab_to_fab_atomic_add<value_type, BUF>(
                recv_copy_tags, 0, dcomp, ncomp, masks);
        }

        // There is a Gpu::streamSynchronize in fab_to_fab.
    }
}

#endif /* AMREX_USE_GPU */

template <class FAB>
template <typename BUF>
void
FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
                                     Vector<char*> const& send_data,
                                     Vector<std::size_t> const& send_size,
                                     Vector<CopyComTagsContainer const*> const& send_cctc)
{
    amrex::ignore_unused(send_size);

    auto const N_snds = static_cast<int>(send_data.size());
    if (N_snds == 0) { return; }

#ifdef AMREX_USE_OMP
#pragma omp parallel for
#endif
    for (int j = 0; j < N_snds; ++j)
    {
        if (send_size[j] > 0)
        {
            char* dptr = send_data[j];
            auto const& cctc = *send_cctc[j];
            for (auto const& tag : cctc)
            {
                const Box& bx = tag.sbox;
                auto const sfab = src.array(tag.srcIndex);
                auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
                amrex::LoopConcurrentOnCpu( bx, ncomp,
                [=] (int ii, int jj, int kk, int n) noexcept
                {
                    pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
                });
                dptr += (bx.numPts() * ncomp * sizeof(BUF));
            }
            BL_ASSERT(dptr <= send_data[j] + send_size[j]);
        }
    }
}

template <class FAB>
template <typename BUF>
void
FabArray<FAB>::unpack_recv_buffer_cpu (FabArray<FAB>& dst, int dcomp, int ncomp,
                                       Vector<char*> const& recv_data,
                                       Vector<std::size_t> const& recv_size,
                                       Vector<CopyComTagsContainer const*> const& recv_cctc,
                                       CpOp op, bool is_thread_safe)
{
    amrex::ignore_unused(recv_size);

    auto const N_rcvs = static_cast<int>(recv_cctc.size());
    if (N_rcvs == 0) { return; }

    if (is_thread_safe)
    {
#ifdef AMREX_USE_OMP
#pragma omp parallel for
#endif
        for (int k = 0; k < N_rcvs; ++k)
        {
            if (recv_size[k] > 0)
            {
                const char* dptr = recv_data[k];
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc)
                {
                    const Box& bx = tag.dbox;
                    FAB& dfab = dst[tag.dstIndex];
                    if (op == FabArrayBase::COPY)
                    {
                        dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
                    }
                    else
                    {
                        dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
                    }
                    dptr += bx.numPts() * ncomp * sizeof(BUF);
                }
                BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
            }
        }
    }
    else
    {
        LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
        recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
        for (int k = 0; k < N_rcvs; ++k)
        {
            if (recv_size[k] > 0)
            {
                const char* dptr = recv_data[k];
                auto const& cctc = *recv_cctc[k];
                for (auto const& tag : cctc)
                {
                    recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
                    dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
                }
                BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
            }
        }

#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        for (MFIter mfi(dst); mfi.isValid(); ++mfi)
        {
            const auto& tags = recv_copy_tags[mfi];
            auto dfab = dst.array(mfi);
            for (auto const & tag : tags)
            {
                auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
                if (op == FabArrayBase::COPY)
                {
                    amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
                    [=] (int i, int j, int k, int n) noexcept
                    {
                        dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
                    });
                }
                else
                {
                    amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
                    [=] (int i, int j, int k, int n) noexcept
                    {
                        dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
                    });
                }
            }
        }
    }
}

#endif /* AMREX_USE_MPI */

} // namespace amrex

#endif
Definition AMReX_FBI.H:14