AMReX_FBI.H
1#ifndef AMREX_FBI_H_
2#define AMREX_FBI_H_
3
4template <class FAB>
5struct FabCopyTag {
6 FAB const* sfab;
7 Box dbox;
8 IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
9};
10
11struct VoidCopyTag {
12 char const* p;
13 Box dbox;
14};
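// The routines in this file are communication internals of FabArray: they
// are driven by FillBoundary, SumBoundary, and ParallelCopy (see
// AMReX_FabArrayCommI.H).  A typical user-level entry point is simply
//     mf.FillBoundary(geom.periodicity());
// which lands in the FB_local_* and pack/unpack routines below.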
15
16namespace detail {
17
18#ifdef AMREX_USE_GPU
19
20template <class T0, class T1>
21struct CellStore
22{
23 AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
24 operator() (T0* d, T1 s) const noexcept
25 {
26 *d = static_cast<T0>(s);
27 }
28};
29
30template <class T0, class T1>
31struct CellAdd
32{
33 AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
34 operator() (T0* d, T1 s) const noexcept
35 {
36 *d += static_cast<T0>(s);
37 }
38};
39
40template <class T0, class T1>
41struct CellAtomicAdd
42{
43 template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,int> = 0>
44 AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
45 operator() (U0* d, T1 s) const noexcept
46 {
47 Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
48 }
49};
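// CellStore, CellAdd, and CellAtomicAdd are the per-cell update policies
// plugged into fab_to_fab below: plain assignment, plain +=, and atomic +=.
// CellAtomicAdd's operator() is SFINAE-constrained on amrex::HasAtomicAdd so
// that Gpu::Atomic::AddNoRet is only instantiated for types with hardware
// atomic-add support.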
50
51template <class T0, class T1, class F>
52void
53fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
54 F && f)
55{
56 TagVector<Array4CopyTag<T0, T1>> tv{copy_tags};
57
58 detail::ParallelFor_doit(tv,
59 [=] AMREX_GPU_DEVICE (
60 int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const& tag) noexcept
61 {
62 if (icell < ncells) {
63 for (int n = 0; n < ncomp; ++n) {
64 f(&(tag.dfab(i,j,k,n+dcomp)),
65 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
66 }
67 }
68 });
69}
70
71template <class TagType, class F>
72void
73fab_to_fab_store (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
74{
75 amrex::ParallelFor(tags,
76 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
77 {
78 int m = Gpu::Atomic::Add(&(tag.mask(i,j,k)), 1);
79 if (m == 0) {
80 for (int n = 0; n < ncomp; ++n) {
81 f(&(tag.dfab(i,j,k,n+dcomp)),
82 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
83 }
84 }
85 });
86}
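// fab_to_fab_store handles non-thread-safe stores: several tags may cover
// the same destination cell.  The first thread to atomically bump the mask
// at (i,j,k) from 0 performs the store for all components; later arrivals
// see a nonzero mask and skip, so each cell is written exactly once.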
87
88template <class TagType, class F>
89void
90fab_to_fab_other (Vector<TagType> const& tags, int scomp, int dcomp, int ncomp, F&&f)
91{
92 amrex::ParallelFor(tags,
93 [=] AMREX_GPU_DEVICE (int i, int j, int k, TagType const& tag) noexcept
94 {
95 int* m = &(tag.mask(i,j,k));
96 bool my_turn = false;
97 do {
98#if defined(AMREX_USE_SYCL)
99 my_turn = (Gpu::Atomic::Exch(m, 1) == 0);
100#else
101 my_turn = (Gpu::Atomic::CAS(m, 0, 1) == 0);
102#endif
103 if (my_turn) {
104#if defined(AMREX_USE_SYCL)
105 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
106#else
107 __threadfence();
108#endif
109 for (int n = 0; n < ncomp; ++n) {
110 f(&(tag.dfab(i,j,k,n+dcomp)),
111 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
112 }
113#if defined(AMREX_USE_SYCL)
114 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
115#else
116 __threadfence(); // It appears that we need this fence too if a GPU is shared.
117#endif
118 Gpu::Atomic::Exch(m, 0);
119 }
120 else {
121#if defined(AMREX_USE_CUDA)
122
123#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)
124#if defined(_WIN32)
125 volatile int tmp; // prevent optimization
126 for (int c = 0; c < 2; ++c) {
127 ++tmp;
128 }
129#else
130 for (int c = 0; c < 2; ++c) {
131 __asm__ volatile(""); // prevent optimization
132 }
133#endif
134#else
135 __nanosleep(1);
136#endif
137
138#elif defined(AMREX_USE_HIP)
139
140 __builtin_amdgcn_s_sleep(1);
141
142#elif defined(AMREX_USE_SYCL)
143
144 for (int c = 0; c < 2; ++c) {
145 __asm__ volatile(""); // prevent optimization
146 }
147
148#endif
149 }
150 } while (!my_turn);
151 });
152}
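// fab_to_fab_other is the general non-thread-safe case (e.g. non-atomic +=).
// The mask acts as a per-cell spin lock: a thread acquires it with an atomic
// exchange/CAS, applies f to all components between acquire and release
// fences, then releases by writing 0.  Threads that fail to acquire back off
// with a platform-specific pause (__nanosleep, __builtin_amdgcn_s_sleep, or
// an empty asm loop) and retry, serializing concurrent updates to a cell.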
153
154template <class T0, class T1, class F>
155void
156fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
157 int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
158{
159 using TagType = Array4MaskCopyTag<T0, T1>;
160 Vector<TagType> tags;
161 const int N = copy_tags.size();
162 tags.reserve(N);
163 for (int i = 0; i < N; ++i) {
164 tags.push_back(TagType{copy_tags[i].dfab, copy_tags[i].sfab, masks[i].dfab,
165 copy_tags[i].dbox, copy_tags[i].offset});
166 }
167
168 if constexpr (std::is_same_v<F, CellStore<T0,T1>>)
169 {
170 fab_to_fab_store(tags, scomp, dcomp, ncomp, std::forward<F>(f));
171 }
172 else
173 {
174 fab_to_fab_other(tags, scomp, dcomp, ncomp, std::forward<F>(f));
175 }
176 // Note that the tag-based ParallelFor has a stream sync call at the end.
177}
178
179template <typename T0, typename T1,
180 std::enable_if_t<amrex::IsStoreAtomic<T0>::value,int> = 0>
181void
182fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
183 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
184{
185 fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
186}
187
188template <typename T0, typename T1,
189 std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,int> = 0>
190void
191fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
192 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
193{
194 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
195}
196
197template <typename T0, typename T1,
198 std::enable_if_t<amrex::HasAtomicAdd<T0>::value,int> = 0>
199void
200fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
201 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
202{
203 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
204}
205
206template <typename T0, typename T1,
207 std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,int> = 0>
208void
209fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
210 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
211{
212 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
213}
214
215template <typename T0, typename T1, class F>
216void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int scomp,
217 int dcomp, int ncomp, F const& f)
218{
219 if (a_tags.empty()) { return; }
220
221 using TagType = Array4CopyTag<T0,T1>;
222
223 struct TiledTag {
224 int tag_index;
225 std::pair<int,Box> dindex_tilebox;
226 bool operator< (TiledTag const& rhs) const noexcept {
227 return this->dindex_tilebox < rhs.dindex_tilebox;
228 }
229 bool operator!= (TiledTag const& rhs) const noexcept {
230 return this->dindex_tilebox != rhs.dindex_tilebox;
231 }
232 };
233 Vector<TiledTag> tiled_tags;
234
235 auto const ixtype = a_tags[0].dbox.ixType();
236
237 constexpr int tile_size = 64;
238 for (int itag = 0; itag < a_tags.size(); ++itag) {
239 auto const& tag = a_tags[itag];
240 auto const& dlo = tag.dbox.smallEnd();
241 auto const& dhi = tag.dbox.bigEnd();
242 IntVect tlo(AMREX_D_DECL(amrex::coarsen<tile_size>(dlo[0]),
243 amrex::coarsen<tile_size>(dlo[1]),
244 amrex::coarsen<tile_size>(dlo[2])));
245 IntVect thi(AMREX_D_DECL(amrex::coarsen<tile_size>(dhi[0]),
246 amrex::coarsen<tile_size>(dhi[1]),
247 amrex::coarsen<tile_size>(dhi[2])));
248#if (AMREX_SPACEDIM == 3)
249 for (int kt = tlo[2]; kt <= thi[2]; ++kt)
250#endif
251 {
252#if (AMREX_SPACEDIM >= 2)
253 for (int jt = tlo[1]; jt <= thi[1]; ++jt)
254#endif
255 {
256 for (int it = tlo[0]; it <= thi[0]; ++it)
257 {
258 IntVect lo(AMREX_D_DECL(it*tile_size,
259 jt*tile_size,
260 kt*tile_size));
261 tiled_tags.push_back(TiledTag{itag, std::make_pair
262 (tag.dindex, Box(lo, lo+(tile_size-1), ixtype))});
263 }
264 }
265 }
266 }
267
268 std::sort(tiled_tags.begin(), tiled_tags.end());
269
270 Gpu::HostVector<unsigned int> h_ntags;
271 Gpu::HostVector<TagType> h_tags;
272 h_tags.reserve(tiled_tags.size());
273
274 for (unsigned int itag = 0; itag < tiled_tags.size(); ++itag) {
275 if (itag == 0) {
276 h_ntags.push_back(0);
277 } else if (tiled_tags[itag-1] != tiled_tags[itag]) {
278 h_ntags.push_back(itag);
279 }
280 auto const& ttag = tiled_tags[itag];
281 auto const& btag = a_tags[ttag.tag_index];
282 h_tags.push_back({btag.dfab, btag.dindex, btag.sfab,
283 btag.dbox & ttag.dindex_tilebox.second, btag.offset});
284 }
285 h_ntags.push_back((unsigned int)tiled_tags.size());
286
287 Gpu::DeviceVector<TagType> d_tags(h_tags.size());
288 Gpu::DeviceVector<unsigned int> d_ntags(h_ntags.size());
289 Gpu::copyAsync(Gpu::hostToDevice,h_tags.begin(),h_tags.end(),d_tags.begin());
290 Gpu::copyAsync(Gpu::hostToDevice,h_ntags.begin(),h_ntags.end(),d_ntags.begin());
291 auto const* ptag = d_tags.data();
292 auto const* pntags = d_ntags.data();
293 auto const nblocks = int(h_ntags.size()-1);
294 constexpr auto nthreads = 256;
295 amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
296#ifdef AMREX_USE_SYCL
297 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
298 [[sycl::reqd_work_group_size(nthreads)]]
299#else
300 [=] AMREX_GPU_DEVICE () noexcept
301#endif
302 {
303#ifdef AMREX_USE_SYCL
304 Dim1 blockIdx{item.get_group_linear_id()};
305 Dim1 threadIdx{item.get_local_linear_id()};
306#endif
307
308 for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
309 auto const tag = ptag[itag];
310 auto ncells = int(tag.dbox.numPts());
311 const auto len = amrex::length(tag.dbox);
312 const auto lo = amrex::lbound(tag.dbox);
313 for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
314 int k = icell / (len.x*len.y);
315 int j = (icell - k*(len.x*len.y)) / len.x;
316 int i = (icell - k*(len.x*len.y)) - j*len.x;
317 i += lo.x;
318 j += lo.y;
319 k += lo.z;
320 for (int n = 0; n < ncomp; ++n) {
321 f(tag.dfab.ptr(i,j,k,n+dcomp),
322 tag.sfab(i + tag.offset.x,
323 j + tag.offset.y,
324 k + tag.offset.z, n+scomp));
325 }
326 }
327
328 if (itag+1 < pntags[blockIdx.x+1]) {
329#ifdef AMREX_USE_SYCL
330 sycl::group_barrier(item.get_group());
331#else
332 __syncthreads();
333#endif
334 }
335 }
336 });
337 Gpu::streamSynchronize();
338}
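// deterministic_fab_to_fab makes the update order independent of GPU thread
// scheduling: destination boxes are chopped into tiles of 64 cells per
// dimension, the (destination index, tile box) pairs are sorted on the host,
// and one GPU block walks all tags landing in the same tile, with a block
// barrier between successive tags.  Overlapping updates to a cell therefore
// always happen in the same sorted order, trading parallelism for
// reproducibility relative to the atomic path.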
339
340#endif /* AMREX_USE_GPU */
341
342}
343
344template <class FAB>
345void
346FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
347{
348 auto const& LocTags = *(TheFB.m_LocTags);
349 auto N_locs = static_cast<int>(LocTags.size());
350 if (N_locs == 0) { return; }
351 bool is_thread_safe = TheFB.m_threadsafe_loc;
352 if (is_thread_safe)
353 {
354#ifdef AMREX_USE_OMP
355#pragma omp parallel for
356#endif
357 for (int i = 0; i < N_locs; ++i)
358 {
359 const CopyComTag& tag = LocTags[i];
360
361 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
362 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
363
364 const FAB* sfab = &(get(tag.srcIndex));
365 FAB* dfab = &(get(tag.dstIndex));
366 dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
367 }
368 }
369 else
370 {
371 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
372 for (int i = 0; i < N_locs; ++i)
373 {
374 const CopyComTag& tag = LocTags[i];
375
376 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
377 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
378
379 loc_copy_tags[tag.dstIndex].push_back
380 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
381 }
382#ifdef AMREX_USE_OMP
383#pragma omp parallel
384#endif
385 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
386 {
387 const auto& tags = loc_copy_tags[mfi];
388 auto dfab = this->array(mfi);
389 for (auto const & tag : tags)
390 {
391 auto const sfab = tag.sfab->array();
392 const auto offset = tag.offset.dim3();
393 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
394 [=] (int i, int j, int k, int n) noexcept
395 {
396 dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
397 });
398 }
399 }
400 }
401}
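// In the thread-safe branch above, each tag is copied independently.  In the
// other branch the tags are first grouped per destination FAB (the
// LayoutData is indexed by MFIter), so each destination FAB is written by
// exactly one OpenMP thread and no two threads race on overlapping boxes.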
402
403template <class FAB>
404void
405FabArray<FAB>::FB_local_add_cpu (const FB& TheFB, int scomp, int ncomp)
406{
407 auto const& LocTags = *(TheFB.m_LocTags);
408 auto N_locs = static_cast<int>(LocTags.size());
409 if (N_locs == 0) { return; }
410
411 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
412 // We must make a temporary copy of the data first to avoid a race condition.
413 std::vector<FAB> src_fabs(N_locs);
414 for (int itag = 0; itag < N_locs; ++itag) {
415 const CopyComTag& tag = LocTags[itag];
416 src_fabs[itag].resize(tag.sbox,ncomp);
417 loc_copy_tags[tag.dstIndex].push_back
418 (FabCopyTag<FAB>{&(src_fabs[itag]),
419 tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
420 }
421
422#ifdef AMREX_USE_OMP
423#pragma omp parallel for
424#endif
425 for (int itag = 0; itag < N_locs; ++itag) {
426 const CopyComTag& tag = LocTags[itag];
427 src_fabs[itag].template copy<RunOn::Host>(this->operator[](tag.srcIndex), scomp, 0, ncomp);
428 }
429
430#ifdef AMREX_USE_OMP
431#pragma omp parallel
432#endif
433 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
434 {
435 const auto& tags = loc_copy_tags[mfi];
436 const auto& dfab = this->array(mfi);
437 for (auto const & tag : tags)
438 {
439 auto const sfab = tag.sfab->array();
440 const auto offset = tag.offset.dim3();
441 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
442 [&] (int i, int j, int k, int n) noexcept
443 {
444 dfab(i,j,k,n+scomp) += sfab(i+offset.x,j+offset.y,k+offset.z,n);
445 });
446 }
447 }
448}
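// FB_local_add_cpu stages every source box in a temporary FAB before adding:
// with += the source and destination regions of different tags can overlap,
// so reading straight from the FabArray while other tags are being added
// would race.  The copies into src_fabs are done in a separate parallel pass
// before any destination is modified.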
449
450#ifdef AMREX_USE_GPU
451
452template <class FAB>
453auto
454FabArray<FAB>::FB_get_local_copy_tag_vector (const FB& TheFB) -> TagVector<Array4CopyTag<value_type>>*
455{
456 auto const& LocTags = *(TheFB.m_LocTags);
457 int N_locs = LocTags.size();
458
459 using TagType = Array4CopyTag<value_type>;
460
461 TagVector<TagType>* tv = nullptr;
462 if (auto it = m_fb_local_copy_handler.find(TheFB.m_id);
463 it != m_fb_local_copy_handler.end())
464 {
465 tv = it->second.get();
466 } else {
467 Vector<TagType> loc_copy_tags;
468 loc_copy_tags.reserve(N_locs);
469
470 for (int i = 0; i < N_locs; ++i)
471 {
472 const CopyComTag& tag = LocTags[i];
473
474 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
475 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
476
477 int li = this->localindex(tag.dstIndex);
478 loc_copy_tags.push_back
479 ({this->atLocalIdx(li).array(), tag.dstIndex,
480 this->fabPtr(tag.srcIndex)->const_array(),
481 tag.dbox,
482 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
483 }
484
485 auto utv = std::make_unique<TagVector<TagType>>(loc_copy_tags);
486 tv = utv.get();
487 m_fb_local_copy_handler[TheFB.m_id] = std::move(utv);
488 }
489
490 return tv;
491}
492
493template <class FAB>
494void
495FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
496{
497 auto const& LocTags = *(TheFB.m_LocTags);
498 int N_locs = LocTags.size();
499 if (N_locs == 0) { return; }
500 bool is_thread_safe = TheFB.m_threadsafe_loc;
501
502 using TagType = Array4CopyTag<value_type>;
503
504 if (is_thread_safe || amrex::IsStoreAtomic<value_type>::value)
505 {
506 auto* tv = FB_get_local_copy_tag_vector(TheFB);
507
508 detail::ParallelFor_doit(*tv,
509 [=] AMREX_GPU_DEVICE (
510 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
511 {
512 if (icell < ncells) {
513 for (int n = 0; n < ncomp; ++n) {
514 tag.dfab(i,j,k,n+scomp) = tag.sfab(i+tag.offset.x,
515 j+tag.offset.y,
516 k+tag.offset.z,n+scomp);
517 }
518 }
519 });
520 }
521 else
522 {
523 Vector<TagType> loc_copy_tags;
524 loc_copy_tags.reserve(N_locs);
525
526 Vector<BaseFab<int>> maskfabs(this->local_size());
527 Vector<Array4Tag<int> > masks_unique;
528 masks_unique.reserve(this->local_size());
529 Vector<Array4Tag<int> > masks;
530 masks.reserve(N_locs);
531
532 for (int i = 0; i < N_locs; ++i)
533 {
534 const CopyComTag& tag = LocTags[i];
535
536 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
537 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
538
539 int li = this->localindex(tag.dstIndex);
540 loc_copy_tags.push_back
541 ({this->atLocalIdx(li).array(), tag.dstIndex,
542 this->fabPtr(tag.srcIndex)->const_array(),
543 tag.dbox,
544 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
545
546 if (!maskfabs[li].isAllocated()) {
547 maskfabs[li].resize(this->atLocalIdx(li).box());
548 masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
549 }
550 masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
551 }
552
553 amrex::ParallelFor(masks_unique,
554 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
555 {
556 msk.dfab(i,j,k) = 0;
557 });
558
559 detail::fab_to_fab_atomic_cpy<value_type, value_type>(
560 loc_copy_tags, scomp, scomp, ncomp, masks);
561 }
562}
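// FB_local_copy_gpu takes the cached TagVector fast path when stores are
// atomic or the tags are thread safe.  Otherwise it allocates one integer
// mask FAB per local destination FAB (maskfabs), zeroes the masks on the
// device, and goes through the masked fab_to_fab_atomic_cpy path so writes
// to overlapping cells are serialized.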
563
564template <class FAB>
565void
566FabArray<FAB>::FB_local_add_gpu (const FB& TheFB, int scomp, int ncomp, bool deterministic)
567{
568 auto const& LocTags = *(TheFB.m_LocTags);
569 int N_locs = LocTags.size();
570 if (N_locs == 0) { return; }
571
572 using TagType = Array4CopyTag<value_type>;
573
574 Vector<TagType> loc_copy_tags_1, loc_copy_tags_2;
575 loc_copy_tags_1.reserve(N_locs);
576 loc_copy_tags_2.reserve(N_locs);
577
578 Vector<FAB> src_fabs(N_locs);
579 for (int itag = 0; itag < N_locs; ++itag) {
580 const CopyComTag& tag = LocTags[itag];
581 src_fabs[itag].resize(tag.sbox,ncomp);
582 loc_copy_tags_1.push_back(
583 TagType{src_fabs[itag].array(), -1,
584 this->const_array(tag.srcIndex,scomp), tag.sbox,
585 Dim3{0,0,0}});
586 loc_copy_tags_2.push_back(
587 TagType{this->array(tag.dstIndex,scomp), tag.dstIndex,
588 src_fabs[itag].const_array(), tag.dbox,
589 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
590 }
591
592 // Note that we have shifted the starting component to zero in the code above.
593
594 // TODO: We could try to cache the tags like in FillBoundary.
595
596 detail::fab_to_fab(loc_copy_tags_1, 0, 0, ncomp,
597 detail::CellStore<value_type,value_type>());
598 if (deterministic || ! amrex::HasAtomicAdd<value_type>::value) {
599 detail::deterministic_fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
600 detail::CellAdd<value_type,value_type>{});
601 } else {
602 if constexpr (amrex::HasAtomicAdd<value_type>::value) {
603 detail::fab_to_fab(loc_copy_tags_2, 0, 0, ncomp,
604 detail::CellAtomicAdd<value_type,value_type>());
605 }
606 ((void)0);
607 }
608
609 // Note that fab_to_fab is synchronous.
610}
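// FB_local_add_gpu uses the same two-phase scheme as the CPU version: first
// a fab_to_fab copy of every source box into temporary FABs (CellStore),
// then either deterministic_fab_to_fab (fixed ordering) or an atomic-add
// fab_to_fab on the destination, depending on the deterministic flag and on
// whether value_type supports atomic adds.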
611
612template <class FAB>
613void
614FabArray<FAB>::CMD_local_setVal_gpu (typename FabArray<FAB>::value_type x,
615 const CommMetaData& thecmd, int scomp, int ncomp)
616{
617 auto const& LocTags = *(thecmd.m_LocTags);
618 int N_locs = LocTags.size();
619 if (N_locs == 0) { return; }
620 bool is_thread_safe = thecmd.m_threadsafe_loc;
621
622 using TagType = Array4BoxTag<value_type>;
623 Vector<TagType> loc_setval_tags;
624 loc_setval_tags.reserve(N_locs);
625
626 AMREX_ALWAYS_ASSERT(amrex::IsStoreAtomic<value_type>::value || is_thread_safe);
627
628 for (int i = 0; i < N_locs; ++i)
629 {
630 const CopyComTag& tag = LocTags[i];
631 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
632 loc_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
633 }
634
635 amrex::ParallelFor(loc_setval_tags, ncomp,
636 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
637 {
638 tag.dfab(i,j,k,n+scomp) = x;
639 });
640}
641
642template <class FAB>
643void
644FabArray<FAB>::CMD_remote_setVal_gpu (typename FabArray<FAB>::value_type x,
645 const CommMetaData& thecmd, int scomp, int ncomp)
646{
647 auto const& RcvTags = *(thecmd.m_RcvTags);
648 bool is_thread_safe = thecmd.m_threadsafe_rcv;
649
650 using TagType = Array4BoxTag<value_type>;
651 Vector<TagType> rcv_setval_tags;
652
653 for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
654 for (auto const& tag: it->second) {
655 rcv_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
656 }
657 }
658
659 if (rcv_setval_tags.empty()) { return; }
660
661 AMREX_ALWAYS_ASSERT(amrex::IsStoreAtomic<value_type>::value || is_thread_safe);
662
663 amrex::ParallelFor(rcv_setval_tags, ncomp,
664 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
665 {
666 tag.dfab(i,j,k,n+scomp) = x;
667 });
668}
669
670#if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
671template <class FAB>
672void
673FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
674{
675 const int N_locs = (*TheFB.m_LocTags).size();
676 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
677 for (int i = 0; i < N_locs; ++i)
678 {
679 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
680
681 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
682 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
683
684 loc_copy_tags[tag.dstIndex].push_back
685 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
686 }
687
688 // Create Graph if one is needed.
689 if ( !(TheFB.m_localCopy.ready()) )
690 {
691 const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);
692
693 int idx = 0;
694 // Record the graph.
695 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
696 {
697 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
698 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
699 (TheFB).m_localCopy.getDevicePtr(0),
700 std::size_t(sizeof(CopyMemory)*N_locs) );
701
702 const auto& tags = loc_copy_tags[mfi];
703 for (auto const & tag : tags)
704 {
705 const auto offset = tag.offset.dim3();
706 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
707 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
708 {
709 // Build the Array4's.
710 auto const dst = cmem->getDst<value_type>();
711 auto const src = cmem->getSrc<value_type>();
712 for (int n = 0; n < cmem->ncomp; ++n) {
713 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
714 }
715 });
716 }
717
718 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
719 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
720 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
721 }
722 }
723
724 // Setup Launch Parameters
725 // This is perfectly threadable, right?
726 // Additional optimization -> Check to see whether values need to be reset?
727 // Can then remove this setup and memcpy from CudaGraph::executeGraph.
728 int idx = 0;
729 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
730 {
731 auto const dst_array = this->array(mfi);
732 const auto& tags = loc_copy_tags[mfi];
733 for (auto const & tag : tags)
734 {
735 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
736 dst_array,
737 scomp, ncomp));
738 }
739 }
740
741 // Launch Graph
742 TheFB.m_localCopy.executeGraph();
743}
744
745#ifdef AMREX_USE_MPI
746template <class FAB>
747void
748FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
749{
750 const int N_locs = TheFB.m_LocTags->size();
751
752 int launches = 0; // Used for graphs only.
753 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
754 for (int i = 0; i < N_locs; ++i)
755 {
756 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
757
758 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
759 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));
760
761 if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
762 {
763 loc_copy_tags[tag.dstIndex].push_back
764 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
765 launches++;
766 }
767 }
768
769 FillBoundary_test();
770
771 if ( !(TheFB.m_localCopy.ready()) )
772 {
773 const_cast<FB&>(TheFB).m_localCopy.resize(launches);
774
775 int idx = 0;
776 int cuda_stream = 0;
777 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
778 {
779 const auto& tags = loc_copy_tags[mfi];
780 for (int t = 0; t<tags.size(); ++t)
781 {
782 Gpu::Device::setStreamIndex(cuda_stream++);
783 amrex::Gpu::Device::startGraphRecording( (idx == 0),
784 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
785 (TheFB).m_localCopy.getDevicePtr(0),
786 std::size_t(sizeof(CopyMemory)*launches) );
787
788 const auto& tag = tags[t];
789 const Dim3 offset = tag.offset.dim3();
790
791 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
792 AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
793 {
794 auto const dst = cmem->getDst<value_type>();
795 auto const src = cmem->getSrc<value_type>();
796 for (int n = 0; n < cmem->ncomp; ++n) {
797 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
798 }
799 });
800
801 bool last_iter = idx == launches;
802 cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
803 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
804 }
805 }
806 }
807
808 // Setup Launch Parameters
809 // This is perfectly threadable, right?
810 int idx = 0;
811 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
812 {
813 const auto& dst_array = this->array(mfi);
814 const auto& tags = loc_copy_tags[mfi];
815 for (auto const & tag : tags)
816 {
817 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
818 dst_array,
819 scomp, ncomp));
820 }
821 }
822
823 // Launch Graph without synch. Local work is entirely independent.
824 TheFB.m_localCopy.executeGraph(false);
825}
826#endif /* AMREX_USE_MPI */
827
828#endif /* __CUDACC__ */
829
830#endif /* AMREX_USE_GPU */
831
832#ifdef AMREX_USE_MPI
833
834#ifdef AMREX_USE_GPU
835
836#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
837
838template <class FAB>
839void
840FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
841 Vector<char*>& send_data,
842 Vector<std::size_t> const& send_size,
843 Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
844{
845 const int N_snds = send_data.size();
846 if (N_snds == 0) { return; }
847
848 if ( !(TheFB.m_copyToBuffer.ready()) )
849 {
850 // Set size of CudaGraph buffer.
851 // Is the conditional ever expected to be false?
852 int launches = 0;
853 for (int send = 0; send < N_snds; ++send) {
854 if (send_size[send] > 0) {
855 launches += send_cctc[send]->size();
856 }
857 }
858 const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);
859
860 // Record the graph.
861 int idx = 0;
862 for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
863 sit.isValid(); ++sit)
864 {
865 amrex::Gpu::Device::startGraphRecording( (sit() == 0),
866 const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
867 (TheFB).m_copyToBuffer.getDevicePtr(0),
868 std::size_t(sizeof(CopyMemory)*launches) );
869
870 const int j = sit();
871 if (send_size[j] > 0)
872 {
873 auto const& cctc = *send_cctc[j];
874 for (auto const& tag : cctc)
875 {
876 const Box& bx = tag.sbox;
877 CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
878 AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
879 {
880 auto const pfab = cmem->getDst<value_type>();
881 auto const sfab = cmem->getSrc<value_type>();
882 for (int n = 0; n < cmem->ncomp; ++n)
883 {
884 pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
885 }
886 });
887 }
888 }
889
890 bool last_iter = sit() == (N_snds-1);
891 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
892 if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
893 }
894 }
895
896 // Setup Launch Parameters
897 int idx = 0;
898 for (int send = 0; send < N_snds; ++send)
899 {
900 const int j = send;
901 if (send_size[j] > 0)
902 {
903 char* dptr = send_data[j];
904 auto const& cctc = *send_cctc[j];
905 for (auto const& tag : cctc)
906 {
907 const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
908 amrex::makeArray4((value_type*)(dptr),
909 tag.sbox,
910 ncomp),
911 scomp, ncomp));
912
913 dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
914 }
915 amrex::ignore_unused(send_size);
916 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
917 }
918 }
919
920 // Launch Graph synched, so copyToBuffer is complete prior to posting sends.
921 TheFB.m_copyToBuffer.executeGraph();
922}
923
924template <class FAB>
925void
926FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
927 Vector<char*> const& recv_data,
928 Vector<std::size_t> const& recv_size,
929 Vector<CopyComTagsContainer const*> const& recv_cctc,
930 bool /*is_thread_safe*/)
931{
932 const int N_rcvs = recv_cctc.size();
933 if (N_rcvs == 0) { return; }
934
935 int launches = 0;
936 LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
937 for (int k = 0; k < N_rcvs; ++k)
938 {
939 if (recv_size[k] > 0)
940 {
941 const char* dptr = recv_data[k];
942 auto const& cctc = *recv_cctc[k];
943 for (auto const& tag : cctc)
944 {
945 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
946 dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
947 launches++;
948 }
949 amrex::ignore_unused(recv_size);
950 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
951 }
952 }
953
954 if ( !(TheFB.m_copyFromBuffer.ready()) )
955 {
956 const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);
957
958 int idx = 0;
959 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
960 {
961 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
962 const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
963 (TheFB).m_copyFromBuffer.getDevicePtr(0),
964 std::size_t(sizeof(CopyMemory)*launches) );
965
966 const auto& tags = recv_copy_tags[mfi];
967 for (auto const & tag : tags)
968 {
969 CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
970 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
971 {
972 auto const pfab = cmem->getSrc<value_type>();
973 auto const dfab = cmem->getDst<value_type>();
974 for (int n = 0; n < cmem->ncomp; ++n)
975 {
976 dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
977 }
978 });
979 }
980
981 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
982 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
983 if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
984 }
985 }
986
987 // Setup graph.
988 int idx = 0;
989 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
990 {
991 auto dst_array = this->array(mfi);
992 const auto & tags = recv_copy_tags[mfi];
993 for (auto const & tag : tags)
994 {
995 const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
996 tag.dbox,
997 ncomp),
998 dst_array,
999 dcomp, ncomp));
1000 }
1001 }
1002
1003 // Launch Graph - synced because next action is freeing recv buffer.
1004 TheFB.m_copyFromBuffer.executeGraph();
1005}
1006
1007#endif /* __CUDACC__ */
1008
1009template <class FAB>
1010template <typename BUF>
1011auto
1012FabArray<FAB>::get_send_copy_tag_vector (Vector<char*> const& send_data,
1013 Vector<std::size_t> const& send_size,
1014 Vector<CopyComTagsContainer const*> const& send_cctc,
1015 int ncomp, std::uint64_t id) const
1016 -> TagVector<CommSendBufTag<value_type>>*
1017{
1018 using TagType = CommSendBufTag<value_type>;
1019
1020 auto kit = std::find_if(send_cctc.begin(), send_cctc.end(),
1021 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1022 if (kit == send_cctc.end()) {
1023 return nullptr;
1024 }
1025
1026 auto get_tags = [&] () -> Vector<TagType>
1027 {
1028 Vector<TagType> snd_copy_tags;
1029 char* pbuf = send_data[0];
1030 const int N_snds = send_data.size();
1031 for (int j = 0; j < N_snds; ++j)
1032 {
1033 if (send_size[j] > 0)
1034 {
1035 char* dptr = send_data[j];
1036 auto const& cctc = *send_cctc[j];
1037 for (auto const& tag : cctc)
1038 {
1039 snd_copy_tags.emplace_back
1040 (TagType{this->const_array(tag.srcIndex), dptr-pbuf, tag.sbox});
1041 dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
1042 }
1043 }
1044 }
1045 return snd_copy_tags;
1046 };
1047
1048 TagVector<TagType>* tv = nullptr;
1049 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1050
1051 if (auto it = m_send_copy_handler.find(key); it != m_send_copy_handler.end()) {
1052 tv = it->second.get();
1053 } else {
1054 if (m_send_copy_handler.size() > 32) {
1055 // Just in case. If this is used in ParallelCopy, it's possible
1056 // that the sending FabArray is the same, but the receiving
1057 // FabArray is different every time. Then the size of this map
1058 // could increase indefinitely.
1059 m_send_copy_handler.clear();
1060 }
1061 auto snd_copy_tags = get_tags();
1062 auto utv = std::make_unique<TagVector<TagType>>(snd_copy_tags);
1063 tv = utv.get();
1064 m_send_copy_handler[key] = std::move(utv);
1065 }
1066
1067 return tv;
1068}
1069
1070template <class FAB>
1071template <typename BUF>
1072void
1073FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
1074 Vector<char*> const& send_data,
1075 Vector<std::size_t> const& send_size,
1076 Vector<CopyComTagsContainer const*> const& send_cctc,
1077 std::uint64_t id)
1078{
1079 const int N_snds = send_data.size();
1080 if (N_snds == 0) { return; }
1081
1082 using TagType = CommSendBufTag<value_type>;
1083
1084 auto* tv = src.template get_send_copy_tag_vector<BUF>
1085 (send_data, send_size, send_cctc, ncomp, id);
1086 if (tv == nullptr) { return; }
1087
1088 char* pbuffer = send_data[0];
1089
1090 detail::ParallelFor_doit(*tv,
1091 [=] AMREX_GPU_DEVICE (
1092 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1093 {
1094 if (icell < ncells) {
1095 Array4<BUF> dfab{(BUF*)(pbuffer+tag.poff),
1096 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1097 for (int n = 0; n < ncomp; ++n) {
1098 dfab(i,j,k,n) = (BUF)tag.sfab(i,j,k,n+scomp);
1099 }
1100 }
1101 });
1102
1103 Gpu::streamSynchronize();
1104}
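// pack_send_buffer_gpu writes each tag's box into the contiguous send buffer
// at offset tag.poff, converting to the (possibly narrower) BUF type.  The
// resulting layout (boxes in tag order, ncomp components each, sizeof(BUF)
// per value) is exactly what the unpack_recv_buffer_* routines assume on the
// receiving rank.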
1105
1106template <class FAB>
1107template <typename BUF>
1108auto
1109FabArray<FAB>::get_recv_copy_tag_vector (Vector<char*> const& recv_data,
1110 Vector<std::size_t> const& recv_size,
1111 Vector<CopyComTagsContainer const*> const& recv_cctc,
1112 int ncomp, std::uint64_t id)
1113 -> TagVector<CommRecvBufTag<value_type>>*
1114{
1115 using TagType = CommRecvBufTag<value_type>;
1116
1117 auto kit = std::find_if(recv_cctc.begin(), recv_cctc.end(),
1118 [] (CopyComTagsContainer const* p) { return p != nullptr; });
1119 if (kit == recv_cctc.end()) {
1120 return nullptr;
1121 }
1122
1123 auto get_tags = [&] () -> Vector<TagType>
1124 {
1125 Vector<TagType> recv_copy_tags;
1126 char* pbuf = recv_data[0];
1127 const int N_rcvs = recv_cctc.size();
1128 for (int k = 0; k < N_rcvs; ++k)
1129 {
1130 if (recv_size[k] > 0)
1131 {
1132 char* dptr = recv_data[k];
1133 auto const& cctc = *recv_cctc[k];
1134 for (auto const& tag : cctc)
1135 {
1136 const int li = this->localindex(tag.dstIndex);
1137 recv_copy_tags.emplace_back
1138 (TagType{this->atLocalIdx(li).array(), dptr-pbuf, tag.dbox});
1139 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1140 }
1141 }
1142 }
1143 return recv_copy_tags;
1144 };
1145
1146 TagVector<TagType>* tv = nullptr;
1147 std::tuple<std::uint64_t,std::size_t,int> key{id, sizeof(BUF), ncomp};
1148
1149 if (auto it = m_recv_copy_handler.find(key); it != m_recv_copy_handler.end()) {
1150 tv = it->second.get();
1151 } else {
1152 if (m_recv_copy_handler.size() > 32) {
1153 // Just in case. If this is used in ParallelCopy, it's possible
1154 // that the receiving FabArray is the same, but the sending
1155 // FabArray is different every time. Then the size of this map
1156 // could increase indefinitely.
1157 m_recv_copy_handler.clear();
1158 }
1159 auto recv_copy_tags = get_tags();
1160 auto utv = std::make_unique<TagVector<TagType>>(recv_copy_tags);
1161 tv = utv.get();
1162 m_recv_copy_handler[key] = std::move(utv);
1163 }
1164
1165 return tv;
1166}
1167
1168template <class FAB>
1169template <typename BUF>
1170void
1171FabArray<FAB>::unpack_recv_buffer_gpu (FabArray<FAB>& dst, int dcomp, int ncomp,
1172 Vector<char*> const& recv_data,
1173 Vector<std::size_t> const& recv_size,
1174 Vector<CopyComTagsContainer const*> const& recv_cctc,
1175 CpOp op, bool is_thread_safe, std::uint64_t id,
1176 bool deterministic)
1177{
1178 const int N_rcvs = recv_cctc.size();
1179 if (N_rcvs == 0) { return; }
1180
1181 bool use_mask = false;
1182 if (!is_thread_safe)
1183 {
1184 if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
1185 (op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))
1186 {
1187 use_mask = true;
1188 }
1189 }
1190
1191 if (deterministic)
1192 {
1193 AMREX_ALWAYS_ASSERT(op == FabArrayBase::ADD); // Only ADD for now
1194 using TagType = Array4CopyTag<value_type,BUF>;
1195 Vector<TagType> tags;
1196 tags.reserve(N_rcvs);
1197 for (int k = 0; k < N_rcvs; ++k) {
1198 if (recv_size[k] > 0) {
1199 char const* dptr = recv_data[k];
1200 auto const& cctc = *recv_cctc[k];
1201 for (auto const& tag : cctc) {
1202 tags.emplace_back(
1203 TagType{dst.array(tag.dstIndex), tag.dstIndex,
1204 Array4<BUF const>((BUF const*)dptr,
1205 amrex::begin(tag.dbox),
1206 amrex::end(tag.dbox), ncomp),
1207 tag.dbox, Dim3{0,0,0}});
1208 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1209 }
1210 }
1211 }
1213 detail::deterministic_fab_to_fab<value_type,BUF>
1214 (tags, 0, dcomp, ncomp, detail::CellAdd<value_type,BUF>{});
1215 } else {
1216 amrex::Abort("SumBoundary requires operator+=");
1217 }
1218 }
1219 else if (!use_mask)
1220 {
1221 using TagType = CommRecvBufTag<value_type>;
1222 auto* tv = dst.template get_recv_copy_tag_vector<BUF>
1223 (recv_data, recv_size, recv_cctc, ncomp, id);
1224 if (tv == nullptr) { return; }
1225
1226 char* pbuffer = recv_data[0];
1227
1228 if (op == FabArrayBase::COPY)
1229 {
1230 detail::ParallelFor_doit(*tv,
1231 [=] AMREX_GPU_DEVICE (
1232 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1233 {
1234 if (icell < ncells) {
1235 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1236 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1237 for (int n = 0; n < ncomp; ++n) {
1238 tag.dfab(i,j,k,n+dcomp) = (value_type)sfab(i,j,k,n);
1239 }
1240 }
1241 });
1242 }
1243 else
1244 {
1245 if (is_thread_safe) {
1246 detail::ParallelFor_doit(*tv,
1247 [=] AMREX_GPU_DEVICE (
1248 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1249 {
1250 if (icell < ncells) {
1251 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1252 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1253 for (int n = 0; n < ncomp; ++n) {
1254 tag.dfab(i,j,k,n+dcomp) += (value_type)sfab(i,j,k,n);
1255 }
1256 }
1257 });
1258 } else {
1259 if constexpr (amrex::HasAtomicAdd<value_type>::value) {
1260 detail::ParallelFor_doit(*tv,
1261 [=] AMREX_GPU_DEVICE (
1262 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
1263 {
1264 if (icell < ncells) {
1265 Array4<BUF const> sfab{(BUF const*)(pbuffer+tag.poff),
1266 amrex::begin(tag.bx), amrex::end(tag.bx), ncomp};
1267 for (int n = 0; n < ncomp; ++n) {
1268 Gpu::Atomic::AddNoRet(tag.dfab.ptr(i,j,k,n+dcomp),
1269 (value_type)sfab(i,j,k,n));
1270 }
1271 }
1272 });
1273 } else {
1274 amrex::Abort("unpack_recv_buffer_gpu: should NOT get here");
1275 }
1276 }
1277 }
1278 Gpu::streamSynchronize();
1279 }
1280 else
1281 {
1282 char* pbuffer = recv_data[0];
1283
1284 using TagType = Array4CopyTag<value_type, BUF>;
1285 Vector<TagType> recv_copy_tags;
1286 recv_copy_tags.reserve(N_rcvs);
1287
1288 Vector<BaseFab<int> > maskfabs(dst.local_size());
1289 Vector<Array4Tag<int> > masks_unique;
1290 masks_unique.reserve(dst.local_size());
1291 Vector<Array4Tag<int> > masks;
1292
1293 for (int k = 0; k < N_rcvs; ++k)
1294 {
1295 if (recv_size[k] > 0)
1296 {
1297 std::size_t offset = recv_data[k]-recv_data[0];
1298 const char* dptr = pbuffer + offset;
1299 auto const& cctc = *recv_cctc[k];
1300 for (auto const& tag : cctc)
1301 {
1302 const int li = dst.localindex(tag.dstIndex);
1303 recv_copy_tags.emplace_back(TagType{
1304 dst.atLocalIdx(li).array(), tag.dstIndex,
1305 amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
1306 tag.dbox,
1307 Dim3{0,0,0}
1308 });
1309 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1310
1311 if (!maskfabs[li].isAllocated()) {
1312 maskfabs[li].resize(dst.atLocalIdx(li).box());
1313 masks_unique.emplace_back(Array4Tag<int>{maskfabs[li].array()});
1314 }
1315 masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
1316 }
1317 BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
1318 }
1319 }
1320
1321 amrex::ParallelFor(masks_unique,
1322 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
1323 {
1324 msk.dfab(i,j,k) = 0;
1325 });
1326
1327 if (op == FabArrayBase::COPY)
1328 {
1329 detail::fab_to_fab_atomic_cpy<value_type, BUF>(
1330 recv_copy_tags, 0, dcomp, ncomp, masks);
1331 }
1332 else
1333 {
1334 detail::fab_to_fab_atomic_add<value_type, BUF>(
1335 recv_copy_tags, 0, dcomp, ncomp, masks);
1336 }
1337
1338 // There is Gpu::streamSynchronize in fab_to_fab.
1339 }
1340}
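// unpack_recv_buffer_gpu picks one of three strategies: a deterministic
// tiled add (ADD only), a direct copy or atomic add straight out of the
// receive buffer when that is safe, or the mask-based fab_to_fab paths when
// neither plain stores nor atomic adds of value_type are safe for
// overlapping destinations.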
1341
1342#endif /* AMREX_USE_GPU */
1343
1344template <class FAB>
1345template <typename BUF>
1346void
1347FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
1348 Vector<char*> const& send_data,
1349 Vector<std::size_t> const& send_size,
1350 Vector<CopyComTagsContainer const*> const& send_cctc)
1351{
1352 amrex::ignore_unused(send_size);
1353
1354 auto const N_snds = static_cast<int>(send_data.size());
1355 if (N_snds == 0) { return; }
1356
1357#ifdef AMREX_USE_OMP
1358#pragma omp parallel for
1359#endif
1360 for (int j = 0; j < N_snds; ++j)
1361 {
1362 if (send_size[j] > 0)
1363 {
1364 char* dptr = send_data[j];
1365 auto const& cctc = *send_cctc[j];
1366 for (auto const& tag : cctc)
1367 {
1368 const Box& bx = tag.sbox;
1369 auto const sfab = src.array(tag.srcIndex);
1370 auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
1371 amrex::LoopConcurrentOnCpu( bx, ncomp,
1372 [=] (int ii, int jj, int kk, int n) noexcept
1373 {
1374 pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
1375 });
1376 dptr += (bx.numPts() * ncomp * sizeof(BUF));
1377 }
1378 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
1379 }
1380 }
1381}
1382
1383template <class FAB>
1384template <typename BUF>
1385void
1386FabArray<FAB>::unpack_recv_buffer_cpu (FabArray<FAB>& dst, int dcomp, int ncomp,
1387 Vector<char*> const& recv_data,
1388 Vector<std::size_t> const& recv_size,
1389 Vector<CopyComTagsContainer const*> const& recv_cctc,
1390 CpOp op, bool is_thread_safe)
1391{
1392 amrex::ignore_unused(recv_size);
1393
1394 auto const N_rcvs = static_cast<int>(recv_cctc.size());
1395 if (N_rcvs == 0) { return; }
1396
1397 if (is_thread_safe)
1398 {
1399#ifdef AMREX_USE_OMP
1400#pragma omp parallel for
1401#endif
1402 for (int k = 0; k < N_rcvs; ++k)
1403 {
1404 if (recv_size[k] > 0)
1405 {
1406 const char* dptr = recv_data[k];
1407 auto const& cctc = *recv_cctc[k];
1408 for (auto const& tag : cctc)
1409 {
1410 const Box& bx = tag.dbox;
1411 FAB& dfab = dst[tag.dstIndex];
1412 if (op == FabArrayBase::COPY)
1413 {
1414 dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
1415 }
1416 else
1417 {
1418 dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
1419 }
1420 dptr += bx.numPts() * ncomp * sizeof(BUF);
1421 }
1422 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1423 }
1424 }
1425 }
1426 else
1427 {
1428 LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
1429 recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
1430 for (int k = 0; k < N_rcvs; ++k)
1431 {
1432 if (recv_size[k] > 0)
1433 {
1434 const char* dptr = recv_data[k];
1435 auto const& cctc = *recv_cctc[k];
1436 for (auto const& tag : cctc)
1437 {
1438 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
1439 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
1440 }
1441 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1442 }
1443 }
1444
1445#ifdef AMREX_USE_OMP
1446#pragma omp parallel
1447#endif
1448 for (MFIter mfi(dst); mfi.isValid(); ++mfi)
1449 {
1450 const auto& tags = recv_copy_tags[mfi];
1451 auto dfab = dst.array(mfi);
1452 for (auto const & tag : tags)
1453 {
1454 auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
1455 if (op == FabArrayBase::COPY)
1456 {
1457 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1458 [=] (int i, int j, int k, int n) noexcept
1459 {
1460 dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1461 });
1462 }
1463 else
1464 {
1465 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1466 [=] (int i, int j, int k, int n) noexcept
1467 {
1468 dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
1469 });
1470 }
1471 }
1472 }
1473 }
1474}
1475
1476#endif /* AMREX_USE_MPI */
1477
1478#endif /* AMREX_FBI_H_ */