Block-Structured AMR Software Framework
 
AMReX_FBI.H
Go to the documentation of this file.
1#ifndef AMREX_FBI_H_
2#define AMREX_FBI_H_
3
4template <class FAB>
5struct FabCopyTag {
6 FAB const* sfab;
7 Box dbox;
8 IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
9};
10
11struct VoidCopyTag {
12 char const* p;
13 Box dbox;
14};
15
16namespace detail {
17
18#ifdef AMREX_USE_GPU
19
20template <class T0, class T1>
21struct CellStore
22{
23 AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
24 operator() (T0* d, T1 s) const noexcept
25 {
26 *d = static_cast<T0>(s);
27 }
28};
29
30template <class T0, class T1>
31struct CellAdd
32{
33 AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
34 operator() (T0* d, T1 s) const noexcept
35 {
36 *d += static_cast<T0>(s);
37 }
38};
39
40template <class T0, class T1>
41struct CellAtomicAdd
42{
43 template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,int> = 0>
44 AMREX_GPU_DEVICE AMREX_FORCE_INLINE void
45 operator() (U0* d, T1 s) const noexcept
46 {
47 Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
48 }
49};
50
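// Note: the cell functors above (CellStore, CellAdd, CellAtomicAdd) define what happens at a
// single destination cell. fab_to_fab below applies such a functor to every cell of every tagged
// destination box, reading the source at the stored offset. Illustrative use only (not a call
// made in this file): detail::fab_to_fab<amrex::Real, amrex::Real>(tags, scomp, dcomp, ncomp,
// detail::CellAdd<amrex::Real, amrex::Real>{}) would accumulate source into destination over all
// tagged regions.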
51template <class T0, class T1, class F>
52void
53fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
54 F && f)
55{
56 detail::ParallelFor_doit(copy_tags,
57 [=] AMREX_GPU_DEVICE (
58#ifdef AMREX_USE_SYCL
59 sycl::nd_item<1> const& /*item*/,
60#endif
61 int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const tag) noexcept
62 {
63 if (icell < ncells) {
64 for (int n = 0; n < ncomp; ++n) {
65 f(&(tag.dfab(i,j,k,n+dcomp)),
66 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
67 }
68 }
69 });
70}
71
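// Masked variant: when the destination type supports neither atomic stores nor atomic adds, each
// destination FAB carries an int mask used as a per-cell lock. A warp tries to acquire the lock
// with an atomic CAS plus warp-wide votes, applies the functor, then releases the lock. Note the
// amrex::Abort below: this path is currently flagged as buggy and effectively disabled.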
72template <class T0, class T1, class F>
73void
74fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
75 int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
76{
77 using TagType = Array4MaskCopyTag<T0, T1>;
78 Vector<TagType> tags;
79 const int N = copy_tags.size();
80 tags.reserve(N);
81 for (int i = 0; i < N; ++i) {
82 tags.push_back(TagType{copy_tags[i].dfab, copy_tags[i].sfab, masks[i].dfab,
83 copy_tags[i].dbox, copy_tags[i].offset});
84 }
85
86 amrex::Abort("xxxxx TODO This function still has a bug. Even if we fix the bug, it should still be avoided because it is slow due to the lack of atomic operations for this type.");
87
88 detail::ParallelFor_doit(tags,
89 [=] AMREX_GPU_DEVICE (
90#ifdef AMREX_USE_SYCL
91 sycl::nd_item<1> const& item,
92#endif
93 int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
94 {
95#ifdef AMREX_USE_SYCL
96 int g_tid = item.get_global_id(0);
97 int g_wid = g_tid / Gpu::Device::warp_size;
98
99 int* m = (icell < ncells) ? tag.mask.ptr(i,j,k) : nullptr;
100 int mypriority = g_wid+1;
101 int to_try = 1;
102 while (true) {
103 int msk = (m && to_try) ? Gpu::Atomic::CAS(m, 0, mypriority) : 0;
104 if (sycl::all_of_group(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
105 break; // all threads have acquired.
106 } else {
107 if (sycl::any_of_group(item.get_sub_group(), msk > mypriority)) {
108 if (m) { *m = 0; } // yield
109 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
110 to_try = 1;
111 } else {
112 to_try = (msk > 0); // hold on to my lock
113 }
114 }
115 };
116
117 if (icell < ncells) {
118 for (int n = 0; n < ncomp; ++n) {
119 f(&(tag.dfab(i,j,k,n+dcomp)),
120 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
121 }
122 }
123
124 if (m) *m = 0;
125
126#else
127
128 int g_tid = blockDim.x*blockIdx.x + threadIdx.x;
129 int g_wid = g_tid / Gpu::Device::warp_size;
130
131 int* m = (icell < ncells) ? tag.mask.ptr(i,j,k) : nullptr;
132 int mypriority = g_wid+1;
133 int to_try = 1;
134 while (true) {
135 int msk = (m && to_try) ? atomicCAS(m, 0, mypriority) : 0;
136#ifdef AMREX_USE_CUDA
137 if (__all_sync(0xffffffff, msk == 0)) { // 0 means lock acquired
138#elif defined(AMREX_USE_HIP)
139 if (__all(msk == 0)) {
140#endif
141 break; // all threads have acquired.
142 } else {
143#ifdef AMREX_USE_CUDA
144 if (__any_sync(0xffffffff, msk > mypriority)) {
145#elif defined(AMREX_USE_HIP)
146 if (__any(msk > mypriority)) {
147#endif
148 if (m) *m = 0; // yield
149 __threadfence();
150 to_try = 1;
151 } else {
152 to_try = (msk > 0); // hold on to my lock
153 }
154 }
155 };
156
157 if (icell < ncells) {
158 for (int n = 0; n < ncomp; ++n) {
159 f(&(tag.dfab(i,j,k,n+dcomp)),
160 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
161 }
162 }
163
164 if (m) *m = 0;
165#endif
166 });
167}
168
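// The *_atomic_cpy and *_atomic_add helpers below select between the plain and the masked
// fab_to_fab at compile time: if the type has an atomic store (or atomic add), the mask vector is
// ignored and the unmasked kernel is used; otherwise the masked locking path above is taken.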
169template <typename T0, typename T1,
170 std::enable_if_t<amrex::IsStoreAtomic<T0>::value,int> = 0>
171void
172fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
173 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
174{
175 fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
176}
177
178template <typename T0, typename T1,
179 std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,int> = 0>
180void
181fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
182 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
183{
184 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
185}
186
187template <typename T0, typename T1,
188 std::enable_if_t<amrex::HasAtomicAdd<T0>::value,int> = 0>
189void
190fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
191 int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
192{
193 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
194}
195
196template <typename T0, typename T1,
197 std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,int> = 0>
198void
199fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
200 int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
201{
202 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
203}
204
205#endif /* AMREX_USE_GPU */
206
207}
208
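// Local (intra-process) FillBoundary copy on the host. If the FB is thread safe, the copy tags
// are processed in an OpenMP-parallel loop with FAB::copy. Otherwise the tags are first grouped
// by destination grid in a LayoutData so that each destination box is written by a single thread.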
209template <class FAB>
210void
211FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
212{
213 auto const& LocTags = *(TheFB.m_LocTags);
214 auto N_locs = static_cast<int>(LocTags.size());
215 if (N_locs == 0) { return; }
216 bool is_thread_safe = TheFB.m_threadsafe_loc;
217 if (is_thread_safe)
218 {
219#ifdef AMREX_USE_OMP
220#pragma omp parallel for
221#endif
222 for (int i = 0; i < N_locs; ++i)
223 {
224 const CopyComTag& tag = LocTags[i];
225
226 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
227 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
228
229 const FAB* sfab = &(get(tag.srcIndex));
230 FAB* dfab = &(get(tag.dstIndex));
231 dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
232 }
233 }
234 else
235 {
236 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
237 for (int i = 0; i < N_locs; ++i)
238 {
239 const CopyComTag& tag = LocTags[i];
240
241 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
242 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
243
244 loc_copy_tags[tag.dstIndex].push_back
245 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
246 }
247#ifdef AMREX_USE_OMP
248#pragma omp parallel
249#endif
250 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
251 {
252 const auto& tags = loc_copy_tags[mfi];
253 auto dfab = this->array(mfi);
254 for (auto const & tag : tags)
255 {
256 auto const sfab = tag.sfab->array();
257 const auto offset = tag.offset.dim3();
258 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
259 [=] (int i, int j, int k, int n) noexcept
260 {
261 dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
262 });
263 }
264 }
265 }
266}
267
268#ifdef AMREX_USE_GPU
269
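// Local FillBoundary copy on the device. The copy tags are flattened into Array4CopyTags and
// handed to a single fused kernel. Integer mask FABs are allocated only when the value type has
// no atomic store and the FB is not thread safe, in which case the masked locking path is used.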
270template <class FAB>
271void
272FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
273{
274 auto const& LocTags = *(TheFB.m_LocTags);
275 int N_locs = LocTags.size();
276 if (N_locs == 0) { return; }
277 bool is_thread_safe = TheFB.m_threadsafe_loc;
278
279 using TagType = Array4CopyTag<value_type>;
280 Vector<TagType> loc_copy_tags;
281 loc_copy_tags.reserve(N_locs);
282
283 Vector<BaseFab<int> > maskfabs;
284 Vector<Array4Tag<int> > masks;
285 if (!amrex::IsStoreAtomic<value_type>::value && !is_thread_safe)
286 {
287 maskfabs.resize(this->local_size());
288 masks.reserve(N_locs);
289 }
290
291 for (int i = 0; i < N_locs; ++i)
292 {
293 const CopyComTag& tag = LocTags[i];
294
295 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
296 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
297
298 int li = this->localindex(tag.dstIndex);
299 loc_copy_tags.push_back
300 ({this->atLocalIdx(li).array(),
301 this->fabPtr(tag.srcIndex)->const_array(),
302 tag.dbox,
303 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
304
305 if (maskfabs.size() > 0) {
306 if (!maskfabs[li].isAllocated()) {
307 maskfabs[li].resize(this->atLocalIdx(li).box());
308 }
309 masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
310 }
311 }
312
313 if (maskfabs.size() > 0) {
314 amrex::ParallelFor(masks,
315 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
316 {
317 msk.dfab(i,j,k) = 0;
318 });
319 }
320
321 if (is_thread_safe) {
322 detail::fab_to_fab<value_type, value_type>(loc_copy_tags, scomp, scomp,
323 ncomp, detail::CellStore<value_type, value_type>());
324 } else {
325 detail::fab_to_fab_atomic_cpy<value_type, value_type>(
326 loc_copy_tags, scomp, scomp, ncomp, masks);
327 }
328}
329
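// The two setVal helpers below assign the constant x on the device: the first covers the local
// destination boxes (m_LocTags) of a CommMetaData, the second the receive-side boxes (m_RcvTags).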
330template <class FAB>
331void
332FabArray<FAB>::CMD_local_setVal_gpu (typename FabArray<FAB>::value_type x,
333 const CommMetaData& thecmd, int scomp, int ncomp)
334{
335 auto const& LocTags = *(thecmd.m_LocTags);
336 int N_locs = LocTags.size();
337 if (N_locs == 0) { return; }
338 bool is_thread_safe = thecmd.m_threadsafe_loc;
339
340 using TagType = Array4BoxTag<value_type>;
341 Vector<TagType> loc_setval_tags;
342 loc_setval_tags.reserve(N_locs);
343
344 AMREX_ALWAYS_ASSERT(amrex::IsStoreAtomic<value_type>::value || is_thread_safe);
345
346 for (int i = 0; i < N_locs; ++i)
347 {
348 const CopyComTag& tag = LocTags[i];
349 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
350 loc_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
351 }
352
353 amrex::ParallelFor(loc_setval_tags, ncomp,
354 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
355 {
356 tag.dfab(i,j,k,n+scomp) = x;
357 });
358}
359
360template <class FAB>
361void
362FabArray<FAB>::CMD_remote_setVal_gpu (typename FabArray<FAB>::value_type x,
363 const CommMetaData& thecmd, int scomp, int ncomp)
364{
365 auto const& RcvTags = *(thecmd.m_RcvTags);
366 bool is_thread_safe = thecmd.m_threadsafe_rcv;
367
368 using TagType = Array4BoxTag<value_type>;
369 Vector<TagType> rcv_setval_tags;
370
371 for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
372 for (auto const& tag: it->second) {
373 rcv_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
374 }
375 }
376
377 if (rcv_setval_tags.empty()) { return; }
378
379 AMREX_ALWAYS_ASSERT(amrex::IsStoreAtomic<value_type>::value || is_thread_safe);
380
381 amrex::ParallelFor(rcv_setval_tags, ncomp,
382 [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
383 {
384 tag.dfab(i,j,k,n+scomp) = x;
385 });
386}
387
388#if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
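// CUDA-graph variant of the local copy. On first use the per-box copy kernels are recorded into a
// CUDA graph stored in the FB; on later calls only the CopyMemory parameters (source/destination
// Array4s, scomp, ncomp) are refreshed and the recorded graph is replayed.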
389template <class FAB>
390void
391FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
392{
393 const int N_locs = (*TheFB.m_LocTags).size();
394 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
395 for (int i = 0; i < N_locs; ++i)
396 {
397 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
398
399 BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
400 BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
401
402 loc_copy_tags[tag.dstIndex].push_back
403 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
404 }
405
406 // Create Graph if one is needed.
407 if ( !(TheFB.m_localCopy.ready()) )
408 {
409 const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);
410
411 int idx = 0;
412 // Record the graph.
413 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
414 {
415 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
416 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
417 (TheFB).m_localCopy.getDevicePtr(0),
418 std::size_t(sizeof(CopyMemory)*N_locs) );
419
420 const auto& tags = loc_copy_tags[mfi];
421 for (auto const & tag : tags)
422 {
423 const auto offset = tag.offset.dim3();
424 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
425 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
426 {
427 // Build the Array4's.
428 auto const dst = cmem->getDst<value_type>();
429 auto const src = cmem->getSrc<value_type>();
430 for (int n = 0; n < cmem->ncomp; ++n) {
431 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
432 }
433 });
434 }
435
436 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
437 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
438 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
439 }
440 }
441
442 // Setup Launch Parameters
443 // This is perfectly threadable, right?
444 // Additional optimization -> Check to see whether values need to be reset?
445 // Can then remove this setup and memcpy from CudaGraph::executeGraph.
446 int idx = 0;
447 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
448 {
449 auto const dst_array = this->array(mfi);
450 const auto& tags = loc_copy_tags[mfi];
451 for (auto const & tag : tags)
452 {
453 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
454 dst_array,
455 scomp, ncomp));
456 }
457 }
458
459 // Launch Graph
460 TheFB.m_localCopy.executeGraph();
461}
462
463#ifdef AMREX_USE_MPI
464template <class FAB>
465void
466FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
467{
468 const int N_locs = TheFB.m_LocTags->size();
469
470 int launches = 0; // Used for graphs only.
471 LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
472 for (int i = 0; i < N_locs; ++i)
473 {
474 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
475
476 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
477 BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));
478
479 if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
480 {
481 loc_copy_tags[tag.dstIndex].push_back
482 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
483 launches++;
484 }
485 }
486
487 FillBoundary_test();
488
489 if ( !(TheFB.m_localCopy.ready()) )
490 {
491 const_cast<FB&>(TheFB).m_localCopy.resize(launches);
492
493 int idx = 0;
494 int cuda_stream = 0;
495 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
496 {
497 const auto& tags = loc_copy_tags[mfi];
498 for (int t = 0; t<tags.size(); ++t)
499 {
500 Gpu::Device::setStreamIndex(cuda_stream++);
501 amrex::Gpu::Device::startGraphRecording( (idx == 0),
502 const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
503 (TheFB).m_localCopy.getDevicePtr(0),
504 std::size_t(sizeof(CopyMemory)*launches) );
505
506 const auto& tag = tags[t];
507 const Dim3 offset = tag.offset.dim3();
508
509 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
510 AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
511 {
512 auto const dst = cmem->getDst<value_type>();
513 auto const src = cmem->getSrc<value_type>();
514 for (int n = 0; n < cmem->ncomp; ++n) {
515 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
516 }
517 });
518
519 bool last_iter = idx == launches;
520 cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
521 if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
522 }
523 }
524 }
525
526 // Setup Launch Parameters
527 // This is perfectly threadable, right?
528 int idx = 0;
529 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
530 {
531 const auto& dst_array = this->array(mfi);
532 const auto& tags = loc_copy_tags[mfi];
533 for (auto const & tag : tags)
534 {
535 const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
536 dst_array,
537 scomp, ncomp));
538 }
539 }
540
541 // Launch Graph without synch. Local work is entirely independent.
542 TheFB.m_localCopy.executeGraph(false);
543}
544#endif /* AMREX_USE_MPI */
545
546#endif /* __CUDACC__ */
547
548#endif /* AMREX_USE_GPU */
549
550#ifdef AMREX_USE_MPI
551
552#ifdef AMREX_USE_GPU
553
554#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
555
556template <class FAB>
557void
558FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
559 Vector<char*>& send_data,
560 Vector<std::size_t> const& send_size,
561 Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
562{
563 const int N_snds = send_data.size();
564 if (N_snds == 0) { return; }
565
566 if ( !(TheFB.m_copyToBuffer.ready()) )
567 {
568 // Set size of CudaGraph buffer.
569 // Is the conditional ever expected false?
570 int launches = 0;
571 for (int send = 0; send < N_snds; ++send) {
572 if (send_size[send] > 0) {
573 launches += send_cctc[send]->size();
574 }
575 }
576 const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);
577
578 // Record the graph.
579 int idx = 0;
580 for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
581 sit.isValid(); ++sit)
582 {
583 amrex::Gpu::Device::startGraphRecording( (sit() == 0),
584 const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
585 (TheFB).m_copyToBuffer.getDevicePtr(0),
586 std::size_t(sizeof(CopyMemory)*launches) );
587
588 const int j = sit();
589 if (send_size[j] > 0)
590 {
591 auto const& cctc = *send_cctc[j];
592 for (auto const& tag : cctc)
593 {
594 const Box& bx = tag.sbox;
595 CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
596 AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
597 {
598 auto const pfab = cmem->getDst<value_type>();
599 auto const sfab = cmem->getSrc<value_type>();
600 for (int n = 0; n < cmem->ncomp; ++n)
601 {
602 pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
603 }
604 });
605 }
606 }
607
608 bool last_iter = sit() == (N_snds-1);
609 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
610 if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
611 }
612 }
613
614 // Setup Launch Parameters
615 int idx = 0;
616 for (int send = 0; send < N_snds; ++send)
617 {
618 const int j = send;
619 if (send_size[j] > 0)
620 {
621 char* dptr = send_data[j];
622 auto const& cctc = *send_cctc[j];
623 for (auto const& tag : cctc)
624 {
625 const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
626 amrex::makeArray4((value_type*)(dptr),
627 tag.sbox,
628 ncomp),
629 scomp, ncomp));
630
631 dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
632 }
633 amrex::ignore_unused(send_size);
634 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
635 }
636 }
637
638 // Launch Graph synched, so copyToBuffer is complete prior to posting sends.
639 TheFB.m_copyToBuffer.executeGraph();
640}
641
642template <class FAB>
643void
644FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
645 Vector<char*> const& recv_data,
646 Vector<std::size_t> const& recv_size,
647 Vector<CopyComTagsContainer const*> const& recv_cctc,
648 bool /*is_thread_safe*/)
649{
650 const int N_rcvs = recv_cctc.size();
651 if (N_rcvs == 0) { return; }
652
653 int launches = 0;
654 LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
655 for (int k = 0; k < N_rcvs; ++k)
656 {
657 if (recv_size[k] > 0)
658 {
659 const char* dptr = recv_data[k];
660 auto const& cctc = *recv_cctc[k];
661 for (auto const& tag : cctc)
662 {
663 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
664 dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
665 launches++;
666 }
667 amrex::ignore_unused(recv_size);
668 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
669 }
670 }
671
672 if ( !(TheFB.m_copyFromBuffer.ready()) )
673 {
674 const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);
675
676 int idx = 0;
677 for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
678 {
679 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
680 const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
681 (TheFB).m_copyFromBuffer.getDevicePtr(0),
682 std::size_t(sizeof(CopyMemory)*launches) );
683
684 const auto& tags = recv_copy_tags[mfi];
685 for (auto const & tag : tags)
686 {
687 CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
688 AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
689 {
690 auto const pfab = cmem->getSrc<value_type>();
691 auto const dfab = cmem->getDst<value_type>();
692 for (int n = 0; n < cmem->ncomp; ++n)
693 {
694 dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
695 }
696 });
697 }
698
699 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
700 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
701 if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
702 }
703 }
704
705 // Setup graph.
706 int idx = 0;
707 for (MFIter mfi(*this); mfi.isValid(); ++mfi)
708 {
709 auto dst_array = this->array(mfi);
710 const auto & tags = recv_copy_tags[mfi];
711 for (auto const & tag : tags)
712 {
713 const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
714 tag.dbox,
715 ncomp),
716 dst_array,
717 dcomp, ncomp));
718 }
719 }
720
721 // Launch Graph - synced because next action is freeing recv buffer.
722 TheFB.m_copyFromBuffer.executeGraph();
723}
724
725#endif /* __CUDACC__ */
726
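// Packs the send buffers on the device: each tagged source box is viewed as a contiguous
// Array4<BUF> inside the (pinned) send buffer and filled through fab_to_fab with CellStore. The
// disabled #if 0 block shows an alternative that stages through a device-side scratch buffer.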
727template <class FAB>
728template <typename BUF>
729void
730FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
731 Vector<char*> const& send_data,
732 Vector<std::size_t> const& send_size,
733 Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
734{
735 amrex::ignore_unused(send_size);
736
737 const int N_snds = send_data.size();
738 if (N_snds == 0) { return; }
739
740 char* pbuffer = send_data[0];
741 std::size_t szbuffer = 0;
742#if 0
743 // For linear solver test on summit, this is slower than writing to
744 // pinned memory directly on device.
745 if (! ParallelDescriptor::UseGpuAwareMpi()) {
746 // Memory in send_data is pinned.
747 szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1];
748 pbuffer = (char*)The_Arena()->alloc(szbuffer);
749 }
750#endif
751
752 using TagType = Array4CopyTag<BUF, value_type>;
753 Vector<TagType> snd_copy_tags;
754 for (int j = 0; j < N_snds; ++j)
755 {
756 if (send_size[j] > 0)
757 {
758 std::size_t offset = send_data[j]-send_data[0];
759 char* dptr = pbuffer + offset;
760 auto const& cctc = *send_cctc[j];
761 for (auto const& tag : cctc)
762 {
763 snd_copy_tags.emplace_back(TagType{
764 amrex::makeArray4((BUF*)(dptr), tag.sbox, ncomp),
765 src.array(tag.srcIndex),
766 tag.sbox,
767 Dim3{0,0,0}
768 });
769 dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
770 }
771 BL_ASSERT(dptr <= pbuffer + offset + send_size[j]);
772 }
773 }
774
775 detail::fab_to_fab<BUF, value_type>(snd_copy_tags, scomp, 0, ncomp,
776 detail::CellStore<BUF, value_type>());
777
778 // There is Gpu::streamSynchronize in fab_to_fab.
779
780 if (pbuffer != send_data[0]) {
781 Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]);
782 Gpu::streamSynchronize();
783 The_Arena()->free(pbuffer);
784 }
785}
786
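// Unpacks the receive buffers on the device. Each received box is viewed as an Array4<BUF const>
// and copied (CellStore) or accumulated (CellAdd) into the destination; when the operation is not
// thread safe, the atomic copy/add helpers are used instead, which themselves fall back to the
// masked path for types without the required hardware atomics.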
787template <class FAB>
788template <typename BUF>
789void
790FabArray<FAB>::unpack_recv_buffer_gpu (FabArray<FAB>& dst, int dcomp, int ncomp,
791 Vector<char*> const& recv_data,
792 Vector<std::size_t> const& recv_size,
793 Vector<CopyComTagsContainer const*> const& recv_cctc,
794 CpOp op, bool is_thread_safe)
795{
796 amrex::ignore_unused(recv_size);
797
798 const int N_rcvs = recv_cctc.size();
799 if (N_rcvs == 0) { return; }
800
801 char* pbuffer = recv_data[0];
802#if 0
803 std::size_t szbuffer = 0;
804 // For linear solver test on summit, this is slower than writing to
805 // pinned memory directly on device.
806 if (! ParallelDescriptor::UseGpuAwareMpi()) {
807 // Memory in recv_data is pinned.
808 szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1];
809 pbuffer = (char*)The_Arena()->alloc(szbuffer);
810 Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer);
811 Gpu::streamSynchronize();
812 }
813#endif
814
815 using TagType = Array4CopyTag<value_type, BUF>;
816 Vector<TagType> recv_copy_tags;
817 recv_copy_tags.reserve(N_rcvs);
818
819 Vector<BaseFab<int> > maskfabs;
820 Vector<Array4Tag<int> > masks;
821 if (!is_thread_safe)
822 {
823 if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
824 (op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))
825 {
826 maskfabs.resize(dst.local_size());
827 }
828 }
829
830 for (int k = 0; k < N_rcvs; ++k)
831 {
832 if (recv_size[k] > 0)
833 {
834 std::size_t offset = recv_data[k]-recv_data[0];
835 const char* dptr = pbuffer + offset;
836 auto const& cctc = *recv_cctc[k];
837 for (auto const& tag : cctc)
838 {
839 const int li = dst.localindex(tag.dstIndex);
840 recv_copy_tags.emplace_back(TagType{
841 dst.atLocalIdx(li).array(),
842 amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
843 tag.dbox,
844 Dim3{0,0,0}
845 });
846 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
847
848 if (maskfabs.size() > 0) {
849 if (!maskfabs[li].isAllocated()) {
850 maskfabs[li].resize(dst.atLocalIdx(li).box());
851 }
852 masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
853 }
854 }
855 BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
856 }
857 }
858
859 if (maskfabs.size() > 0) {
860 amrex::ParallelFor(masks,
861 [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
862 {
863 msk.dfab(i,j,k) = 0;
864 });
865 }
866
867 if (op == FabArrayBase::COPY)
868 {
869 if (is_thread_safe) {
870 detail::fab_to_fab<value_type, BUF>(
871 recv_copy_tags, 0, dcomp, ncomp, detail::CellStore<value_type, BUF>());
872 } else {
873 detail::fab_to_fab_atomic_cpy<value_type, BUF>(
874 recv_copy_tags, 0, dcomp, ncomp, masks);
875 }
876 }
877 else
878 {
879 if (is_thread_safe) {
880 detail::fab_to_fab<value_type, BUF>(
881 recv_copy_tags, 0, dcomp, ncomp, detail::CellAdd<value_type, BUF>());
882 } else {
883 detail::fab_to_fab_atomic_add<value_type, BUF>(
884 recv_copy_tags, 0, dcomp, ncomp, masks);
885 }
886 }
887
888 // There is Gpu::streamSynchronize in fab_to_fab.
889
890 if (pbuffer != recv_data[0]) {
891 The_Arena()->free(pbuffer);
892 }
893}
894
895#endif /* AMREX_USE_GPU */
896
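// Host-side pack/unpack. pack_send_buffer_cpu copies each source box into the send buffer with
// LoopConcurrentOnCpu (converting to BUF); unpack_recv_buffer_cpu either calls FAB::copyFromMem /
// addFromMem directly (thread-safe case) or first groups the received pieces per destination box.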
897template <class FAB>
898template <typename BUF>
899void
900FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
901 Vector<char*> const& send_data,
902 Vector<std::size_t> const& send_size,
903 Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
904{
905 amrex::ignore_unused(send_size);
906
907 auto const N_snds = static_cast<int>(send_data.size());
908 if (N_snds == 0) { return; }
909
910#ifdef AMREX_USE_OMP
911#pragma omp parallel for
912#endif
913 for (int j = 0; j < N_snds; ++j)
914 {
915 if (send_size[j] > 0)
916 {
917 char* dptr = send_data[j];
918 auto const& cctc = *send_cctc[j];
919 for (auto const& tag : cctc)
920 {
921 const Box& bx = tag.sbox;
922 auto const sfab = src.array(tag.srcIndex);
923 auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
924 amrex::LoopConcurrentOnCpu(bx, ncomp,
925 [=] (int ii, int jj, int kk, int n) noexcept
926 {
927 pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
928 });
929 dptr += (bx.numPts() * ncomp * sizeof(BUF));
930 }
931 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
932 }
933 }
934}
935
936template <class FAB>
937template <typename BUF>
938void
939FabArray<FAB>::unpack_recv_buffer_cpu (FabArray<FAB>& dst, int dcomp, int ncomp,
940 Vector<char*> const& recv_data,
941 Vector<std::size_t> const& recv_size,
942 Vector<CopyComTagsContainer const*> const& recv_cctc,
943 CpOp op, bool is_thread_safe)
944{
945 amrex::ignore_unused(recv_size);
946
947 auto const N_rcvs = static_cast<int>(recv_cctc.size());
948 if (N_rcvs == 0) { return; }
949
950 if (is_thread_safe)
951 {
952#ifdef AMREX_USE_OMP
953#pragma omp parallel for
954#endif
955 for (int k = 0; k < N_rcvs; ++k)
956 {
957 if (recv_size[k] > 0)
958 {
959 const char* dptr = recv_data[k];
960 auto const& cctc = *recv_cctc[k];
961 for (auto const& tag : cctc)
962 {
963 const Box& bx = tag.dbox;
964 FAB& dfab = dst[tag.dstIndex];
965 if (op == FabArrayBase::COPY)
966 {
967 dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
968 }
969 else
970 {
971 dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
972 }
973 dptr += bx.numPts() * ncomp * sizeof(BUF);
974 }
975 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
976 }
977 }
978 }
979 else
980 {
981 LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
982 recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
983 for (int k = 0; k < N_rcvs; ++k)
984 {
985 if (recv_size[k] > 0)
986 {
987 const char* dptr = recv_data[k];
988 auto const& cctc = *recv_cctc[k];
989 for (auto const& tag : cctc)
990 {
991 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
992 dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
993 }
994 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
995 }
996 }
997
998#ifdef AMREX_USE_OMP
999#pragma omp parallel
1000#endif
1001 for (MFIter mfi(dst); mfi.isValid(); ++mfi)
1002 {
1003 const auto& tags = recv_copy_tags[mfi];
1004 auto dfab = dst.array(mfi);
1005 for (auto const & tag : tags)
1006 {
1007 auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
1008 if (op == FabArrayBase::COPY)
1009 {
1010 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1011 [=] (int i, int j, int k, int n) noexcept
1012 {
1013 dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1014 });
1015 }
1016 else
1017 {
1018 amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1019 [=] (int i, int j, int k, int n) noexcept
1020 {
1021 dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
1022 });
1023 }
1024 }
1025 }
1026 }
1027}
1028
1029#endif /* AMREX_USE_MPI */
1030
1031#endif