Block-Structured AMR Software Framework
AMReX_FabArrayCommI.H
Go to the documentation of this file.
1 
2 #include <AMReX_FBI.H>
3 #include <AMReX_PCI.H>
4 
// -----------------------------------------------------------------------------
// FBEP_nowait: start a FillBoundary / EnforcePeriodicity ghost-cell exchange
// without waiting for completion.  Local copies are performed immediately
// (CPU, GPU, or CUDA-graph path); MPI receives and sends are posted and the
// in-flight state is recorded in 'fbd' so FillBoundary_finish() can complete
// the operation later.
//
// NOTE(review): this listing is a documentation extraction with its own line
// numbers embedded; a few original lines are missing (see notes below).
// Verify against the upstream AMReX source before editing.
// -----------------------------------------------------------------------------
5 template <class FAB>
6 template <typename BUF, class F, std::enable_if_t<IsBaseFab<F>::value,int>Z>
7 void
8 FabArray<FAB>::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost,
9  const Periodicity& period, bool cross,
10  bool enforce_periodicity_only,
11  bool override_sync)
12 {
13  BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: FB");
14  BL_PROFILE("FillBoundary_nowait()");
15 
// A previous nonblocking comm operation must have been finished first.
16  AMREX_ASSERT_WITH_MESSAGE(!fbd, "FillBoundary_nowait() called when comm operation already in progress.");
17  AMREX_ASSERT(!enforce_periodicity_only || !override_sync);
18 
// Decide whether any exchange is needed for the requested mode.
19  bool work_to_do;
20  if (enforce_periodicity_only) {
21  work_to_do = period.isAnyPeriodic();
22  } else if (override_sync) {
23  work_to_do = (nghost.max() > 0) || !is_cell_centered();
24  } else {
25  work_to_do = nghost.max() > 0;
26  }
27  if (!work_to_do) { return; }
28 
// Cached communication meta-data for this (nghost, period, ...) combination.
29  const FB& TheFB = getFB(nghost, period, cross, enforce_periodicity_only, override_sync);
30 
31  if (ParallelContext::NProcsSub() == 1)
32  {
33  //
34  // There can only be local work to do.
35  //
36  int N_locs = (*TheFB.m_LocTags).size();
37  if (N_locs == 0) { return; }
38 #ifdef AMREX_USE_GPU
39  if (Gpu::inLaunchRegion())
40  {
41 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
42  if (Gpu::inGraphRegion())
43  {
44  FB_local_copy_cuda_graph_1(TheFB, scomp, ncomp);
45  }
46  else
47 #endif
48  {
49  FB_local_copy_gpu(TheFB, scomp, ncomp);
50  }
51  }
52  else
53 #endif
54  {
55  FB_local_copy_cpu(TheFB, scomp, ncomp);
56  }
57 
58  return;
59  }
60 
61 #ifdef BL_USE_MPI
62 
63  //
64  // Do this before prematurely exiting if running in parallel.
65  // Otherwise sequence numbers will not match across MPI processes.
66  //
// NOTE(review): original line 67 is missing from this extraction.  SeqNum is
// used below but never declared here -- presumably that line obtained the MPI
// sequence number (ParallelDescriptor::SeqNum()); confirm against upstream.
68 
69  const int N_locs = TheFB.m_LocTags->size();
70  const int N_rcvs = TheFB.m_RcvTags->size();
71  const int N_snds = TheFB.m_SndTags->size();
72 
73  if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) {
74  // No work to do.
75  return;
76  }
77 
// Record the in-flight operation so FillBoundary_finish() can complete it.
78  fbd = std::make_unique<FBData<FAB>>();
79  fbd->fb = &TheFB;
80  fbd->scomp = scomp;
81  fbd->ncomp = ncomp;
82  fbd->tag = SeqNum;
83 
84  //
85  // Post rcvs. Allocate one chunk of space to hold'm all.
86  //
87 
88  if (N_rcvs > 0) {
89  PostRcvs<BUF>(*TheFB.m_RcvTags, fbd->the_recv_data,
90  fbd->recv_data, fbd->recv_size, fbd->recv_from, fbd->recv_reqs,
91  ncomp, SeqNum);
92  fbd->recv_stat.resize(N_rcvs);
93  }
94 
95  //
96  // Post send's
97  //
98  char*& the_send_data = fbd->the_send_data;
99  Vector<char*> & send_data = fbd->send_data;
100  Vector<std::size_t> send_size;
101  Vector<int> send_rank;
102  Vector<MPI_Request>& send_reqs = fbd->send_reqs;
// NOTE(review): original line 103 is missing from this extraction.  send_cctc
// is used below but never declared here -- presumably a
// Vector<const CopyComTagsContainer*>; confirm against upstream.
104 
105  if (N_snds > 0)
106  {
107  PrepareSendBuffers<BUF>(*TheFB.m_SndTags, the_send_data, send_data, send_size, send_rank,
108  send_reqs, send_cctc, ncomp);
109 
110 #ifdef AMREX_USE_GPU
111  if (Gpu::inLaunchRegion())
112  {
113 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
114  if (Gpu::inGraphRegion()) {
115  FB_pack_send_buffer_cuda_graph(TheFB, scomp, ncomp, send_data, send_size, send_cctc);
116  }
117  else
118 #endif
119  {
120  pack_send_buffer_gpu<BUF>(*this, scomp, ncomp, send_data, send_size, send_cctc);
121  }
122  }
123  else
124 #endif
125  {
126  pack_send_buffer_cpu<BUF>(*this, scomp, ncomp, send_data, send_size, send_cctc);
127  }
128 
129  AMREX_ASSERT(send_reqs.size() == N_snds);
130  PostSnds(send_data, send_size, send_rank, send_reqs, SeqNum);
131  }
132 
// Give MPI a chance to make progress on the posted requests.
133  FillBoundary_test();
134 
135  //
136  // Do the local work. Hope for a bit of communication/computation overlap.
137  //
138  if (N_locs > 0)
139  {
140 #ifdef AMREX_USE_GPU
141  if (Gpu::inLaunchRegion())
142  {
143 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
144  if (Gpu::inGraphRegion()) {
145  FB_local_copy_cuda_graph_n(TheFB, scomp, ncomp);
146  }
147  else
148 #endif
149  {
150  FB_local_copy_gpu(TheFB, scomp, ncomp);
151  }
152  }
153  else
154 #endif
155  {
156  FB_local_copy_cpu(TheFB, scomp, ncomp);
157  }
158 
159  FillBoundary_test();
160  }
161 
162 #endif /*BL_USE_MPI*/
163 }
164 
// -----------------------------------------------------------------------------
// FillBoundary_finish: complete a FillBoundary started by FBEP_nowait --
// wait for the posted receives, unpack them into ghost cells, wait for the
// sends, then free the comm buffers and reset the in-flight state 'fbd'.
//
// NOTE(review): original line 168 (the function-name line, presumably
// 'FabArray<FAB>::FillBoundary_finish ()' -- cf. the BL_PROFILE label below)
// and line 173 are missing from this extraction; confirm against upstream.
// -----------------------------------------------------------------------------
165 template <class FAB>
166 template <typename BUF, class F, std::enable_if_t<IsBaseFab<F>::value,int>Z>
167 void
169 {
170 #ifdef AMREX_USE_MPI
171 
172  BL_PROFILE("FillBoundary_finish()");
174 
// Nothing in flight: record that no ghost cells were filled and return.
175  if (!fbd) { n_filled = IntVect::TheZeroVector(); return; }
176 
177  const FB* TheFB = fbd->fb;
178  const auto N_rcvs = static_cast<int>(TheFB->m_RcvTags->size());
179  if (N_rcvs > 0)
180  {
// Map each posted receive back to its copy-tag container (skip empty ones).
181  Vector<const CopyComTagsContainer*> recv_cctc(N_rcvs,nullptr);
182  for (int k = 0; k < N_rcvs; k++)
183  {
184  if (fbd->recv_size[k] > 0)
185  {
186  auto const& cctc = TheFB->m_RcvTags->at(fbd->recv_from[k]);
187  recv_cctc[k] = &cctc;
188  }
189  }
190 
// Receives with a null data pointer were never posted; don't wait on them.
191  int actual_n_rcvs = N_rcvs - std::count(fbd->recv_data.begin(), fbd->recv_data.end(), nullptr);
192 
193  if (actual_n_rcvs > 0) {
194  ParallelDescriptor::Waitall(fbd->recv_reqs, fbd->recv_stat);
195 #ifdef AMREX_DEBUG
196  if (!CheckRcvStats(fbd->recv_stat, fbd->recv_size, fbd->tag))
197  {
198  amrex::Abort("FillBoundary_finish failed with wrong message size");
199  }
200 #endif
201  }
202 
203  bool is_thread_safe = TheFB->m_threadsafe_rcv;
204 
205 #ifdef AMREX_USE_GPU
206  if (Gpu::inLaunchRegion())
207  {
208 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
209  if (Gpu::inGraphRegion())
210  {
211  FB_unpack_recv_buffer_cuda_graph(*TheFB, fbd->scomp, fbd->ncomp,
212  fbd->recv_data, fbd->recv_size,
213  recv_cctc, is_thread_safe);
214  }
215  else
216 #endif
217  {
218  unpack_recv_buffer_gpu<BUF>(*this, fbd->scomp, fbd->ncomp, fbd->recv_data, fbd->recv_size,
219  recv_cctc, FabArrayBase::COPY, is_thread_safe);
220  }
221  }
222  else
223 #endif
224  {
225  unpack_recv_buffer_cpu<BUF>(*this, fbd->scomp, fbd->ncomp, fbd->recv_data, fbd->recv_size,
226  recv_cctc, FabArrayBase::COPY, is_thread_safe);
227  }
228 
229  if (fbd->the_recv_data)
230  {
231  amrex::The_Comms_Arena()->free(fbd->the_recv_data);
232  fbd->the_recv_data = nullptr;
233  }
234  }
235 
// Wait for all sends to complete before freeing the shared send buffer.
236  const auto N_snds = static_cast<int>(TheFB->m_SndTags->size());
237  if (N_snds > 0) {
238  Vector<MPI_Status> stats(fbd->send_reqs.size());
239  ParallelDescriptor::Waitall(fbd->send_reqs, stats);
240  amrex::The_Comms_Arena()->free(fbd->the_send_data);
241  fbd->the_send_data = nullptr;
242  }
243 
244  fbd.reset();
245 
246 #endif
247 }
248 
249 // \cond CODEGEN
// ParallelCopy: blocking parallel copy (or add) from 'src' into this FabArray.
// NOTE(review): original line 252 (the signature line, presumably
// 'FabArray<FAB>::ParallelCopy (const FabArray<FAB>& src,') and line 265
// (presumably the matching 'ParallelCopy_finish();' that makes this call
// blocking) are missing from this extraction; confirm against upstream.
250 template <class FAB>
251 void
253  int scomp,
254  int dcomp,
255  int ncomp,
256  const IntVect& snghost,
257  const IntVect& dnghost,
258  const Periodicity& period,
259  CpOp op,
260  const FabArrayBase::CPC * a_cpc)
261 {
262  BL_PROFILE("FabArray::ParallelCopy()");
263 
264  ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period, op, a_cpc);
266 }
267 
// ParallelCopyToGhost: blocking parallel copy from 'src' into this FabArray's
// ghost cells only (to_ghost_cells_only = true in ParallelCopy_nowait).
// NOTE(review): original line 282 (presumably 'ParallelCopy_finish();', the
// blocking completion step) is missing from this extraction.
268 template <class FAB>
269 void
270 FabArray<FAB>::ParallelCopyToGhost (const FabArray<FAB>& src,
271  int scomp,
272  int dcomp,
273  int ncomp,
274  const IntVect& snghost,
275  const IntVect& dnghost,
276  const Periodicity& period)
277 {
278  BL_PROFILE("FabArray::ParallelCopyToGhost()");
279 
280  ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period,
281  FabArrayBase::COPY, nullptr, true);
283 }
284 
285 template <class FAB>
286 void
287 FabArray<FAB>::ParallelCopyToGhost_nowait (const FabArray<FAB>& src,
288  int scomp,
289  int dcomp,
290  int ncomp,
291  const IntVect& snghost,
292  const IntVect& dnghost,
293  const Periodicity& period)
294 {
295  ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period,
296  FabArrayBase::COPY, nullptr, true);
297 }
298 
// ParallelCopyToGhost_finish: complete a ParallelCopyToGhost_nowait.
// NOTE(review): original line 303 (the entire body, presumably
// 'ParallelCopy_finish();') is missing from this extraction; as shown the
// function body is empty.  Confirm against upstream.
299 template <class FAB>
300 void
301 FabArray<FAB>::ParallelCopyToGhost_finish ()
302 {
304 }
305 
306 
// -----------------------------------------------------------------------------
// ParallelCopy_nowait: start a nonblocking parallel copy/add from 'src' into
// this FabArray.  Fast paths handle the single-process/single-box case and the
// identical-layout case; otherwise MPI receives/sends are posted in batches of
// at most FabArrayBase::MaxComp components, with state tracked in 'pcd' for
// ParallelCopy_finish().
//
// NOTE(review): this documentation extraction is missing original lines 360,
// 380, 386 and 536 (see notes below); confirm against upstream before editing.
// -----------------------------------------------------------------------------
307 template <class FAB>
308 void
309 FabArray<FAB>::ParallelCopy_nowait (const FabArray<FAB>& src,
310  int scomp,
311  int dcomp,
312  int ncomp,
313  const IntVect& snghost,
314  const IntVect& dnghost,
315  const Periodicity& period,
316  CpOp op,
317  const FabArrayBase::CPC * a_cpc,
318  bool to_ghost_cells_only)
319 {
320  BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: PC");
321  BL_PROFILE("FabArray::ParallelCopy_nowait()");
322 
323  AMREX_ASSERT_WITH_MESSAGE(!pcd, "ParallelCopy_nowait() called when comm operation already in progress.");
324 
325  if (empty() || src.empty()) {
326  return;
327  }
328 
329  BL_ASSERT(op == FabArrayBase::COPY || op == FabArrayBase::ADD);
330  BL_ASSERT(boxArray().ixType() == src.boxArray().ixType());
331  BL_ASSERT(src.nGrowVect().allGE(snghost));
332  BL_ASSERT( nGrowVect().allGE(dnghost));
333 
334  n_filled = dnghost;
335 
// Fast path: one rank, one box on each side, no periodicity -- do a direct
// intersection copy/add without any communication meta-data.
336  if ((ParallelDescriptor::NProcs() == 1) &&
337  (this->size() == 1) && (src.size() == 1) &&
338  !period.isAnyPeriodic() && !to_ghost_cells_only)
339  {
340  if (this != &src) { // avoid self copy or plus
341  auto const& da = this->array(0, dcomp);
342  auto const& sa = src.const_array(0, scomp);
343  Box box = amrex::grow(src.box(0),snghost)
344  & amrex::grow(this->box(0),dnghost);
345  if (op == FabArrayBase::COPY) {
346 #ifdef AMREX_USE_GPU
347  ParallelFor(box, ncomp,
348  [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) {
349  da(i,j,k,n) = sa(i,j,k,n);
350  });
351 #else
352  auto const& lo = amrex::lbound(box);
353  auto const& hi = amrex::ubound(box);
354 #ifdef AMREX_USE_OMP
355 #pragma omp parallel for collapse(3)
356 #endif
357  for (int n = 0; n < ncomp; ++n) {
358  for (int k = lo.z; k <= hi.z; ++k) {
359  for (int j = lo.y; j <= hi.y; ++j) {
// NOTE(review): original line 360 is missing here (likely AMREX_PRAGMA_SIMD).
361  for (int i = lo.x; i <= hi.x; ++i) {
362  da(i,j,k,n) = sa(i,j,k,n);
363  }}}}
364 #endif
365  } else {
366 #ifdef AMREX_USE_GPU
367  ParallelFor(box, ncomp,
368  [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) {
369  da(i,j,k,n) += sa(i,j,k,n);
370  });
371 #else
372  auto const& lo = amrex::lbound(box);
373  auto const& hi = amrex::ubound(box);
374 #ifdef AMREX_USE_OMP
375 #pragma omp parallel for collapse(3)
376 #endif
377  for (int n = 0; n < ncomp; ++n) {
378  for (int k = lo.z; k <= hi.z; ++k) {
379  for (int j = lo.y; j <= hi.y; ++j) {
// NOTE(review): original line 380 is missing here (likely AMREX_PRAGMA_SIMD).
381  for (int i = lo.x; i <= hi.x; ++i) {
382  da(i,j,k,n) += sa(i,j,k,n);
383  }}}}
384 #endif
385  }
// NOTE(review): original line 386 is missing from this extraction.
387  }
388  return;
389  }
390 
391  if ((src.boxArray().ixType().cellCentered() || op == FabArrayBase::COPY) &&
392  (boxarray == src.boxarray && distributionMap == src.distributionMap) &&
393  snghost == IntVect::TheZeroVector() &&
394  dnghost == IntVect::TheZeroVector() &&
395  !period.isAnyPeriodic() && !to_ghost_cells_only)
396  {
397  //
398  // Short-circuit full intersection code if we're doing copy()s or if
399  // we're doing plus()s on cell-centered data. Don't do plus()s on
400  // non-cell-centered data this simplistic way.
401  //
402  if (this != &src) { // avoid self copy or plus
403  if (op == FabArrayBase::COPY) {
404  Copy(*this, src, scomp, dcomp, ncomp, IntVect(0));
405  } else {
406  Add(*this, src, scomp, dcomp, ncomp, IntVect(0));
407  }
408  }
409  return;
410  }
411 
// Use the caller's CPC if given, otherwise the cached one for this pairing.
412  const CPC& thecpc = (a_cpc) ? *a_cpc : getCPC(dnghost, src, snghost, period, to_ghost_cells_only);
413 
414  if (ParallelContext::NProcsSub() == 1)
415  {
416  //
417  // There can only be local work to do.
418  //
419 
420  int N_locs = (*thecpc.m_LocTags).size();
421  if (N_locs == 0) { return; }
422 #ifdef AMREX_USE_GPU
423  if (Gpu::inLaunchRegion())
424  {
425  PC_local_gpu(thecpc, src, scomp, dcomp, ncomp, op);
426  }
427  else
428 #endif
429  {
430  PC_local_cpu(thecpc, src, scomp, dcomp, ncomp, op);
431  }
432 
433  return;
434  }
435 
436 #ifdef BL_USE_MPI
437 
438  //
439  // Do this before prematurely exiting if running in parallel.
440  // Otherwise sequence numbers will not match across MPI processes.
441  //
442  int tag = ParallelDescriptor::SeqNum();
443 
444  const int N_snds = thecpc.m_SndTags->size();
445  const int N_rcvs = thecpc.m_RcvTags->size();
446  const int N_locs = thecpc.m_LocTags->size();
447 
448  if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) {
449  //
450  // No work to do.
451  //
452 
453  return;
454  }
455 
456  //
457  // Send/Recv at most MaxComp components at a time to cut down memory usage.
458  //
459  int NCompLeft = ncomp;
460  int SC = scomp, DC = dcomp, NC;
461 
462  for (int ipass = 0; ipass < ncomp; )
463  {
// Fresh in-flight state for this batch of components.
464  pcd = std::make_unique<PCData<FAB>>();
465  pcd->cpc = &thecpc;
466  pcd->src = &src;
467  pcd->op = op;
468  pcd->tag = tag;
469 
470  NC = std::min(NCompLeft,FabArrayBase::MaxComp);
471  const bool last_iter = (NCompLeft == NC);
472 
473  pcd->SC = SC;
474  pcd->DC = DC;
475  pcd->NC = NC;
476 
477  //
478  // Post rcvs. Allocate one chunk of space to hold'm all.
479  //
480  pcd->the_recv_data = nullptr;
481 
482  pcd->actual_n_rcvs = 0;
483  if (N_rcvs > 0) {
484  PostRcvs(*thecpc.m_RcvTags, pcd->the_recv_data,
485  pcd->recv_data, pcd->recv_size, pcd->recv_from, pcd->recv_reqs, NC, pcd->tag);
486  pcd->actual_n_rcvs = N_rcvs - std::count(pcd->recv_size.begin(), pcd->recv_size.end(), 0);
487  }
488 
489  //
490  // Post send's
491  //
492  Vector<char*> send_data;
493  Vector<std::size_t> send_size;
494  Vector<int> send_rank;
495  Vector<const CopyComTagsContainer*> send_cctc;
496 
497  if (N_snds > 0)
498  {
499  src.PrepareSendBuffers(*thecpc.m_SndTags, pcd->the_send_data, send_data, send_size,
500  send_rank, pcd->send_reqs, send_cctc, NC);
501 
502 #ifdef AMREX_USE_GPU
503  if (Gpu::inLaunchRegion())
504  {
505  pack_send_buffer_gpu(src, SC, NC, send_data, send_size, send_cctc);
506  }
507  else
508 #endif
509  {
510  pack_send_buffer_cpu(src, SC, NC, send_data, send_size, send_cctc);
511  }
512 
513  AMREX_ASSERT(pcd->send_reqs.size() == N_snds);
514  FabArray<FAB>::PostSnds(send_data, send_size, send_rank, pcd->send_reqs, pcd->tag);
515  }
516 
517  //
518  // Do the local work. Hope for a bit of communication/computation overlap.
519  //
520  if (N_locs > 0)
521  {
522 #ifdef AMREX_USE_GPU
523  if (Gpu::inLaunchRegion())
524  {
525  PC_local_gpu(thecpc, src, SC, DC, NC, op);
526  }
527  else
528 #endif
529  {
530  PC_local_cpu(thecpc, src, SC, DC, NC, op);
531  }
532  }
533 
534  if (!last_iter)
535  {
// NOTE(review): original line 536 is missing from this extraction --
// presumably 'ParallelCopy_finish();' so each non-final component batch is
// completed before the next one is posted.  Confirm against upstream.
537 
538  SC += NC;
539  DC += NC;
540  }
541 
542  ipass += NC;
543  NCompLeft -= NC;
544  }
545 
546 #endif /*BL_USE_MPI*/
547 }
548 
// -----------------------------------------------------------------------------
// ParallelCopy_finish: complete a ParallelCopy_nowait -- wait for receives,
// unpack them with the recorded op (COPY/ADD), wait for sends, free the comm
// buffers, and reset the in-flight state 'pcd'.
//
// NOTE(review): original line 551 (the function-name line, presumably
// 'FabArray<FAB>::ParallelCopy_finish ()' -- cf. the BL_PROFILE label below)
// and line 557 are missing from this extraction; confirm against upstream.
// -----------------------------------------------------------------------------
549 template <class FAB>
550 void
552 {
553 
554 #ifdef BL_USE_MPI
555 
556  BL_PROFILE("FabArray::ParallelCopy_finish()");
558 
// Nothing in flight.
559  if (!pcd) { return; }
560 
561  const CPC* thecpc = pcd->cpc;
562 
563  const auto N_snds = static_cast<int>(thecpc->m_SndTags->size());
564  const auto N_rcvs = static_cast<int>(thecpc->m_RcvTags->size());
565 
566  if (N_rcvs > 0)
567  {
// Map each posted receive back to its copy-tag container (skip empty ones).
568  Vector<const CopyComTagsContainer*> recv_cctc(N_rcvs,nullptr);
569  for (int k = 0; k < N_rcvs; ++k)
570  {
571  if (pcd->recv_size[k] > 0)
572  {
573  auto const& cctc = thecpc->m_RcvTags->at(pcd->recv_from[k]);
574  recv_cctc[k] = &cctc;
575  }
576  }
577 
578  if (pcd->actual_n_rcvs > 0) {
579  Vector<MPI_Status> stats(N_rcvs);
580  ParallelDescriptor::Waitall(pcd->recv_reqs, stats);
581 #ifdef AMREX_DEBUG
582  if (!CheckRcvStats(stats, pcd->recv_size, pcd->tag))
583  {
584  amrex::Abort("ParallelCopy failed with wrong message size");
585  }
586 #endif
587  }
588 
589  bool is_thread_safe = thecpc->m_threadsafe_rcv;
590 
591 #ifdef AMREX_USE_GPU
592  if (Gpu::inLaunchRegion())
593  {
594  unpack_recv_buffer_gpu(*this, pcd->DC, pcd->NC, pcd->recv_data, pcd->recv_size,
595  recv_cctc, pcd->op, is_thread_safe);
596  }
597  else
598 #endif
599  {
600  unpack_recv_buffer_cpu(*this, pcd->DC, pcd->NC, pcd->recv_data, pcd->recv_size,
601  recv_cctc, pcd->op, is_thread_safe);
602  }
603 
604  if (pcd->the_recv_data)
605  {
606  amrex::The_Comms_Arena()->free(pcd->the_recv_data);
607  pcd->the_recv_data = nullptr;
608  }
609  }
610 
// Wait for sends to complete before freeing the shared send buffer.
611  if (N_snds > 0) {
612  if (! thecpc->m_SndTags->empty()) {
613  Vector<MPI_Status> stats(pcd->send_reqs.size());
614  ParallelDescriptor::Waitall(pcd->send_reqs, stats);
615  }
616  amrex::The_Comms_Arena()->free(pcd->the_send_data);
617  pcd->the_send_data = nullptr;
618  }
619 
620  pcd.reset();
621 
622 #endif /*BL_USE_MPI*/
623 }
624 
// -----------------------------------------------------------------------------
// copyTo: gather this (possibly distributed) FabArray into a single FAB
// 'dest' that every rank holds.  The data is first ParallelCopy'd into a
// one-box FabArray aliasing 'dest' on the root rank.
//
// NOTE(review): original lines 655, 658-659 and 663 are missing from this
// extraction.  Based on what remains, they presumably contained a GPU stream
// synchronize after the device-to-host copy, the broadcast of the gathered
// data from root_proc to all ranks (which is the whole point of the MPI
// section below), and the release of the pinned staging buffer 'pb'.
// Confirm against upstream before relying on this listing.
// -----------------------------------------------------------------------------
625 template <class FAB>
626 void
627 FabArray<FAB>::copyTo (FAB& dest, int scomp, int dcomp, int ncomp, int nghost) const
628 {
629  BL_PROFILE("FabArray::copy(fab)");
630 
631  BL_ASSERT(dcomp + ncomp <= dest.nComp());
632  BL_ASSERT(IntVect(nghost).allLE(nGrowVect()));
633 
// The rank owning box 0 acts as the gather root.
634  int root_proc = this->DistributionMap()[0];
635 
// One-box FabArray living on root_proc only, aliasing 'dest' there.
636  BoxArray ba(dest.box());
637  DistributionMapping dm(Vector<int>{root_proc});
638  FabArray<FAB> destmf(ba, dm, ncomp, 0, MFInfo().SetAlloc(false));
639  if (ParallelDescriptor::MyProc() == root_proc) {
640  destmf.setFab(0, FAB(dest, amrex::make_alias, dcomp, ncomp));
641  }
642 
643  destmf.ParallelCopy(*this, scomp, 0, ncomp, nghost, 0);
644 
645 #ifdef BL_USE_MPI
646  using T = typename FAB::value_type;
647  if (ParallelContext::NProcsSub() > 1) {
648  Long count = dest.numPts()*ncomp;
649  T* const p0 = dest.dataPtr(dcomp);
650  T* pb = p0;
651 #ifdef AMREX_USE_GPU
// Stage through pinned host memory when 'dest' lives in device memory.
652  if (dest.arena()->isDevice()) {
653  pb = (T*)The_Pinned_Arena()->alloc(sizeof(T)*count);
654  Gpu::dtoh_memcpy_async(pb, p0, sizeof(T)*count);
656  }
657 #endif
660 #ifdef AMREX_USE_GPU
// Copy the (broadcast) host data back to the device buffer.
661  if (pb != p0) {
662  Gpu::htod_memcpy_async(p0, pb, sizeof(T)*count);
664  }
665 #endif
666  }
667 #endif
668 }
669 // \endcond
670 #ifdef BL_USE_MPI
// Owning-pointer overload of PrepareSendBuffers: delegates to the raw-pointer
// overload and wraps the allocated send buffer for automatic release.
// NOTE(review): original line 673 (the return-type line, presumably
// 'TheFaArenaPointer' -- cf. the return statement below) is missing from
// this extraction; confirm against upstream.
671 template <class FAB>
672 template <typename BUF>
674 FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,
675  Vector<char*>& send_data,
676  Vector<std::size_t>& send_size,
677  Vector<int>& send_rank,
678  Vector<MPI_Request>& send_reqs,
679  Vector<const CopyComTagsContainer*>& send_cctc,
680  int ncomp)
681 {
682  char* pointer = nullptr;
683  PrepareSendBuffers<BUF>(SndTags, pointer, send_data, send_size, send_rank, send_reqs, send_cctc, ncomp);
684  return TheFaArenaPointer(pointer);
685 }
686 
687 template <class FAB>
688 template <typename BUF>
689 void
690 FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,
691  char*& the_send_data,
692  Vector<char*>& send_data,
693  Vector<std::size_t>& send_size,
694  Vector<int>& send_rank,
695  Vector<MPI_Request>& send_reqs,
696  Vector<const CopyComTagsContainer*>& send_cctc,
697  int ncomp)
698 {
699  send_data.clear();
700  send_size.clear();
701  send_rank.clear();
702  send_reqs.clear();
703  send_cctc.clear();
704  const auto N_snds = SndTags.size();
705  if (N_snds == 0) { return; }
706  send_data.reserve(N_snds);
707  send_size.reserve(N_snds);
708  send_rank.reserve(N_snds);
709  send_reqs.reserve(N_snds);
710  send_cctc.reserve(N_snds);
711 
712  Vector<std::size_t> offset; offset.reserve(N_snds);
713  std::size_t total_volume = 0;
714  for (auto const& kv : SndTags)
715  {
716  auto const& cctc = kv.second;
717 
718  std::size_t nbytes = 0;
719  for (auto const& cct : kv.second)
720  {
721  nbytes += cct.sbox.numPts() * ncomp * sizeof(BUF);
722  }
723 
724  std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
725  nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned
726 
727  // Also need to align the offset properly
728  total_volume = amrex::aligned_size(std::max(alignof(BUF), acd),
729  total_volume);
730 
731  offset.push_back(total_volume);
732  total_volume += nbytes;
733 
734  send_data.push_back(nullptr);
735  send_size.push_back(nbytes);
736  send_rank.push_back(kv.first);
737  send_reqs.push_back(MPI_REQUEST_NULL);
738  send_cctc.push_back(&cctc);
739  }
740 
741  if (total_volume > 0)
742  {
743  the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume));
744  for (int i = 0, N = static_cast<int>(send_size.size()); i < N; ++i) {
745  send_data[i] = the_send_data + offset[i];
746  }
747  } else {
748  the_send_data = nullptr;
749  }
750 }
751 
// PostSnds: post a nonblocking MPI send for every non-empty prepared message.
// NOTE(review): original line 760 is missing from this extraction.  'comm' is
// used below but never declared here -- presumably that line obtained the MPI
// communicator (ParallelContext::CommunicatorSub()); confirm against upstream.
752 template <class FAB>
753 void
754 FabArray<FAB>::PostSnds (Vector<char*> const& send_data,
755  Vector<std::size_t> const& send_size,
756  Vector<int> const& send_rank,
757  Vector<MPI_Request>& send_reqs,
758  int SeqNum)
759 {
761 
762  const auto N_snds = static_cast<int>(send_reqs.size());
763  for (int j = 0; j < N_snds; ++j)
764  {
// Zero-size messages are skipped; their request stays MPI_REQUEST_NULL.
765  if (send_size[j] > 0) {
766  const int rank = ParallelContext::global_to_local_rank(send_rank[j]);
767  send_reqs[j] = ParallelDescriptor::Asend
768  (send_data[j], send_size[j], rank, SeqNum, comm).req();
769  }
770  }
771 }
772 
773 template <class FAB>
774 template <typename BUF>
775 TheFaArenaPointer FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,
776  Vector<char*>& recv_data,
777  Vector<std::size_t>& recv_size,
778  Vector<int>& recv_from,
779  Vector<MPI_Request>& recv_reqs,
780  int ncomp,
781  int SeqNum)
782 {
783  char* pointer = nullptr;
784  PostRcvs(RcvTags, pointer, recv_data, recv_size, recv_from, recv_reqs, ncomp, SeqNum);
785  return TheFaArenaPointer(pointer);
786 }
787 
// -----------------------------------------------------------------------------
// PostRcvs: size one contiguous receive buffer for all incoming messages in
// RcvTags (aligned per message like PrepareSendBuffers) and post a
// nonblocking MPI receive for every non-empty message.
//
// NOTE(review): original line 833 is missing from this extraction.  'comm' is
// used below but never declared here -- presumably that line obtained the MPI
// communicator (ParallelContext::CommunicatorSub()); confirm against upstream.
// -----------------------------------------------------------------------------
788 template <class FAB>
789 template <typename BUF>
790 void
791 FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,
792  char*& the_recv_data,
793  Vector<char*>& recv_data,
794  Vector<std::size_t>& recv_size,
795  Vector<int>& recv_from,
796  Vector<MPI_Request>& recv_reqs,
797  int ncomp,
798  int SeqNum)
799 {
800  recv_data.clear();
801  recv_size.clear();
802  recv_from.clear();
803  recv_reqs.clear();
804 
805  Vector<std::size_t> offset;
806  std::size_t TotalRcvsVolume = 0;
807  for (const auto& kv : RcvTags) // loop over senders
808  {
809  std::size_t nbytes = 0;
810  for (auto const& cct : kv.second)
811  {
812  nbytes += cct.dbox.numPts() * ncomp * sizeof(BUF);
813  }
814 
815  std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
816  nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned
817 
818  // Also need to align the offset properly
819  TotalRcvsVolume = amrex::aligned_size(std::max(alignof(BUF),acd),
820  TotalRcvsVolume);
821 
822  offset.push_back(TotalRcvsVolume);
823  TotalRcvsVolume += nbytes;
824 
825  recv_data.push_back(nullptr);
826  recv_size.push_back(nbytes);
827  recv_from.push_back(kv.first);
828  recv_reqs.push_back(MPI_REQUEST_NULL);
829  }
830 
831  const auto nrecv = static_cast<int>(recv_from.size());
832 
834 
835  if (TotalRcvsVolume == 0)
836  {
837  the_recv_data = nullptr;
838  }
839  else
840  {
841  the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
842 
843  for (int i = 0; i < nrecv; ++i)
844  {
845  recv_data[i] = the_recv_data + offset[i];
// Zero-size messages are skipped; their request stays MPI_REQUEST_NULL.
846  if (recv_size[i] > 0)
847  {
848  const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
849  recv_reqs[i] = ParallelDescriptor::Arecv
850  (recv_data[i], recv_size[i], rank, SeqNum, comm).req();
851  }
852  }
853  }
854 }
855 #endif
856 
// -----------------------------------------------------------------------------
// Redistribute: copy 'src' (same BoxArray, possibly different
// DistributionMapping) into this FabArray.
//
// NOTE(review): this extraction is missing original line 859 (the signature
// line, presumably 'FabArray<FAB>::Redistribute (const FabArray<FAB>& src,'),
// line 865 (the first half of the AMREX_ALWAYS_ASSERT_WITH_MESSAGE whose
// message string remains below), and line 876 (the construction of 'cpc',
// which is passed to ParallelCopy but never declared in this view).
// Confirm against upstream.
// -----------------------------------------------------------------------------
857 template <class FAB>
858 void
860  int scomp,
861  int dcomp,
862  int ncomp,
863  const IntVect& nghost)
864 {
866  "FabArray::Redistribute: must have the same BoxArray");
867 
// Single rank: plain local copy, no communication needed.
868  if (ParallelContext::NProcsSub() == 1)
869  {
870  Copy(*this, src, scomp, dcomp, ncomp, nghost);
871  return;
872  }
873 
874 #ifdef BL_USE_MPI
875 
877 
878  ParallelCopy(src, scomp, dcomp, ncomp, nghost, nghost, Periodicity::NonPeriodic(),
879  FabArrayBase::COPY, &cpc);
880 
881 #endif
882 }
883 
// FillBoundary_test: poke MPI's progress engine on the pending receives of
// the in-flight FillBoundary ('fbd'); the completion flag is ignored.
// NOTE(review): original line 886 (the function-name line, presumably
// 'FabArray<FAB>::FillBoundary_test ()') is missing from this extraction.
884 template <class FAB>
885 void
887 {
888 #if defined(AMREX_USE_MPI) && !defined(AMREX_DEBUG)
889  // We only test if no DEBUG because in DEBUG we check the status later.
890  // If Test is done here, the status check will fail.
891  int flag;
892  ParallelDescriptor::Test(fbd->recv_reqs, flag, fbd->recv_stat);
893 #endif
894 }
895 
// detail::fbv_copy: execute a batch of box-copy tags, each copying all
// components from tag.sfab (shifted by tag.offset) into tag.dfab over
// tag.dbox.  Runs as a single fused GPU kernel when in a GPU launch region,
// otherwise as an OpenMP-parallel loop over tags on the CPU.
896 namespace detail {
897 template <class TagT>
898 void fbv_copy (Vector<TagT> const& tags)
899 {
900  const int N = tags.size();
901  if (N == 0) { return; }
902 #ifdef AMREX_USE_GPU
903  if (Gpu::inLaunchRegion()) {
904  ParallelFor(tags, 1,
905  [=] AMREX_GPU_DEVICE (int i, int j, int k, int, TagT const& tag) noexcept
906  {
907  const int ncomp = tag.dfab.nComp();
908  for (int n = 0; n < ncomp; ++n) {
909  tag.dfab(i,j,k,n) = tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n);
910  }
911  });
912  } else
913 #endif
914  {
915 #ifdef AMREX_USE_OMP
916 #pragma omp parallel for
917 #endif
918  for (int itag = 0; itag < N; ++itag) {
919  auto const& tag = tags[itag];
920  const int ncomp = tag.dfab.nComp();
921  AMREX_LOOP_4D(tag.dbox, ncomp, i, j, k, n,
922  {
923  tag.dfab(i,j,k,n) = tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n);
924  });
925  }
926  }
927 }
928 }
929 
// -----------------------------------------------------------------------------
// FillBoundary (vector version): fill ghost cells of several FabArrays.
// The active '#if 1' branch simply issues FillBoundary_nowait for each and
// then finishes them all, overlapping their communication.  The disabled
// '#else' branch is a fused implementation that aggregates the local copies,
// sends, and receives of all FabArrays into single batched messages.
//
// NOTE(review): original lines 996-997 inside the disabled branch are missing
// from this extraction.  SeqNum and comm are used below but never declared --
// presumably those lines obtained the MPI sequence number and the
// sub-communicator; confirm against upstream.
// -----------------------------------------------------------------------------
930 template <class MF>
931 std::enable_if_t<IsFabArray<MF>::value>
932 FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,
933  Vector<int> const& ncomp, Vector<IntVect> const& nghost,
934  Vector<Periodicity> const& period, Vector<int> const& cross = {})
935 {
936  BL_PROFILE("FillBoundary(Vector)");
937 #if 1
938  const int N = mf.size();
939  for (int i = 0; i < N; ++i) {
940  mf[i]->FillBoundary_nowait(scomp[i], ncomp[i], nghost[i], period[i],
941  cross.empty() ? 0 : cross[i]);
942  }
943  for (int i = 0; i < N; ++i) {
944  mf[i]->FillBoundary_finish();
945  }
946 
947 #else
948  using FAB = typename MF::FABType::value_type;
949  using T = typename FAB::value_type;
950 
951  const int nmfs = mf.size();
952  Vector<FabArrayBase::CommMetaData const*> cmds;
953  int N_locs = 0;
954  int N_rcvs = 0;
955  int N_snds = 0;
956  for (int imf = 0; imf < nmfs; ++imf) {
957  if (nghost[imf].max() > 0) {
958  auto const& TheFB = mf[imf]->getFB(nghost[imf], period[imf],
959  cross.empty() ? 0 : cross[imf]);
960  // The FB is cached. Therefore it's safe take its address for later use.
961  cmds.push_back(static_cast<FabArrayBase::CommMetaData const*>(&TheFB));
962  N_locs += TheFB.m_LocTags->size();
963  N_rcvs += TheFB.m_RcvTags->size();
964  N_snds += TheFB.m_SndTags->size();
965  } else {
966  cmds.push_back(nullptr);
967  }
968  }
969 
// Flatten the local (on-rank) copies of all FabArrays into one tag list.
970  using TagT = Array4CopyTag<T>;
971  Vector<TagT> local_tags;
972  local_tags.reserve(N_locs);
973  static_assert(amrex::IsStoreAtomic<T>::value, "FillBoundary(Vector): storing T is not atomic");
974  for (int imf = 0; imf < nmfs; ++imf) {
975  if (cmds[imf]) {
976  auto const& tags = *(cmds[imf]->m_LocTags);
977  for (auto const& tag : tags) {
978  local_tags.push_back({(*mf[imf])[tag.dstIndex].array (scomp[imf],ncomp[imf]),
979  (*mf[imf])[tag.srcIndex].const_array(scomp[imf],ncomp[imf]),
980  tag.dbox,
981  (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
982  }
983  }
984  }
985 
986  if (ParallelContext::NProcsSub() == 1) {
987  detail::fbv_copy(local_tags);
988  return;
989  }
990 
991 #ifdef AMREX_USE_MPI
992  //
993  // Do this before prematurely exiting if running in parallel.
994  // Otherwise sequence numbers will not match across MPI processes.
995  //
998 
999  if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) { return; } // No work to do
1000 
1001  char* the_recv_data = nullptr;
1002  Vector<int> recv_from;
1003  Vector<std::size_t> recv_size;
1004  Vector<MPI_Request> recv_reqs;
1005  Vector<MPI_Status> recv_stat;
1006  Vector<TagT> recv_tags;
1007 
1008  if (N_rcvs > 0) {
1009 
// One message per distinct sender rank, covering all FabArrays.
1010  for (int imf = 0; imf < nmfs; ++imf) {
1011  if (cmds[imf]) {
1012  auto const& tags = *(cmds[imf]->m_RcvTags);
1013  for (const auto& kv : tags) {
1014  recv_from.push_back(kv.first);
1015  }
1016  }
1017  }
1018  amrex::RemoveDuplicates(recv_from);
1019  const int nrecv = recv_from.size();
1020 
1021  recv_reqs.resize(nrecv, MPI_REQUEST_NULL);
1022  recv_stat.resize(nrecv);
1023 
1024  recv_tags.reserve(N_rcvs);
1025 
// Compute aligned per-sender message sizes and per-tag offsets within them.
1026  Vector<Vector<std::size_t> > recv_offset(nrecv);
1027  Vector<std::size_t> offset;
1028  recv_size.reserve(nrecv);
1029  offset.reserve(nrecv);
1030  std::size_t TotalRcvsVolume = 0;
1031  for (int i = 0; i < nrecv; ++i) {
1032  std::size_t nbytes = 0;
1033  for (int imf = 0; imf < nmfs; ++imf) {
1034  if (cmds[imf]) {
1035  auto const& tags = *(cmds[imf]->m_RcvTags);
1036  auto it = tags.find(recv_from[i]);
1037  if (it != tags.end()) {
1038  for (auto const& cct : it->second) {
1039  auto& dfab = (*mf[imf])[cct.dstIndex];
1040  recv_offset[i].push_back(nbytes);
1041  recv_tags.push_back({dfab.array(scomp[imf],ncomp[imf]),
1042  makeArray4<T const>(nullptr,cct.dbox,ncomp[imf]),
1043  cct.dbox, Dim3{0,0,0}});
1044  nbytes += dfab.nBytes(cct.dbox,ncomp[imf]);
1045  }
1046  }
1047  }
1048  }
1049 
1050  std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
1051  nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned
1052 
1053  // Also need to align the offset properly
1054  TotalRcvsVolume = amrex::aligned_size(std::max(alignof(T),acd), TotalRcvsVolume);
1055 
1056  offset.push_back(TotalRcvsVolume);
1057  TotalRcvsVolume += nbytes;
1058 
1059  recv_size.push_back(nbytes);
1060  }
1061 
1062  the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
1063 
// Post one receive per sender and patch each tag's source pointer into the
// freshly allocated buffer.
1064  int k = 0;
1065  for (int i = 0; i < nrecv; ++i) {
1066  char* p = the_recv_data + offset[i];
1067  const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
1068  recv_reqs[i] = ParallelDescriptor::Arecv
1069  (p, recv_size[i], rank, SeqNum, comm).req();
1070  for (int j = 0, nj = recv_offset[i].size(); j < nj; ++j) {
1071  recv_tags[k++].sfab.p = (T const*)(p + recv_offset[i][j]);
1072  }
1073  }
1074  }
1075 
1076  char* the_send_data = nullptr;
1077  Vector<int> send_rank;
1078  Vector<char*> send_data;
1079  Vector<std::size_t> send_size;
1080  Vector<MPI_Request> send_reqs;
1081  if (N_snds > 0) {
// Mirror of the receive setup: one message per distinct destination rank.
1082  for (int imf = 0; imf < nmfs; ++imf) {
1083  if (cmds[imf]) {
1084  auto const& tags = *(cmds[imf]->m_SndTags);
1085  for (auto const& kv : tags) {
1086  send_rank.push_back(kv.first);
1087  }
1088  }
1089  }
1090  amrex::RemoveDuplicates(send_rank);
1091  const int nsend = send_rank.size();
1092 
1093  send_data.resize(nsend, nullptr);
1094  send_reqs.resize(nsend, MPI_REQUEST_NULL);
1095 
1096  Vector<TagT> send_tags;
1097  send_tags.reserve(N_snds);
1098 
1099  Vector<Vector<std::size_t> > send_offset(nsend);
1100  Vector<std::size_t> offset;
1101  send_size.reserve(nsend);
1102  offset.reserve(nsend);
1103  std::size_t TotalSndsVolume = 0;
1104  for (int i = 0; i < nsend; ++i) {
1105  std::size_t nbytes = 0;
1106  for (int imf = 0; imf < nmfs; ++imf) {
1107  if (cmds[imf]) {
1108  auto const& tags = *(cmds[imf]->m_SndTags);
1109  auto it = tags.find(send_rank[i]);
1110  if (it != tags.end()) {
1111  for (auto const& cct : it->second) {
1112  auto const& sfab = (*mf[imf])[cct.srcIndex];
1113  send_offset[i].push_back(nbytes);
1114  send_tags.push_back({amrex::makeArray4<T>(nullptr,cct.sbox,ncomp[imf]),
1115  sfab.const_array(scomp[imf],ncomp[imf]),
1116  cct.sbox, Dim3{0,0,0}});
1117  nbytes += sfab.nBytes(cct.sbox,ncomp[imf]);
1118  }
1119  }
1120  }
1121  }
1122 
1123  std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
1124  nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned
1125 
1126  // Also need to align the offset properly
1127  TotalSndsVolume = amrex::aligned_size(std::max(alignof(T),acd), TotalSndsVolume);
1128 
1129  offset.push_back(TotalSndsVolume);
1130  TotalSndsVolume += nbytes;
1131 
1132  send_size.push_back(nbytes);
1133  }
1134 
1135  the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalSndsVolume));
1136  int k = 0;
1137  for (int i = 0; i < nsend; ++i) {
1138  send_data[i] = the_send_data + offset[i];
1139  for (int j = 0, nj = send_offset[i].size(); j < nj; ++j) {
1140  send_tags[k++].dfab.p = (T*)(send_data[i] + send_offset[i][j]);
1141  }
1142  }
1143 
// Pack the send buffer, then post all sends.
1144  detail::fbv_copy(send_tags);
1145 
1146  FabArray<FAB>::PostSnds(send_data, send_size, send_rank, send_reqs, SeqNum);
1147  }
1148 
1149 #if !defined(AMREX_DEBUG)
1150  int recv_flag;
1151  ParallelDescriptor::Test(recv_reqs, recv_flag, recv_stat);
1152 #endif
1153 
// Local copies overlap with communication; poke MPI progress afterwards.
1154  if (N_locs > 0) {
1155  detail::fbv_copy(local_tags);
1156 #if !defined(AMREX_DEBUG)
1157  ParallelDescriptor::Test(recv_reqs, recv_flag, recv_stat);
1158 #endif
1159  }
1160 
1161  if (N_rcvs > 0) {
1162  ParallelDescriptor::Waitall(recv_reqs, recv_stat);
1163 #ifdef AMREX_DEBUG
1164  if (!FabArrayBase::CheckRcvStats(recv_stat, recv_size, SeqNum)) {
1165  amrex::Abort("FillBoundary(vector) failed with wrong message size");
1166  }
1167 #endif
1168 
// Unpack the receive buffer into ghost cells, then release it.
1169  detail::fbv_copy(recv_tags);
1170 
1171  amrex::The_Comms_Arena()->free(the_recv_data);
1172  }
1173 
1174  if (N_snds > 0) {
1175  Vector<MPI_Status> stats(send_reqs.size());
1176  ParallelDescriptor::Waitall(send_reqs, stats);
1177  amrex::The_Comms_Arena()->free(the_send_data);
1178  }
1179 
1180 #endif // #ifdef AMREX_USE_MPI
1181 #endif // #if 1 #else
1182 }
1183 
1184 template <class MF>
1185 std::enable_if_t<IsFabArray<MF>::value>
1186 FillBoundary (Vector<MF*> const& mf, const Periodicity& a_period = Periodicity::NonPeriodic())
1187 {
1188  Vector<int> scomp(mf.size(), 0);
1189  Vector<int> ncomp;
1190  Vector<IntVect> nghost;
1191  Vector<Periodicity> period(mf.size(), a_period);
1192  ncomp.reserve(mf.size());
1193  nghost.reserve(mf.size());
1194  for (auto const& x : mf) {
1195  ncomp.push_back(x->nComp());
1196  nghost.push_back(x->nGrowVect());
1197  }
1198  FillBoundary(mf, scomp, ncomp, nghost, period);
1199 }
#define BL_PROFILE(a)
Definition: AMReX_BLProfiler.H:551
#define BL_PROFILE_SYNC_STOP()
Definition: AMReX_BLProfiler.H:645
#define BL_PROFILE_SYNC_START_TIMED(fname)
Definition: AMReX_BLProfiler.H:644
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition: AMReX_BLassert.H:49
#define BL_ASSERT(EX)
Definition: AMReX_BLassert.H:39
#define AMREX_ASSERT_WITH_MESSAGE(EX, MSG)
Definition: AMReX_BLassert.H:37
#define AMREX_ASSERT(EX)
Definition: AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition: AMReX_Extension.H:80
#define AMREX_NODISCARD
Definition: AMReX_Extension.H:251
std::enable_if_t< IsFabArray< MF >::value > FillBoundary(Vector< MF * > const &mf, Vector< int > const &scomp, Vector< int > const &ncomp, Vector< IntVect > const &nghost, Vector< Periodicity > const &period, Vector< int > const &cross={})
Definition: AMReX_FabArrayCommI.H:932
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition: AMReX_HypreMLABecLap.cpp:1089
#define AMREX_LOOP_4D(bx, ncomp, i, j, k, n, block)
Definition: AMReX_Loop.nolint.H:16
int MPI_Comm
Definition: AMReX_ccse-mpi.H:47
static constexpr int MPI_REQUEST_NULL
Definition: AMReX_ccse-mpi.H:53
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
IndexType ixType() const noexcept
Return index type of this BoxArray.
Definition: AMReX_BoxArray.H:837
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition: AMReX_FabArrayBase.H:94
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition: AMReX_FabArrayBase.H:130
An Array of FortranArrayBox(FAB)-like Objects.
Definition: AMReX_FabArray.H:344
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int max() const noexcept
maximum (no absolute values) value
Definition: AMReX_IntVect.H:214
MPI_Request req() const
Definition: AMReX_ParallelDescriptor.H:74
This provides length of period for periodic domains. 0 means it is not periodic in that direction....
Definition: AMReX_Periodicity.H:17
bool isAnyPeriodic() const noexcept
Definition: AMReX_Periodicity.H:22
Long size() const noexcept
Definition: AMReX_Vector.H:50
@ FAB
Definition: AMReX_AmrvisConstants.H:86
AMREX_GPU_HOST_DEVICE Long size(T const &b) noexcept
integer version
Definition: AMReX_GpuRange.H:26
bool inGraphRegion()
Definition: AMReX_GpuControl.H:115
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:265
bool inLaunchRegion() noexcept
Definition: AMReX_GpuControl.H:86
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:251
int NProcs()
Number of processes in MPI_COMM_WORLD.
Definition: AMReX_MPMD.cpp:122
int MyProc()
Definition: AMReX_MPMD.cpp:117
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> unpack_recv_buffer_cpu(FabArray< FAB > &mf, int dcomp, int ncomp, Vector< char * > const &recv_data, Vector< std::size_t > const &recv_size, Vector< FabArrayBase::CopyComTagsContainer const * > const &recv_cctc, DTOS const &dtos=DTOS{}, Proj const &proj=Proj{}) noexcept
std::enable_if_t< IsBaseFab< FAB >) &&IsDataPacking< DataPacking, FAB >)> ParallelCopy_finish(FabArray< FAB > &dest, CommHandler handler, const FabArrayBase::CommMetaData &cmd, const DataPacking &data_packing)
Definition: AMReX_NonLocalBC.H:793
std::enable_if_t< IsBaseFab< FAB >::value > PrepareSendBuffers(const PackComponents &components, FabArray< FAB > &dest, const FabArray< FAB > &src, CommData &comm, const FabArrayBase::MapOfCopyComTagContainers &cctc)
Calls PrepareComBuffers.
Definition: AMReX_NonLocalBC.H:555
AMREX_NODISCARD CommHandler ParallelCopy_nowait(NoLocalCopy, FabArray< FAB > &dest, const FabArray< FAB > &src, const FabArrayBase::CommMetaData &cmd, const DataPacking &data_packing)
Definition: AMReX_NonLocalBC.H:701
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> unpack_recv_buffer_gpu(FabArray< FAB > &mf, int scomp, int ncomp, Vector< char * > const &recv_data, Vector< std::size_t > const &recv_size, Vector< FabArrayBase::CopyComTagsContainer const * > const &recv_cctc, DTOS const &dtos=DTOS{}, Proj const &proj=Proj{})
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> FillBoundary_finish(CommHandler handler, FabArray< FAB > &mf, const FabArrayBase::CommMetaData &cmd, int scomp, int ncomp, DTOS const &dtos, Proj const &proj=Proj{})
Finish communication started by FillBoundary_nowait.
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition: AMReX_ParallelContext.H:70
int global_to_local_rank(int rank) noexcept
Definition: AMReX_ParallelContext.H:98
int NProcsSub() noexcept
number of ranks in current frame
Definition: AMReX_ParallelContext.H:74
void Test(MPI_Request &, int &, MPI_Status &)
Definition: AMReX_ParallelDescriptor.cpp:1207
Message Asend(const T *, size_t n, int pid, int tag)
Definition: AMReX_ParallelDescriptor.H:1088
void Waitall(Vector< MPI_Request > &, Vector< MPI_Status > &)
Definition: AMReX_ParallelDescriptor.cpp:1295
void Bcast(void *, int, MPI_Datatype, int, MPI_Comm)
Definition: AMReX_ParallelDescriptor.cpp:1282
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition: AMReX_ParallelDescriptor.H:613
Message Arecv(T *, size_t n, int pid, int tag)
Definition: AMReX_ParallelDescriptor.H:1130
@ min
Definition: AMReX_ParallelReduce.H:18
@ max
Definition: AMReX_ParallelReduce.H:17
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
@ make_alias
Definition: AMReX_MakeType.H:7
std::unique_ptr< char, TheFaArenaDeleter > TheFaArenaPointer
Definition: AMReX_FabArray.H:104
BoxND< AMREX_SPACEDIM > Box
Definition: AMReX_BaseFwd.H:27
IntVect nGrowVect(FabArrayBase const &fa)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition: AMReX_Box.H:1211
void Copy(FabArray< DFAB > &dst, FabArray< SFAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition: AMReX_FabArray.H:179
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 ubound(Array4< T > const &a) noexcept
Definition: AMReX_Array4.H:315
BoxArray const & boxArray(FabArrayBase const &fa)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 lbound(Array4< T > const &a) noexcept
Definition: AMReX_Array4.H:308
Arena * The_Comms_Arena()
Definition: AMReX_Arena.cpp:669
IntVectND< AMREX_SPACEDIM > IntVect
Definition: AMReX_BaseFwd.H:30
void Add(FabArray< FAB > &dst, FabArray< FAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition: AMReX_FabArray.H:240
Arena * The_Pinned_Arena()
Definition: AMReX_Arena.cpp:649
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition: AMReX.cpp:225
std::size_t aligned_size(std::size_t align_requirement, std::size_t size) noexcept
Given a minimum required size of size bytes, this returns the next largest arena size that will align...
Definition: AMReX_Arena.H:30
void ParallelCopy(MF &dst, MF const &src, int scomp, int dcomp, int ncomp, IntVect const &ng_src=IntVect(0), IntVect const &ng_dst=IntVect(0), Periodicity const &period=Periodicity::NonPeriodic())
dst = src w/ MPI communication
Definition: AMReX_FabArrayUtility.H:1672
void RemoveDuplicates(Vector< T > &vec)
Definition: AMReX_Vector.H:190
Definition: AMReX_FabArrayCommI.H:896
void fbv_copy(Vector< TagT > const &tags)
Definition: AMReX_FabArrayCommI.H:898
parallel copy or add
Definition: AMReX_FabArrayBase.H:536
bool m_threadsafe_rcv
Definition: AMReX_FabArrayBase.H:474
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition: AMReX_FabArrayBase.H:477
std::unique_ptr< MapOfCopyComTagContainers > m_SndTags
Definition: AMReX_FabArrayBase.H:476
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition: AMReX_FabArrayBase.H:475
FillBoundary.
Definition: AMReX_FabArrayBase.H:487
Definition: AMReX_TypeTraits.H:266