Block-Structured AMR Software Framework
 
Loading...
Searching...
No Matches
AMReX_FabArrayCommI.H
Go to the documentation of this file.
1
2#include <AMReX_FBI.H>
3#include <AMReX_PCI.H>
4
5namespace amrex {
6
7template <class FAB>
8template <typename BUF, class F, std::enable_if_t<IsBaseFab<F>::value,int>Z>
9void
10FabArray<FAB>::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost,
11 const Periodicity& period, bool cross,
12 bool enforce_periodicity_only,
13 bool override_sync,
14 IntVect const& sumboundary_src_nghost,
15 bool deterministic) // so far the deterministic is for sumboundary only
16{
17 BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: FB");
18 BL_PROFILE("FillBoundary_nowait()");
19
20 AMREX_ASSERT_WITH_MESSAGE(!fbd, "FillBoundary_nowait() called when comm operation already in progress.");
21 AMREX_ASSERT(!enforce_periodicity_only || !override_sync);
22
23 bool sumboundary = sumboundary_src_nghost.allGE(0);
24
25 bool work_to_do;
26 if (enforce_periodicity_only) {
27 work_to_do = period.isAnyPeriodic();
28 } else if (override_sync) {
29 work_to_do = (nghost.max() > 0) || !is_cell_centered();
30 } else if (sumboundary) {
31 work_to_do = true;
32 } else {
33 work_to_do = nghost.max() > 0;
34 }
35 if (!work_to_do) { return; }
36
37 const FB& TheFB = getFB(nghost, period, cross, enforce_periodicity_only, override_sync, sumboundary_src_nghost);
38
40 {
41 //
42 // There can only be local work to do.
43 //
44 int N_locs = (*TheFB.m_LocTags).size();
45 if (N_locs == 0) { return; }
46#ifdef AMREX_USE_GPU
48 {
49#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
50 if (Gpu::inGraphRegion() && !sumboundary)
51 {
52 FB_local_copy_cuda_graph_1(TheFB, scomp, ncomp);
53 }
54 else
55#endif
56 {
57 if (sumboundary) {
59 FB_local_add_gpu(TheFB, scomp, ncomp, deterministic);
60 } else {
61 amrex::Abort("SumBoundary requires operator+=");
62 }
63 } else {
64 FB_local_copy_gpu(TheFB, scomp, ncomp);
65 if (!Gpu::inNoSyncRegion()) {
67 }
68 }
69 }
70 }
71 else
72#endif
73 {
74 if (sumboundary) {
76 FB_local_add_cpu(TheFB, scomp, ncomp);
77 } else {
78 amrex::Abort("SumBoundary requires operator+=");
79 }
80 } else {
81 FB_local_copy_cpu(TheFB, scomp, ncomp);
82 }
83 }
84
85 return;
86 }
87
88#ifdef BL_USE_MPI
89
90 //
91 // Do this before prematurely exiting if running in parallel.
92 // Otherwise sequence numbers will not match across MPI processes.
93 //
94 int SeqNum = ParallelDescriptor::SeqNum();
95
96 const int N_locs = TheFB.m_LocTags->size();
97 const int N_rcvs = TheFB.m_RcvTags->size();
98 const int N_snds = TheFB.m_SndTags->size();
99
100 if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) {
101 // No work to do.
102 return;
103 }
104
105 fbd = std::make_unique<FBData<FAB>>();
106 fbd->fb = &TheFB;
107 fbd->scomp = scomp;
108 fbd->ncomp = ncomp;
109 fbd->tag = SeqNum;
110 fbd->deterministic = deterministic;
111
112 //
113 // Post rcvs. Allocate one chunk of space to hold'm all.
114 //
115
116 if (N_rcvs > 0) {
117 PostRcvs<BUF>(*TheFB.m_RcvTags, fbd->the_recv_data,
118 fbd->recv_data, fbd->recv_size, fbd->recv_from, fbd->recv_reqs,
119 ncomp, SeqNum);
120 fbd->recv_stat.resize(N_rcvs);
121 }
122
123 //
124 // Post send's
125 //
126 char*& the_send_data = fbd->the_send_data;
127 Vector<char*> & send_data = fbd->send_data;
128 Vector<std::size_t> send_size;
129 Vector<int> send_rank;
130 Vector<MPI_Request>& send_reqs = fbd->send_reqs;
132
133 if (N_snds > 0)
134 {
135 PrepareSendBuffers<BUF>(*TheFB.m_SndTags, the_send_data, send_data, send_size, send_rank,
136 send_reqs, send_cctc, ncomp);
137
138#ifdef AMREX_USE_GPU
140 {
141#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
142 if (Gpu::inGraphRegion()) {
143 FB_pack_send_buffer_cuda_graph(TheFB, scomp, ncomp, send_data, send_size, send_cctc);
144 }
145 else
146#endif
147 {
148 pack_send_buffer_gpu<BUF>(*this, scomp, ncomp, send_data, send_size, send_cctc, TheFB.m_id);
149 }
150 }
151 else
152#endif
153 {
154 pack_send_buffer_cpu<BUF>(*this, scomp, ncomp, send_data, send_size, send_cctc);
155 }
156
157 AMREX_ASSERT(send_reqs.size() == N_snds);
158 PostSnds(send_data, send_size, send_rank, send_reqs, SeqNum);
159 }
160
161 FillBoundary_test();
162
163 //
164 // Do the local work. Hope for a bit of communication/computation overlap.
165 //
166 if (N_locs > 0)
167 {
168#ifdef AMREX_USE_GPU
170 {
171#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
172 if (Gpu::inGraphRegion() && !sumboundary) {
173 FB_local_copy_cuda_graph_n(TheFB, scomp, ncomp);
174 }
175 else
176#endif
177 {
178 if (sumboundary) {
180 FB_local_add_gpu(TheFB, scomp, ncomp, deterministic);
181 } else {
182 amrex::Abort("SumBoundary requires operator+=");
183 }
184 } else {
185 FB_local_copy_gpu(TheFB, scomp, ncomp);
186 if (!Gpu::inNoSyncRegion() && N_rcvs == 0) {
188 }
189 }
190 }
191 }
192 else
193#endif
194 {
195 if (sumboundary) {
197 FB_local_add_cpu(TheFB, scomp, ncomp);
198 } else {
199 amrex::Abort("SumBoundary requires operator+=");
200 }
201 } else {
202 FB_local_copy_cpu(TheFB, scomp, ncomp);
203 }
204 }
205
206 FillBoundary_test();
207 }
208
209#endif /*BL_USE_MPI*/
210
211#ifndef AMREX_USE_GPU
212 amrex::ignore_unused(deterministic);
213#endif
214}
215
216template <class FAB>
217template <typename BUF, class F, std::enable_if_t<IsBaseFab<F>::value,int>Z>
218void
220{
221#ifdef AMREX_USE_MPI
222
223 BL_PROFILE("FillBoundary_finish()");
225
226 if (!fbd) { n_filled = IntVect::TheZeroVector(); return; }
227
228 const FB* TheFB = fbd->fb;
229 bool sumboundary = TheFB->m_sb_snghost.allGE(0);
230 const auto N_rcvs = static_cast<int>(TheFB->m_RcvTags->size());
231 if (N_rcvs > 0)
232 {
233 Vector<const CopyComTagsContainer*> recv_cctc(N_rcvs,nullptr);
234 for (int k = 0; k < N_rcvs; k++)
235 {
236 if (fbd->recv_size[k] > 0)
237 {
238 auto const& cctc = TheFB->m_RcvTags->at(fbd->recv_from[k]);
239 recv_cctc[k] = &cctc;
240 }
241 }
242
243 int actual_n_rcvs = N_rcvs - std::count(fbd->recv_data.begin(), fbd->recv_data.end(), nullptr);
244
245 if (actual_n_rcvs > 0) {
246 ParallelDescriptor::Waitall(fbd->recv_reqs, fbd->recv_stat);
247#ifdef AMREX_DEBUG
248 if (!CheckRcvStats(fbd->recv_stat, fbd->recv_size, fbd->tag))
249 {
250 amrex::Abort("FillBoundary_finish failed with wrong message size");
251 }
252#endif
253 }
254
255 bool is_thread_safe = TheFB->m_threadsafe_rcv;
256 auto op = sumboundary ? FabArrayBase::ADD : FabArrayBase::COPY;
257
258#ifdef AMREX_USE_GPU
260 {
261#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
262 if (Gpu::inGraphRegion() && !sumboundary)
263 {
264 FB_unpack_recv_buffer_cuda_graph(*TheFB, fbd->scomp, fbd->ncomp,
265 fbd->recv_data, fbd->recv_size,
266 recv_cctc, is_thread_safe);
267 }
268 else
269#endif
270 {
271 bool deterministic = fbd->deterministic;
272 unpack_recv_buffer_gpu<BUF>(*this, fbd->scomp, fbd->ncomp, fbd->recv_data, fbd->recv_size,
273 recv_cctc, op, is_thread_safe, TheFB->m_id, deterministic);
274 }
275 }
276 else
277#endif
278 {
279 unpack_recv_buffer_cpu<BUF>(*this, fbd->scomp, fbd->ncomp, fbd->recv_data, fbd->recv_size,
280 recv_cctc, op, is_thread_safe);
281 }
282
283 if (fbd->the_recv_data)
284 {
285 amrex::The_Comms_Arena()->free(fbd->the_recv_data);
286 fbd->the_recv_data = nullptr;
287 }
288 }
289
290 const auto N_snds = static_cast<int>(TheFB->m_SndTags->size());
291 if (N_snds > 0) {
292 Vector<MPI_Status> stats(fbd->send_reqs.size());
293 ParallelDescriptor::Waitall(fbd->send_reqs, stats);
294 amrex::The_Comms_Arena()->free(fbd->the_send_data);
295 fbd->the_send_data = nullptr;
296 }
297
298 fbd.reset();
299
300#endif
301}
302
303template <class FAB>
304void
306 int scomp,
307 int dcomp,
308 int ncomp,
309 const IntVect& snghost,
310 const IntVect& dnghost,
311 const Periodicity& period,
312 CpOp op,
313 const FabArrayBase::CPC * a_cpc,
314 bool deterministic)
315{
316 BL_PROFILE("FabArray::ParallelCopy()");
317
318 ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period, op, a_cpc,
319 false, deterministic);
320 ParallelCopy_finish();
321}
322
323template <class FAB>
324void
325FabArray<FAB>::ParallelCopy (const FabArray<FAB>& src, int src_comp, int dest_comp,
326 int num_comp, const IntVect& snghost, const IntVect& dnghost,
327 const IntVect& offset, const Periodicity& period)
328{
329 BL_PROFILE("FabArray::ParallelCopy()");
330
331 ParallelCopy_nowait(src,src_comp,dest_comp,num_comp,snghost,dnghost,offset,period);
332 ParallelCopy_finish();
333}
334
335template <class FAB>
336void
337FabArray<FAB>::ParallelAdd (const FabArray<FAB>& src, int src_comp, int dest_comp,
338 int num_comp, const IntVect& snghost, const IntVect& dnghost,
339 const IntVect& offset, const Periodicity& period)
340{
341 BL_PROFILE("FabArray::ParallelAdd()");
342
343 ParallelCopy_nowait(src,src_comp,dest_comp,num_comp,snghost,dnghost,offset,period,
345 ParallelCopy_finish();
346}
347
348template <class FAB>
349void
351 int scomp,
352 int dcomp,
353 int ncomp,
354 const IntVect& snghost,
355 const IntVect& dnghost,
356 const Periodicity& period)
357{
358 BL_PROFILE("FabArray::ParallelCopyToGhost()");
359
360 ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period,
361 FabArrayBase::COPY, nullptr, true);
362 ParallelCopy_finish();
363}
364
365template <class FAB>
366void
368 int scomp,
369 int dcomp,
370 int ncomp,
371 const IntVect& snghost,
372 const IntVect& dnghost,
373 const Periodicity& period)
374{
375 ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period,
376 FabArrayBase::COPY, nullptr, true);
377}
378
379template <class FAB>
380void
382{
383 ParallelCopy_finish();
384}
385
386
387template <class FAB>
388void
390 int scomp,
391 int dcomp,
392 int ncomp,
393 const IntVect& snghost,
394 const IntVect& dnghost,
395 const Periodicity& period,
396 CpOp op,
397 const FabArrayBase::CPC * a_cpc,
398 bool to_ghost_cells_only,
399 bool deterministic)
400{
401 ParallelCopy_nowait(src,scomp,dcomp,ncomp,snghost,dnghost,IntVect(0),period,op,a_cpc,
402 to_ghost_cells_only, deterministic);
403}
404
405template <class FAB>
406void
408 int scomp,
409 int dcomp,
410 int ncomp,
411 const IntVect& snghost,
412 const IntVect& dnghost,
413 const IntVect& offset,
414 const Periodicity& period,
415 CpOp op,
416 const FabArrayBase::CPC * a_cpc,
417 bool to_ghost_cells_only,
418 bool deterministic)
419{
420 BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: PC");
421 BL_PROFILE("FabArray::ParallelCopy_nowait()");
422
423 AMREX_ASSERT_WITH_MESSAGE(!pcd, "ParallelCopy_nowait() called when comm operation already in progress.");
424
425 if (empty() || src.empty()) {
426 return;
427 }
428
430 BL_ASSERT(boxArray().ixType() == src.boxArray().ixType());
431 BL_ASSERT(src.nGrowVect().allGE(snghost));
432 BL_ASSERT( nGrowVect().allGE(dnghost));
433
434 n_filled = dnghost;
435
436 if ((ParallelDescriptor::NProcs() == 1) &&
437 (this->size() == 1) && (src.size() == 1) &&
438 !period.isAnyPeriodic() && !to_ghost_cells_only && (offset == 0))
439 {
440 if (this != &src) { // avoid self copy or plus
441 auto const& da = this->array(0, dcomp);
442 auto const& sa = src.const_array(0, scomp);
443 Box box = amrex::grow(src.box(0),snghost)
444 & amrex::grow(this->box(0),dnghost);
445 if (op == FabArrayBase::COPY) {
446#ifdef AMREX_USE_GPU
447 if (Gpu::inLaunchRegion()) {
448 ParallelFor(box, ncomp,
449 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) {
450 da(i,j,k,n) = sa(i,j,k,n);
451 });
452 if (!Gpu::inNoSyncRegion()) {
454 }
455 } else
456#endif
457 {
458 auto const& lo = amrex::lbound(box);
459 auto const& hi = amrex::ubound(box);
460#ifdef AMREX_USE_OMP
461#pragma omp parallel for collapse(3)
462#endif
463 for (int n = 0; n < ncomp; ++n) {
464 for (int k = lo.z; k <= hi.z; ++k) {
465 for (int j = lo.y; j <= hi.y; ++j) {
467 for (int i = lo.x; i <= hi.x; ++i) {
468 da(i,j,k,n) = sa(i,j,k,n);
469 }}}}
470 }
471 } else {
472#ifdef AMREX_USE_GPU
473 if (Gpu::inLaunchRegion()) {
474 ParallelFor(box, ncomp,
475 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) {
476 da(i,j,k,n) += sa(i,j,k,n);
477 });
478 if (!Gpu::inNoSyncRegion()) {
480 }
481 } else
482#endif
483 {
484 auto const& lo = amrex::lbound(box);
485 auto const& hi = amrex::ubound(box);
486#ifdef AMREX_USE_OMP
487#pragma omp parallel for collapse(3)
488#endif
489 for (int n = 0; n < ncomp; ++n) {
490 for (int k = lo.z; k <= hi.z; ++k) {
491 for (int j = lo.y; j <= hi.y; ++j) {
493 for (int i = lo.x; i <= hi.x; ++i) {
494 da(i,j,k,n) += sa(i,j,k,n);
495 }}}}
496 }
497 }
498 }
499 return;
500 }
501
502 if ((src.boxArray().ixType().cellCentered() || op == FabArrayBase::COPY) &&
503 (boxarray == src.boxarray && distributionMap == src.distributionMap) &&
504 snghost == IntVect::TheZeroVector() &&
505 dnghost == IntVect::TheZeroVector() &&
506 !period.isAnyPeriodic() && !to_ghost_cells_only && (offset == 0))
507 {
508 //
509 // Short-circuit full intersection code if we're doing copy()s or if
510 // we're doing plus()s on cell-centered data. Don't do plus()s on
511 // non-cell-centered data this simplistic way.
512 //
513 if (this != &src) { // avoid self copy or plus
514 if (op == FabArrayBase::COPY) {
515 Copy(*this, src, scomp, dcomp, ncomp, IntVect(0));
516 } else {
517 Add(*this, src, scomp, dcomp, ncomp, IntVect(0));
518 }
519 }
520 return;
521 }
522
523 const CPC& thecpc = (a_cpc) ? *a_cpc : getCPC(dnghost, src, snghost, period,
524 to_ghost_cells_only,offset);
525
527 {
528 //
529 // There can only be local work to do.
530 //
531
532 int N_locs = (*thecpc.m_LocTags).size();
533 if (N_locs == 0) { return; }
534#ifdef AMREX_USE_GPU
536 {
537 PC_local_gpu(thecpc, src, scomp, dcomp, ncomp, op, deterministic);
538 }
539 else
540#endif
541 {
542 PC_local_cpu(thecpc, src, scomp, dcomp, ncomp, op);
543 }
544
545 return;
546 }
547
548#ifdef BL_USE_MPI
549
550 //
551 // Do this before prematurely exiting if running in parallel.
552 // Otherwise sequence numbers will not match across MPI processes.
553 //
554 int tag = ParallelDescriptor::SeqNum();
555
556 const int N_snds = thecpc.m_SndTags->size();
557 const int N_rcvs = thecpc.m_RcvTags->size();
558 const int N_locs = thecpc.m_LocTags->size();
559
560 if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) {
561 //
562 // No work to do.
563 //
564
565 return;
566 }
567
568 //
569 // Send/Recv at most MaxComp components at a time to cut down memory usage.
570 //
571 int NCompLeft = ncomp;
572 int SC = scomp, DC = dcomp, NC;
573
574 for (int ipass = 0; ipass < ncomp; )
575 {
576 pcd = std::make_unique<PCData<FAB>>();
577 pcd->cpc = &thecpc;
578 pcd->src = &src;
579 pcd->op = op;
580 pcd->tag = tag;
581
582 NC = std::min(NCompLeft,FabArrayBase::MaxComp);
583 const bool last_iter = (NCompLeft == NC);
584
585 pcd->SC = SC;
586 pcd->DC = DC;
587 pcd->NC = NC;
588
589 //
590 // Post rcvs. Allocate one chunk of space to hold'm all.
591 //
592 pcd->the_recv_data = nullptr;
593
594 pcd->actual_n_rcvs = 0;
595 if (N_rcvs > 0) {
596 PostRcvs(*thecpc.m_RcvTags, pcd->the_recv_data,
597 pcd->recv_data, pcd->recv_size, pcd->recv_from, pcd->recv_reqs, NC, pcd->tag);
598 pcd->actual_n_rcvs = N_rcvs - std::count(pcd->recv_size.begin(), pcd->recv_size.end(), 0);
599 }
600
601 //
602 // Post send's
603 //
604 Vector<char*> send_data;
605 Vector<std::size_t> send_size;
606 Vector<int> send_rank;
608
609 if (N_snds > 0)
610 {
611 src.PrepareSendBuffers(*thecpc.m_SndTags, pcd->the_send_data, send_data, send_size,
612 send_rank, pcd->send_reqs, send_cctc, NC);
613
614#ifdef AMREX_USE_GPU
616 {
617 pack_send_buffer_gpu(src, SC, NC, send_data, send_size, send_cctc, thecpc.m_id);
618 }
619 else
620#endif
621 {
622 pack_send_buffer_cpu(src, SC, NC, send_data, send_size, send_cctc);
623 }
624
625 AMREX_ASSERT(pcd->send_reqs.size() == N_snds);
626 FabArray<FAB>::PostSnds(send_data, send_size, send_rank, pcd->send_reqs, pcd->tag);
627 }
628
629 //
630 // Do the local work. Hope for a bit of communication/computation overlap.
631 //
632 if (N_locs > 0)
633 {
634#ifdef AMREX_USE_GPU
636 {
637 PC_local_gpu(thecpc, src, SC, DC, NC, op, deterministic);
638 }
639 else
640#endif
641 {
642 PC_local_cpu(thecpc, src, SC, DC, NC, op);
643 }
644 }
645
646 if (!last_iter)
647 {
648 ParallelCopy_finish();
649
650 SC += NC;
651 DC += NC;
652 }
653
654 ipass += NC;
655 NCompLeft -= NC;
656 }
657
658#endif /*BL_USE_MPI*/
659
660#ifndef AMREX_USE_GPU
661 amrex::ignore_unused(deterministic);
662#endif
663}
664
665template <class FAB>
666void
668{
669
670#ifdef BL_USE_MPI
671
672 BL_PROFILE("FabArray::ParallelCopy_finish()");
674
675 if (!pcd) { return; }
676
677 const CPC* thecpc = pcd->cpc;
678
679 const auto N_snds = static_cast<int>(thecpc->m_SndTags->size());
680 const auto N_rcvs = static_cast<int>(thecpc->m_RcvTags->size());
681
682 if (N_rcvs > 0)
683 {
684 Vector<const CopyComTagsContainer*> recv_cctc(N_rcvs,nullptr);
685 for (int k = 0; k < N_rcvs; ++k)
686 {
687 if (pcd->recv_size[k] > 0)
688 {
689 auto const& cctc = thecpc->m_RcvTags->at(pcd->recv_from[k]);
690 recv_cctc[k] = &cctc;
691 }
692 }
693
694 if (pcd->actual_n_rcvs > 0) {
695 Vector<MPI_Status> stats(N_rcvs);
696 ParallelDescriptor::Waitall(pcd->recv_reqs, stats);
697#ifdef AMREX_DEBUG
698 if (!CheckRcvStats(stats, pcd->recv_size, pcd->tag))
699 {
700 amrex::Abort("ParallelCopy failed with wrong message size");
701 }
702#endif
703 }
704
705 bool is_thread_safe = thecpc->m_threadsafe_rcv;
706
707#ifdef AMREX_USE_GPU
709 {
710 unpack_recv_buffer_gpu(*this, pcd->DC, pcd->NC, pcd->recv_data, pcd->recv_size,
711 recv_cctc, pcd->op, is_thread_safe, thecpc->m_id, false);
712 }
713 else
714#endif
715 {
716 unpack_recv_buffer_cpu(*this, pcd->DC, pcd->NC, pcd->recv_data, pcd->recv_size,
717 recv_cctc, pcd->op, is_thread_safe);
718 }
719
720 if (pcd->the_recv_data)
721 {
722 amrex::The_Comms_Arena()->free(pcd->the_recv_data);
723 pcd->the_recv_data = nullptr;
724 }
725 }
726
727 if (N_snds > 0) {
728 if (! thecpc->m_SndTags->empty()) {
729 Vector<MPI_Status> stats(pcd->send_reqs.size());
730 ParallelDescriptor::Waitall(pcd->send_reqs, stats);
731 }
732 amrex::The_Comms_Arena()->free(pcd->the_send_data);
733 pcd->the_send_data = nullptr;
734 }
735
736 pcd.reset();
737
738#endif /*BL_USE_MPI*/
739}
740
741template <class FAB>
742void
743FabArray<FAB>::copyTo (FAB& dest, int scomp, int dcomp, int ncomp, int nghost) const
744{
745 BL_PROFILE("FabArray::copy(fab)");
746
747 BL_ASSERT(dcomp + ncomp <= dest.nComp());
748 BL_ASSERT(IntVect(nghost).allLE(nGrowVect()));
749
750 int root_proc = this->DistributionMap()[0];
751
752 BoxArray ba(dest.box());
753 DistributionMapping dm(Vector<int>{root_proc});
754 FabArray<FAB> destmf(ba, dm, ncomp, 0, MFInfo().SetAlloc(false));
755 if (ParallelDescriptor::MyProc() == root_proc) {
756 destmf.setFab(0, FAB(dest, amrex::make_alias, dcomp, ncomp));
757 }
758
759 destmf.ParallelCopy(*this, scomp, 0, ncomp, nghost, 0);
760
761#ifdef BL_USE_MPI
762 using T = typename FAB::value_type;
763 if (ParallelContext::NProcsSub() > 1) {
764 Long count = dest.numPts()*ncomp;
765 T* const p0 = dest.dataPtr(dcomp);
766 T* pb = p0;
767#ifdef AMREX_USE_GPU
768 if (dest.arena()->isDevice()) {
769 pb = (T*)The_Pinned_Arena()->alloc(sizeof(T)*count);
770 Gpu::dtoh_memcpy_async(pb, p0, sizeof(T)*count);
772 }
773#endif
776#ifdef AMREX_USE_GPU
777 if (pb != p0) {
778 Gpu::htod_memcpy_async(p0, pb, sizeof(T)*count);
780 }
781#endif
782 }
783#endif
784}
785
786#ifdef BL_USE_MPI
787template <class FAB>
788template <typename BUF>
790FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,
791 Vector<char*>& send_data,
792 Vector<std::size_t>& send_size,
793 Vector<int>& send_rank,
794 Vector<MPI_Request>& send_reqs,
796 int ncomp)
797{
798 char* pointer = nullptr;
799 PrepareSendBuffers<BUF>(SndTags, pointer, send_data, send_size, send_rank, send_reqs, send_cctc, ncomp);
800 return TheFaArenaPointer(pointer);
801}
802
803template <class FAB>
804template <typename BUF>
805void
806FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,
807 char*& the_send_data,
808 Vector<char*>& send_data,
809 Vector<std::size_t>& send_size,
810 Vector<int>& send_rank,
811 Vector<MPI_Request>& send_reqs,
812 Vector<const CopyComTagsContainer*>& send_cctc,
813 int ncomp)
814{
815 send_data.clear();
816 send_size.clear();
817 send_rank.clear();
818 send_reqs.clear();
819 send_cctc.clear();
820 const auto N_snds = SndTags.size();
821 if (N_snds == 0) { return; }
822 send_data.reserve(N_snds);
823 send_size.reserve(N_snds);
824 send_rank.reserve(N_snds);
825 send_reqs.reserve(N_snds);
826 send_cctc.reserve(N_snds);
827
828 Vector<std::size_t> offset; offset.reserve(N_snds);
829 std::size_t total_volume = 0;
830 for (auto const& kv : SndTags)
831 {
832 auto const& cctc = kv.second;
833
834 std::size_t nbytes = 0;
835 for (auto const& cct : kv.second)
836 {
837 nbytes += cct.sbox.numPts() * ncomp * sizeof(BUF);
838 }
839
840 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
841 nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned
842
843 // Also need to align the offset properly
844 total_volume = amrex::aligned_size(std::max(alignof(BUF), acd),
845 total_volume);
846
847 offset.push_back(total_volume);
848 total_volume += nbytes;
849
850 send_data.push_back(nullptr);
851 send_size.push_back(nbytes);
852 send_rank.push_back(kv.first);
853 send_reqs.push_back(MPI_REQUEST_NULL);
854 send_cctc.push_back(&cctc);
855 }
856
857 if (total_volume > 0)
858 {
859 the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume));
860 for (int i = 0, N = static_cast<int>(send_size.size()); i < N; ++i) {
861 send_data[i] = the_send_data + offset[i];
862 }
863 } else {
864 the_send_data = nullptr;
865 }
866}
867
868template <class FAB>
869void
870FabArray<FAB>::PostSnds (Vector<char*> const& send_data,
871 Vector<std::size_t> const& send_size,
872 Vector<int> const& send_rank,
873 Vector<MPI_Request>& send_reqs,
874 int SeqNum)
875{
877
878 const auto N_snds = static_cast<int>(send_reqs.size());
879 for (int j = 0; j < N_snds; ++j)
880 {
881 if (send_size[j] > 0) {
882 const int rank = ParallelContext::global_to_local_rank(send_rank[j]);
883 send_reqs[j] = ParallelDescriptor::Asend
884 (send_data[j], send_size[j], rank, SeqNum, comm).req();
885 }
886 }
887}
888
889template <class FAB>
890template <typename BUF>
891TheFaArenaPointer FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,
892 Vector<char*>& recv_data,
893 Vector<std::size_t>& recv_size,
894 Vector<int>& recv_from,
895 Vector<MPI_Request>& recv_reqs,
896 int ncomp,
897 int SeqNum)
898{
899 char* pointer = nullptr;
900 PostRcvs(RcvTags, pointer, recv_data, recv_size, recv_from, recv_reqs, ncomp, SeqNum);
901 return TheFaArenaPointer(pointer);
902}
903
904template <class FAB>
905template <typename BUF>
906void
907FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,
908 char*& the_recv_data,
909 Vector<char*>& recv_data,
910 Vector<std::size_t>& recv_size,
911 Vector<int>& recv_from,
912 Vector<MPI_Request>& recv_reqs,
913 int ncomp,
914 int SeqNum)
915{
916 recv_data.clear();
917 recv_size.clear();
918 recv_from.clear();
919 recv_reqs.clear();
920
921 Vector<std::size_t> offset;
922 std::size_t TotalRcvsVolume = 0;
923 for (const auto& kv : RcvTags) // loop over senders
924 {
925 std::size_t nbytes = 0;
926 for (auto const& cct : kv.second)
927 {
928 nbytes += cct.dbox.numPts() * ncomp * sizeof(BUF);
929 }
930
931 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
932 nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned
933
934 // Also need to align the offset properly
935 TotalRcvsVolume = amrex::aligned_size(std::max(alignof(BUF),acd),
936 TotalRcvsVolume);
937
938 offset.push_back(TotalRcvsVolume);
939 TotalRcvsVolume += nbytes;
940
941 recv_data.push_back(nullptr);
942 recv_size.push_back(nbytes);
943 recv_from.push_back(kv.first);
944 recv_reqs.push_back(MPI_REQUEST_NULL);
945 }
946
947 const auto nrecv = static_cast<int>(recv_from.size());
948
950
951 if (TotalRcvsVolume == 0)
952 {
953 the_recv_data = nullptr;
954 }
955 else
956 {
957 the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
958
959 for (int i = 0; i < nrecv; ++i)
960 {
961 recv_data[i] = the_recv_data + offset[i];
962 if (recv_size[i] > 0)
963 {
964 const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
965 recv_reqs[i] = ParallelDescriptor::Arecv
966 (recv_data[i], recv_size[i], rank, SeqNum, comm).req();
967 }
968 }
969 }
970}
971#endif
972
973template <class FAB>
974void
976 int scomp,
977 int dcomp,
978 int ncomp,
979 const IntVect& nghost)
982 "FabArray::Redistribute: must have the same BoxArray");
983
985 {
986 Copy(*this, src, scomp, dcomp, ncomp, nghost);
987 return;
988 }
989
990#ifdef BL_USE_MPI
991
993
994 ParallelCopy(src, scomp, dcomp, ncomp, nghost, nghost, Periodicity::NonPeriodic(),
995 FabArrayBase::COPY, &cpc);
996
997#endif
998}
999
1000template <class FAB>
1001void
1003{
1004#if defined(AMREX_USE_MPI) && !defined(AMREX_DEBUG)
1005 // We only test if no DEBUG because in DEBUG we check the status later.
1006 // If Test is done here, the status check will fail.
1007 int flag;
1008 ParallelDescriptor::Test(fbd->recv_reqs, flag, fbd->recv_stat);
1009#endif
1010}
1011
1013namespace detail {
1014template <class TagT>
1015void fbv_copy (Vector<TagT> const& tags)
1016{
1017 const int N = tags.size();
1018 if (N == 0) { return; }
1019#ifdef AMREX_USE_GPU
1020 if (Gpu::inLaunchRegion()) {
1021 ParallelFor(tags, 1,
1022 [=] AMREX_GPU_DEVICE (int i, int j, int k, int, TagT const& tag) noexcept
1024 const int ncomp = tag.dfab.nComp();
1025 for (int n = 0; n < ncomp; ++n) {
1026 tag.dfab(i,j,k,n) = tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n);
1027 }
1028 });
1029 } else
1030#endif
1031 {
1032#ifdef AMREX_USE_OMP
1033#pragma omp parallel for
1034#endif
1035 for (int itag = 0; itag < N; ++itag) {
1036 auto const& tag = tags[itag];
1037 const int ncomp = tag.dfab.nComp();
1038 AMREX_LOOP_4D(tag.dbox, ncomp, i, j, k, n,
1039 {
1040 tag.dfab(i,j,k,n) = tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n);
1041 });
1042 }
1043 }
1044}
1045}
1047
1048template <class MF>
1049std::enable_if_t<IsFabArray<MF>::value>
1050FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,
1051 Vector<int> const& ncomp, Vector<IntVect> const& nghost,
1052 Vector<Periodicity> const& period, Vector<int> const& cross = {})
1054 BL_PROFILE("FillBoundary(Vector)");
1055#if 1
1056 const int N = mf.size();
1057 for (int i = 0; i < N; ++i) {
1058 mf[i]->FillBoundary_nowait(scomp[i], ncomp[i], nghost[i], period[i],
1059 cross.empty() ? 0 : cross[i]);
1060 }
1061 for (int i = 0; i < N; ++i) {
1062 mf[i]->FillBoundary_finish();
1063 }
1064
1065#else
1066 using FAB = typename MF::FABType::value_type;
1067 using T = typename FAB::value_type;
1068
1069 const int nmfs = mf.size();
1071 int N_locs = 0;
1072 int N_rcvs = 0;
1073 int N_snds = 0;
1074 for (int imf = 0; imf < nmfs; ++imf) {
1075 if (nghost[imf].max() > 0) {
1076 auto const& TheFB = mf[imf]->getFB(nghost[imf], period[imf],
1077 cross.empty() ? 0 : cross[imf]);
1078 // The FB is cached. Therefore it's safe take its address for later use.
1079 cmds.push_back(static_cast<FabArrayBase::CommMetaData const*>(&TheFB));
1080 N_locs += TheFB.m_LocTags->size();
1081 N_rcvs += TheFB.m_RcvTags->size();
1082 N_snds += TheFB.m_SndTags->size();
1083 } else {
1084 cmds.push_back(nullptr);
1085 }
1086 }
1087
1088 using TagT = Array4CopyTag<T>;
1089 Vector<TagT> local_tags;
1090 local_tags.reserve(N_locs);
1091 static_assert(amrex::IsStoreAtomic<T>::value, "FillBoundary(Vector): storing T is not atomic");
1092 for (int imf = 0; imf < nmfs; ++imf) {
1093 if (cmds[imf]) {
1094 auto const& tags = *(cmds[imf]->m_LocTags);
1095 for (auto const& tag : tags) {
1096 local_tags.push_back({(*mf[imf])[tag.dstIndex].array (scomp[imf],ncomp[imf]),
1097 (*mf[imf])[tag.srcIndex].const_array(scomp[imf],ncomp[imf]),
1098 tag.dbox,
1099 (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
1101 }
1102 }
1103
1104 if (ParallelContext::NProcsSub() == 1) {
1105 detail::fbv_copy(local_tags);
1106 return;
1107 }
1108
1109#ifdef AMREX_USE_MPI
1110 //
1111 // Do this before prematurely exiting if running in parallel.
1112 // Otherwise sequence numbers will not match across MPI processes.
1113 //
1116
1117 if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) { return; } // No work to do
1118
1119 char* the_recv_data = nullptr;
1120 Vector<int> recv_from;
1121 Vector<std::size_t> recv_size;
1122 Vector<MPI_Request> recv_reqs;
1123 Vector<MPI_Status> recv_stat;
1124 Vector<TagT> recv_tags;
1125
1126 if (N_rcvs > 0) {
1127
1128 for (int imf = 0; imf < nmfs; ++imf) {
1129 if (cmds[imf]) {
1130 auto const& tags = *(cmds[imf]->m_RcvTags);
1131 for (const auto& kv : tags) {
1132 recv_from.push_back(kv.first);
1133 }
1134 }
1135 }
1136 amrex::RemoveDuplicates(recv_from);
1137 const int nrecv = recv_from.size();
1138
1139 recv_reqs.resize(nrecv, MPI_REQUEST_NULL);
1140 recv_stat.resize(nrecv);
1141
1142 recv_tags.reserve(N_rcvs);
1143
1144 Vector<Vector<std::size_t> > recv_offset(nrecv);
1145 Vector<std::size_t> offset;
1146 recv_size.reserve(nrecv);
1147 offset.reserve(nrecv);
1148 std::size_t TotalRcvsVolume = 0;
1149 for (int i = 0; i < nrecv; ++i) {
1150 std::size_t nbytes = 0;
1151 for (int imf = 0; imf < nmfs; ++imf) {
1152 if (cmds[imf]) {
1153 auto const& tags = *(cmds[imf]->m_RcvTags);
1154 auto it = tags.find(recv_from[i]);
1155 if (it != tags.end()) {
1156 for (auto const& cct : it->second) {
1157 auto& dfab = (*mf[imf])[cct.dstIndex];
1158 recv_offset[i].push_back(nbytes);
1159 recv_tags.push_back({dfab.array(scomp[imf],ncomp[imf]),
1160 makeArray4<T const>(nullptr,cct.dbox,ncomp[imf]),
1161 cct.dbox, Dim3{0,0,0}});
1162 nbytes += dfab.nBytes(cct.dbox,ncomp[imf]);
1164 }
1166 }
1167
1168 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
1169 nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned
1170
1171 // Also need to align the offset properly
1172 TotalRcvsVolume = amrex::aligned_size(std::max(alignof(T),acd), TotalRcvsVolume);
1173
1174 offset.push_back(TotalRcvsVolume);
1175 TotalRcvsVolume += nbytes;
1176
1177 recv_size.push_back(nbytes);
1178 }
1179
1180 the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
1181
1182 int k = 0;
1183 for (int i = 0; i < nrecv; ++i) {
1184 char* p = the_recv_data + offset[i];
1185 const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
1186 recv_reqs[i] = ParallelDescriptor::Arecv
1187 (p, recv_size[i], rank, SeqNum, comm).req();
1188 for (int j = 0, nj = recv_offset[i].size(); j < nj; ++j) {
1189 recv_tags[k++].sfab.p = (T const*)(p + recv_offset[i][j]);
1190 }
1191 }
1192 }
1193
1194 char* the_send_data = nullptr;
1195 Vector<int> send_rank;
1196 Vector<char*> send_data;
1197 Vector<std::size_t> send_size;
1198 Vector<MPI_Request> send_reqs;
1199 if (N_snds > 0) {
1200 for (int imf = 0; imf < nmfs; ++imf) {
1201 if (cmds[imf]) {
1202 auto const& tags = *(cmds[imf]->m_SndTags);
1203 for (auto const& kv : tags) {
1204 send_rank.push_back(kv.first);
1205 }
1206 }
1207 }
1208 amrex::RemoveDuplicates(send_rank);
1209 const int nsend = send_rank.size();
1210
1211 send_data.resize(nsend, nullptr);
1212 send_reqs.resize(nsend, MPI_REQUEST_NULL);
1213
1214 Vector<TagT> send_tags;
1215 send_tags.reserve(N_snds);
1216
1217 Vector<Vector<std::size_t> > send_offset(nsend);
1218 Vector<std::size_t> offset;
1219 send_size.reserve(nsend);
1220 offset.reserve(nsend);
1221 std::size_t TotalSndsVolume = 0;
1222 for (int i = 0; i < nsend; ++i) {
1223 std::size_t nbytes = 0;
1224 for (int imf = 0; imf < nmfs; ++imf) {
1225 if (cmds[imf]) {
1226 auto const& tags = *(cmds[imf]->m_SndTags);
1227 auto it = tags.find(send_rank[i]);
1228 if (it != tags.end()) {
1229 for (auto const& cct : it->second) {
1230 auto const& sfab = (*mf[imf])[cct.srcIndex];
1231 send_offset[i].push_back(nbytes);
1232 send_tags.push_back({amrex::makeArray4<T>(nullptr,cct.sbox,ncomp[imf]),
1233 sfab.const_array(scomp[imf],ncomp[imf]),
1234 cct.sbox, Dim3{0,0,0}});
1235 nbytes += sfab.nBytes(cct.sbox,ncomp[imf]);
1236 }
1237 }
1238 }
1239 }
1240
1241 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
1242 nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned
1243
1244 // Also need to align the offset properly
1245 TotalSndsVolume = amrex::aligned_size(std::max(alignof(T),acd), TotalSndsVolume);
1246
1247 offset.push_back(TotalSndsVolume);
1248 TotalSndsVolume += nbytes;
1249
1250 send_size.push_back(nbytes);
1251 }
1252
1253 the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalSndsVolume));
1254 int k = 0;
1255 for (int i = 0; i < nsend; ++i) {
1256 send_data[i] = the_send_data + offset[i];
1257 for (int j = 0, nj = send_offset[i].size(); j < nj; ++j) {
1258 send_tags[k++].dfab.p = (T*)(send_data[i] + send_offset[i][j]);
1259 }
1260 }
1261
1262 detail::fbv_copy(send_tags);
1263
1264 FabArray<FAB>::PostSnds(send_data, send_size, send_rank, send_reqs, SeqNum);
1265 }
1266
1267#if !defined(AMREX_DEBUG)
1268 int recv_flag;
1269 ParallelDescriptor::Test(recv_reqs, recv_flag, recv_stat);
1270#endif
1271
1272 if (N_locs > 0) {
1273 detail::fbv_copy(local_tags);
1274#if !defined(AMREX_DEBUG)
1275 ParallelDescriptor::Test(recv_reqs, recv_flag, recv_stat);
1276#endif
1277 }
1278
1279 if (N_rcvs > 0) {
1280 ParallelDescriptor::Waitall(recv_reqs, recv_stat);
1281#ifdef AMREX_DEBUG
1282 if (!FabArrayBase::CheckRcvStats(recv_stat, recv_size, SeqNum)) {
1283 amrex::Abort("FillBoundary(vector) failed with wrong message size");
1284 }
1285#endif
1286
1287 detail::fbv_copy(recv_tags);
1288
1289 amrex::The_Comms_Arena()->free(the_recv_data);
1290 }
1291
1292 if (N_snds > 0) {
1293 Vector<MPI_Status> stats(send_reqs.size());
1294 ParallelDescriptor::Waitall(send_reqs, stats);
1295 amrex::The_Comms_Arena()->free(the_send_data);
1296 }
1297
1298#endif // #ifdef AMREX_USE_MPI
1299#endif // #if 1 #else
1300}
1301
1302template <class MF>
1303std::enable_if_t<IsFabArray<MF>::value>
1305{
1306 Vector<int> scomp(mf.size(), 0);
1307 Vector<int> ncomp;
1308 Vector<IntVect> nghost;
1309 Vector<Periodicity> period(mf.size(), a_period);
1310 ncomp.reserve(mf.size());
1311 nghost.reserve(mf.size());
1312 for (auto const& x : mf) {
1313 ncomp.push_back(x->nComp());
1314 nghost.push_back(x->nGrowVect());
1315 }
1316 FillBoundary(mf, scomp, ncomp, nghost, period);
1317}
1318
1319}
#define BL_PROFILE(a)
Definition AMReX_BLProfiler.H:551
#define BL_PROFILE_SYNC_STOP()
Definition AMReX_BLProfiler.H:645
#define BL_PROFILE_SYNC_START_TIMED(fname)
Definition AMReX_BLProfiler.H:644
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define BL_ASSERT(EX)
Definition AMReX_BLassert.H:39
#define AMREX_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:37
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_NODISCARD
Definition AMReX_Extension.H:251
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
#define AMREX_LOOP_4D(bx, ncomp, i, j, k, n, block)
Definition AMReX_Loop.nolint.H:16
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
A collection of Boxes stored in an Array.
Definition AMReX_BoxArray.H:567
IndexType ixType() const noexcept
Return index type of this BoxArray.
Definition AMReX_BoxArray.H:857
Calculates the distribution of FABs to MPI processes.
Definition AMReX_DistributionMapping.H:43
IntVect nGrowVect() const noexcept
Definition AMReX_FabArrayBase.H:80
int size() const noexcept
Return the number of FABs in the FabArray.
Definition AMReX_FabArrayBase.H:110
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition AMReX_FabArrayBase.H:131
bool empty() const noexcept
Definition AMReX_FabArrayBase.H:89
CpOp
parallel copy or add
Definition AMReX_FabArrayBase.H:394
@ ADD
Definition AMReX_FabArrayBase.H:394
@ COPY
Definition AMReX_FabArrayBase.H:394
Box box(int K) const noexcept
Return the Kth Box in the BoxArray. That is, the valid region of the Kth grid.
Definition AMReX_FabArrayBase.H:101
DistributionMapping distributionMap
Definition AMReX_FabArrayBase.H:445
static int MaxComp
The maximum number of components to copy() at a time.
Definition AMReX_FabArrayBase.H:292
BoxArray boxarray
Definition AMReX_FabArrayBase.H:444
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition AMReX_FabArrayBase.H:95
An Array of FortranArrayBox(FAB)-like Objects.
Definition AMReX_FabArray.H:347
void ParallelCopyToGhost_finish()
Definition AMReX_FabArrayCommI.H:381
void FBEP_nowait(int scomp, int ncomp, const IntVect &nghost, const Periodicity &period, bool cross, bool enforce_periodicity_only=false, bool override_sync=false, IntVect const &sumboundary_src_nghost=IntVect(-1), bool deterministic=false)
Definition AMReX_FabArrayCommI.H:10
Array4< typename FabArray< FAB >::value_type const > const_array(const MFIter &mfi) const noexcept
Definition AMReX_FabArray.H:587
void ParallelCopy(const FabArray< FAB > &src, const Periodicity &period=Periodicity::NonPeriodic(), CpOp op=FabArrayBase::COPY)
Definition AMReX_FabArray.H:847
void Redistribute(const FabArray< FAB > &src, int scomp, int dcomp, int ncomp, const IntVect &nghost)
Copy from src to this. this and src have the same BoxArray, but different DistributionMapping.
Definition AMReX_FabArrayCommI.H:975
void ParallelCopyToGhost(const FabArray< FAB > &src, int scomp, int dcomp, int ncomp, const IntVect &snghost, const IntVect &dnghost, const Periodicity &period=Periodicity::NonPeriodic())
Definition AMReX_FabArrayCommI.H:350
void ParallelCopy_finish()
Definition AMReX_FabArrayCommI.H:667
void ParallelAdd(const FabArray< FAB > &src, const Periodicity &period=Periodicity::NonPeriodic())
This function copies data from src to this FabArray. Each FAB in fa is intersected with all FABs in t...
Definition AMReX_FabArray.H:844
void ParallelCopyToGhost_nowait(const FabArray< FAB > &src, int scomp, int dcomp, int ncomp, const IntVect &snghost, const IntVect &dnghost, const Periodicity &period=Periodicity::NonPeriodic())
Definition AMReX_FabArrayCommI.H:367
void FillBoundary_test()
Definition AMReX_FabArrayCommI.H:1002
void copyTo(FAB &dest, int nghost=0) const
Copy the values contained in the intersection of the valid + nghost region of this FabArray with the ...
Definition AMReX_FabArray.H:2644
void ParallelCopy_nowait(const FabArray< FAB > &src, const Periodicity &period=Periodicity::NonPeriodic(), CpOp op=FabArrayBase::COPY)
Definition AMReX_FabArray.H:861
void FillBoundary_finish()
Definition AMReX_FabArrayCommI.H:219
__host__ __device__ bool cellCentered() const noexcept
True if the IndexTypeND is CELL based in all directions.
Definition AMReX_IndexType.H:104
__host__ __device__ bool allGE(const IntVectND< dim > &rhs) const noexcept
Returns true if this is greater than or equal to argument for all components. NOTE: This is NOT a str...
Definition AMReX_IntVect.H:451
__host__ __device__ int max() const noexcept
maximum (no absolute values) value
Definition AMReX_IntVect.H:222
__host__ static __device__ constexpr IntVectND< dim > TheZeroVector() noexcept
This static member function returns a reference to a constant IntVectND object, all of whose dim argu...
Definition AMReX_IntVect.H:680
MPI_Request req() const
Definition AMReX_ParallelDescriptor.H:74
This provides length of period for periodic domains. 0 means it is not periodic in that direction....
Definition AMReX_Periodicity.H:17
static const Periodicity & NonPeriodic() noexcept
Definition AMReX_Periodicity.cpp:52
bool isAnyPeriodic() const noexcept
Definition AMReX_Periodicity.H:22
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Long size() const noexcept
Definition AMReX_Vector.H:53
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1280
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:126
int NProcs() noexcept
Definition AMReX_ParallelDescriptor.H:246
bool inGraphRegion()
Definition AMReX_GpuControl.H:121
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:315
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:92
bool inNoSyncRegion() noexcept
Definition AMReX_GpuControl.H:152
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:301
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int global_to_local_rank(int rank) noexcept
Definition AMReX_ParallelContext.H:98
int NProcsSub() noexcept
number of ranks in current frame
Definition AMReX_ParallelContext.H:74
void Test(MPI_Request &, int &, MPI_Status &)
Definition AMReX_ParallelDescriptor.cpp:1216
Message Asend(const T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1164
void Waitall(Vector< MPI_Request > &, Vector< MPI_Status > &)
Definition AMReX_ParallelDescriptor.cpp:1304
void Bcast(void *, int, MPI_Datatype, int, MPI_Comm)
Definition AMReX_ParallelDescriptor.cpp:1291
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition AMReX_ParallelDescriptor.H:688
Message Arecv(T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1206
int MPI_Comm
Definition AMReX_ccse-mpi.H:51
static constexpr int MPI_REQUEST_NULL
Definition AMReX_ccse-mpi.H:57
Definition AMReX_Amr.cpp:49
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:319
@ make_alias
Definition AMReX_MakeType.H:7
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:138
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
std::unique_ptr< char, TheFaArenaDeleter > TheFaArenaPointer
Definition AMReX_FabArray.H:104
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2869
IntVect nGrowVect(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2859
std::enable_if_t< IsFabArray< MF >::value > FillBoundary(Vector< MF * > const &mf, Vector< int > const &scomp, Vector< int > const &ncomp, Vector< IntVect > const &nghost, Vector< Periodicity > const &period, Vector< int > const &cross={})
Definition AMReX_FabArrayCommI.H:1050
void Copy(FabArray< DFAB > &dst, FabArray< SFAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition AMReX_FabArray.H:180
double second() noexcept
Definition AMReX_Utility.cpp:940
Arena * The_Comms_Arena()
Definition AMReX_Arena.cpp:843
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:35
void Add(FabArray< FAB > &dst, FabArray< FAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition AMReX_FabArray.H:241
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:823
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
std::size_t aligned_size(std::size_t align_requirement, std::size_t size) noexcept
Given a minimum required size of size bytes, this returns the next largest arena size that will align...
Definition AMReX_Arena.H:30
void ParallelCopy(MF &dst, MF const &src, int scomp, int dcomp, int ncomp, IntVect const &ng_src=IntVect(0), IntVect const &ng_dst=IntVect(0), Periodicity const &period=Periodicity::NonPeriodic())
dst = src w/ MPI communication
Definition AMReX_FabArrayUtility.H:2019
void RemoveDuplicates(Vector< T > &vec)
Definition AMReX_Vector.H:211
BoxArray const & boxArray(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2864
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Definition AMReX_Array4.H:312
parallel copy or add
Definition AMReX_FabArrayBase.H:538
std::uint64_t m_id
Definition AMReX_FabArrayBase.H:553
Definition AMReX_FabArrayBase.H:472
bool m_threadsafe_rcv
Definition AMReX_FabArrayBase.H:475
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition AMReX_FabArrayBase.H:478
std::unique_ptr< MapOfCopyComTagContainers > m_SndTags
Definition AMReX_FabArrayBase.H:477
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition AMReX_FabArrayBase.H:476
FillBoundary.
Definition AMReX_FabArrayBase.H:488
std::uint64_t m_id
Definition AMReX_FabArrayBase.H:494
IntVect m_sb_snghost
Definition AMReX_FabArrayBase.H:498
Definition AMReX_TypeTraits.H:66
Definition AMReX_TypeTraits.H:280
FabArray memory allocation information.
Definition AMReX_FabArray.H:66