Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_FabArrayCommI.H
Go to the documentation of this file.
1
2#include <AMReX_FBI.H>
3#include <AMReX_PCI.H>
4
5namespace amrex {
6
7template <class FAB>
8template <typename BUF, class F>
9requires (BaseFabType<F>)
10void
11FabArray<FAB>::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost,
12 const Periodicity& period, bool cross,
13 bool enforce_periodicity_only,
14 bool override_sync,
15 IntVect const& sumboundary_src_nghost,
16 bool deterministic) // so far the deterministic is for sumboundary only
17{
18 BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: FB");
19 BL_PROFILE("FillBoundary_nowait()");
20
21 AMREX_ASSERT_WITH_MESSAGE(!fbd, "FillBoundary_nowait() called when comm operation already in progress.");
22 AMREX_ASSERT(!enforce_periodicity_only || !override_sync);
23
24 bool sumboundary = sumboundary_src_nghost.allGE(0);
25
26 bool work_to_do;
27 if (enforce_periodicity_only) {
28 work_to_do = period.isAnyPeriodic();
29 } else if (override_sync) {
30 work_to_do = (nghost.max() > 0) || !is_cell_centered();
31 } else if (sumboundary) {
32 work_to_do = true;
33 } else {
34 work_to_do = nghost.max() > 0;
35 }
36 if (!work_to_do) { return; }
37
38 const FB& TheFB = getFB(nghost, period, cross, enforce_periodicity_only, override_sync, sumboundary_src_nghost);
39
41 {
42 //
43 // There can only be local work to do.
44 //
45 int N_locs = (*TheFB.m_LocTags).size();
46 if (N_locs == 0) { return; }
47#ifdef AMREX_USE_GPU
49 {
50#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
51 if (Gpu::inGraphRegion() && !sumboundary)
52 {
53 FB_local_copy_cuda_graph_1(TheFB, scomp, ncomp);
54 }
55 else
56#endif
57 {
58 if (sumboundary) {
60 FB_local_add_gpu(TheFB, scomp, ncomp, deterministic);
61 } else {
62 amrex::Abort("SumBoundary requires operator+=");
63 }
64 } else {
65 FB_local_copy_gpu(TheFB, scomp, ncomp);
66 if (!Gpu::inNoSyncRegion()) {
68 }
69 }
70 }
71 }
72 else
73#endif
74 {
75 if (sumboundary) {
77 FB_local_add_cpu(TheFB, scomp, ncomp);
78 } else {
79 amrex::Abort("SumBoundary requires operator+=");
80 }
81 } else {
82 FB_local_copy_cpu(TheFB, scomp, ncomp);
83 }
84 }
85
86 return;
87 }
88
89#ifdef BL_USE_MPI
90
91 //
92 // Do this before prematurely exiting if running in parallel.
93 // Otherwise sequence numbers will not match across MPI processes.
94 //
95 int SeqNum = ParallelDescriptor::SeqNum();
96
97 const int N_locs = TheFB.m_LocTags->size();
98 const int N_rcvs = TheFB.m_RcvTags->size();
99 const int N_snds = TheFB.m_SndTags->size();
100
101 if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) {
102 // No work to do.
103 return;
104 }
105
106 fbd = std::make_unique<FBData<FAB>>();
107 fbd->fb = &TheFB;
108 fbd->scomp = scomp;
109 fbd->ncomp = ncomp;
110 fbd->tag = SeqNum;
111 fbd->deterministic = deterministic;
112
113 //
114 // Post rcvs. Allocate one chunk of space to hold'm all.
115 //
116
117 if (N_rcvs > 0) {
118 PostRcvs<BUF>(*TheFB.m_RcvTags, fbd->the_recv_data,
119 fbd->recv_data, fbd->recv_size, fbd->recv_from, fbd->recv_reqs,
120 ncomp, SeqNum);
121 fbd->recv_stat.resize(N_rcvs);
122 }
123
124 //
125 // Post send's
126 //
127 char*& the_send_data = fbd->the_send_data;
128 Vector<char*> & send_data = fbd->send_data;
129 Vector<std::size_t> send_size;
130 Vector<int> send_rank;
131 Vector<MPI_Request>& send_reqs = fbd->send_reqs;
133
134 if (N_snds > 0)
135 {
136 PrepareSendBuffers<BUF>(*TheFB.m_SndTags, the_send_data, send_data, send_size, send_rank,
137 send_reqs, send_cctc, ncomp);
138
139#ifdef AMREX_USE_GPU
141 {
142#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
143 if (Gpu::inGraphRegion()) {
144 FB_pack_send_buffer_cuda_graph(TheFB, scomp, ncomp, send_data, send_size, send_cctc);
145 }
146 else
147#endif
148 {
149 pack_send_buffer_gpu<BUF>(*this, scomp, ncomp, send_data, send_size, send_cctc, TheFB.m_id);
150 }
151 }
152 else
153#endif
154 {
155 pack_send_buffer_cpu<BUF>(*this, scomp, ncomp, send_data, send_size, send_cctc);
156 }
157
158 AMREX_ASSERT(send_reqs.size() == N_snds);
159 PostSnds(send_data, send_size, send_rank, send_reqs, SeqNum);
160 }
161
162 FillBoundary_test();
163
164 //
165 // Do the local work. Hope for a bit of communication/computation overlap.
166 //
167 if (N_locs > 0)
168 {
169#ifdef AMREX_USE_GPU
171 {
172#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
173 if (Gpu::inGraphRegion() && !sumboundary) {
174 FB_local_copy_cuda_graph_n(TheFB, scomp, ncomp);
175 }
176 else
177#endif
178 {
179 if (sumboundary) {
181 FB_local_add_gpu(TheFB, scomp, ncomp, deterministic);
182 } else {
183 amrex::Abort("SumBoundary requires operator+=");
184 }
185 } else {
186 FB_local_copy_gpu(TheFB, scomp, ncomp);
187 if (!Gpu::inNoSyncRegion() && N_rcvs == 0) {
189 }
190 }
191 }
192 }
193 else
194#endif
195 {
196 if (sumboundary) {
198 FB_local_add_cpu(TheFB, scomp, ncomp);
199 } else {
200 amrex::Abort("SumBoundary requires operator+=");
201 }
202 } else {
203 FB_local_copy_cpu(TheFB, scomp, ncomp);
204 }
205 }
206
207 FillBoundary_test();
208 }
209
210#endif /*BL_USE_MPI*/
211
212#ifndef AMREX_USE_GPU
213 amrex::ignore_unused(deterministic);
214#endif
215}
216
217template <class FAB>
218template <typename BUF, class F>
219requires (BaseFabType<F>)
220void
222{
223#ifdef AMREX_USE_MPI
224
225 BL_PROFILE("FillBoundary_finish()");
227
228 if (!fbd) { n_filled = IntVect::TheZeroVector(); return; }
229
230 const FB* TheFB = fbd->fb;
231 bool sumboundary = TheFB->m_sb_snghost.allGE(0);
232 const auto N_rcvs = static_cast<int>(TheFB->m_RcvTags->size());
233 if (N_rcvs > 0)
234 {
235 Vector<const CopyComTagsContainer*> recv_cctc(N_rcvs,nullptr);
236 for (int k = 0; k < N_rcvs; k++)
237 {
238 if (fbd->recv_size[k] > 0)
239 {
240 auto const& cctc = TheFB->m_RcvTags->at(fbd->recv_from[k]);
241 recv_cctc[k] = &cctc;
242 }
243 }
244
245 int actual_n_rcvs = N_rcvs - std::count(fbd->recv_data.begin(), fbd->recv_data.end(), nullptr);
246
247 if (actual_n_rcvs > 0) {
248 ParallelDescriptor::Waitall(fbd->recv_reqs, fbd->recv_stat);
249#ifdef AMREX_DEBUG
250 if (!CheckRcvStats(fbd->recv_stat, fbd->recv_size, fbd->tag))
251 {
252 amrex::Abort("FillBoundary_finish failed with wrong message size");
253 }
254#endif
255 }
256
257 bool is_thread_safe = TheFB->m_threadsafe_rcv;
258 auto op = sumboundary ? FabArrayBase::ADD : FabArrayBase::COPY;
259
260#ifdef AMREX_USE_GPU
262 {
263#if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
264 if (Gpu::inGraphRegion() && !sumboundary)
265 {
266 FB_unpack_recv_buffer_cuda_graph(*TheFB, fbd->scomp, fbd->ncomp,
267 fbd->recv_data, fbd->recv_size,
268 recv_cctc, is_thread_safe);
269 }
270 else
271#endif
272 {
273 bool deterministic = fbd->deterministic;
274 unpack_recv_buffer_gpu<BUF>(*this, fbd->scomp, fbd->ncomp, fbd->recv_data, fbd->recv_size,
275 recv_cctc, op, is_thread_safe, TheFB->m_id, deterministic);
276 }
277 }
278 else
279#endif
280 {
281 unpack_recv_buffer_cpu<BUF>(*this, fbd->scomp, fbd->ncomp, fbd->recv_data, fbd->recv_size,
282 recv_cctc, op, is_thread_safe);
283 }
284
285 if (fbd->the_recv_data)
286 {
287 amrex::The_Comms_Arena()->free(fbd->the_recv_data);
288 fbd->the_recv_data = nullptr;
289 }
290 }
291
292 const auto N_snds = static_cast<int>(TheFB->m_SndTags->size());
293 if (N_snds > 0) {
294 Vector<MPI_Status> stats(fbd->send_reqs.size());
295 ParallelDescriptor::Waitall(fbd->send_reqs, stats);
296 amrex::The_Comms_Arena()->free(fbd->the_send_data);
297 fbd->the_send_data = nullptr;
298 }
299
300 fbd.reset();
301
302#endif
303}
304
305template <class FAB>
306void
308 int scomp,
309 int dcomp,
310 int ncomp,
311 const IntVect& snghost,
312 const IntVect& dnghost,
313 const Periodicity& period,
314 CpOp op,
315 const FabArrayBase::CPC * a_cpc,
316 bool deterministic)
317{
318 BL_PROFILE("FabArray::ParallelCopy()");
319
320 ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period, op, a_cpc,
321 false, deterministic);
322 ParallelCopy_finish();
323}
324
325template <class FAB>
326void
327FabArray<FAB>::ParallelCopy (const FabArray<FAB>& src, int src_comp, int dest_comp,
328 int num_comp, const IntVect& snghost, const IntVect& dnghost,
329 const IntVect& offset, const Periodicity& period)
330{
331 BL_PROFILE("FabArray::ParallelCopy()");
332
333 ParallelCopy_nowait(src,src_comp,dest_comp,num_comp,snghost,dnghost,offset,period);
334 ParallelCopy_finish();
335}
336
337template <class FAB>
338void
339FabArray<FAB>::ParallelAdd (const FabArray<FAB>& src, int src_comp, int dest_comp,
340 int num_comp, const IntVect& snghost, const IntVect& dnghost,
341 const IntVect& offset, const Periodicity& period)
342{
343 BL_PROFILE("FabArray::ParallelAdd()");
344
345 ParallelCopy_nowait(src,src_comp,dest_comp,num_comp,snghost,dnghost,offset,period,
347 ParallelCopy_finish();
348}
349
350template <class FAB>
351void
353 int scomp,
354 int dcomp,
355 int ncomp,
356 const IntVect& snghost,
357 const IntVect& dnghost,
358 const Periodicity& period)
359{
360 BL_PROFILE("FabArray::ParallelCopyToGhost()");
361
362 ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period,
363 FabArrayBase::COPY, nullptr, true);
364 ParallelCopy_finish();
365}
366
367template <class FAB>
368void
370 int scomp,
371 int dcomp,
372 int ncomp,
373 const IntVect& snghost,
374 const IntVect& dnghost,
375 const Periodicity& period)
376{
377 ParallelCopy_nowait(src, scomp, dcomp, ncomp, snghost, dnghost, period,
378 FabArrayBase::COPY, nullptr, true);
379}
380
381template <class FAB>
382void
384{
385 ParallelCopy_finish();
386}
387
388
389template <class FAB>
390void
392 int scomp,
393 int dcomp,
394 int ncomp,
395 const IntVect& snghost,
396 const IntVect& dnghost,
397 const Periodicity& period,
398 CpOp op,
399 const FabArrayBase::CPC * a_cpc,
400 bool to_ghost_cells_only,
401 bool deterministic)
402{
403 ParallelCopy_nowait(src,scomp,dcomp,ncomp,snghost,dnghost,IntVect(0),period,op,a_cpc,
404 to_ghost_cells_only, deterministic);
405}
406
407template <class FAB>
408void
410 int scomp,
411 int dcomp,
412 int ncomp,
413 const IntVect& snghost,
414 const IntVect& dnghost,
415 const IntVect& offset,
416 const Periodicity& period,
417 CpOp op,
418 const FabArrayBase::CPC * a_cpc,
419 bool to_ghost_cells_only,
420 bool deterministic)
421{
422 BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: PC");
423 BL_PROFILE("FabArray::ParallelCopy_nowait()");
424
425 AMREX_ASSERT_WITH_MESSAGE(!pcd, "ParallelCopy_nowait() called when comm operation already in progress.");
426
427 if (empty() || src.empty()) {
428 return;
429 }
430
432 BL_ASSERT(boxArray().ixType() == src.boxArray().ixType());
433 BL_ASSERT(src.nGrowVect().allGE(snghost));
434 BL_ASSERT( nGrowVect().allGE(dnghost));
435
436 n_filled = dnghost;
437
438 if ((ParallelDescriptor::NProcs() == 1) &&
439 (this->size() == 1) && (src.size() == 1) &&
440 !period.isAnyPeriodic() && !to_ghost_cells_only && (offset == 0))
441 {
442 if (this != &src) { // avoid self copy or plus
443 auto const& da = this->array(0, dcomp);
444 auto const& sa = src.const_array(0, scomp);
445 Box box = amrex::grow(src.box(0),snghost)
446 & amrex::grow(this->box(0),dnghost);
447 if (op == FabArrayBase::COPY) {
448#ifdef AMREX_USE_GPU
449 if (Gpu::inLaunchRegion()) {
450 ParallelFor(box, ncomp,
451 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) {
452 da(i,j,k,n) = sa(i,j,k,n);
453 });
454 if (!Gpu::inNoSyncRegion()) {
456 }
457 } else
458#endif
459 {
460 auto const& lo = amrex::lbound(box);
461 auto const& hi = amrex::ubound(box);
462#ifdef AMREX_USE_OMP
463#pragma omp parallel for collapse(3)
464#endif
465 for (int n = 0; n < ncomp; ++n) {
466 for (int k = lo.z; k <= hi.z; ++k) {
467 for (int j = lo.y; j <= hi.y; ++j) {
469 for (int i = lo.x; i <= hi.x; ++i) {
470 da(i,j,k,n) = sa(i,j,k,n);
471 }}}}
472 }
473 } else {
474#ifdef AMREX_USE_GPU
475 if (Gpu::inLaunchRegion()) {
476 ParallelFor(box, ncomp,
477 [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) {
478 da(i,j,k,n) += sa(i,j,k,n);
479 });
480 if (!Gpu::inNoSyncRegion()) {
482 }
483 } else
484#endif
485 {
486 auto const& lo = amrex::lbound(box);
487 auto const& hi = amrex::ubound(box);
488#ifdef AMREX_USE_OMP
489#pragma omp parallel for collapse(3)
490#endif
491 for (int n = 0; n < ncomp; ++n) {
492 for (int k = lo.z; k <= hi.z; ++k) {
493 for (int j = lo.y; j <= hi.y; ++j) {
495 for (int i = lo.x; i <= hi.x; ++i) {
496 da(i,j,k,n) += sa(i,j,k,n);
497 }}}}
498 }
499 }
500 }
501 return;
502 }
503
504 if ((src.boxArray().ixType().cellCentered() || op == FabArrayBase::COPY) &&
505 (boxarray == src.boxarray && distributionMap == src.distributionMap) &&
506 snghost == IntVect::TheZeroVector() &&
507 dnghost == IntVect::TheZeroVector() &&
508 !period.isAnyPeriodic() && !to_ghost_cells_only && (offset == 0))
509 {
510 //
511 // Short-circuit full intersection code if we're doing copy()s or if
512 // we're doing plus()s on cell-centered data. Don't do plus()s on
513 // non-cell-centered data this simplistic way.
514 //
515 if (this != &src) { // avoid self copy or plus
516 if (op == FabArrayBase::COPY) {
517 Copy(*this, src, scomp, dcomp, ncomp, IntVect(0));
518 } else {
519 Add(*this, src, scomp, dcomp, ncomp, IntVect(0));
520 }
521 }
522 return;
523 }
524
525 const CPC& thecpc = (a_cpc) ? *a_cpc : getCPC(dnghost, src, snghost, period,
526 to_ghost_cells_only,offset);
527
529 {
530 //
531 // There can only be local work to do.
532 //
533
534 int N_locs = (*thecpc.m_LocTags).size();
535 if (N_locs == 0) { return; }
536#ifdef AMREX_USE_GPU
538 {
539 PC_local_gpu(thecpc, src, scomp, dcomp, ncomp, op, deterministic);
540 }
541 else
542#endif
543 {
544 PC_local_cpu(thecpc, src, scomp, dcomp, ncomp, op);
545 }
546
547 return;
548 }
549
550#ifdef BL_USE_MPI
551
552 //
553 // Do this before prematurely exiting if running in parallel.
554 // Otherwise sequence numbers will not match across MPI processes.
555 //
556 int tag = ParallelDescriptor::SeqNum();
557
558 const int N_snds = thecpc.m_SndTags->size();
559 const int N_rcvs = thecpc.m_RcvTags->size();
560 const int N_locs = thecpc.m_LocTags->size();
561
562 if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) {
563 //
564 // No work to do.
565 //
566
567 return;
568 }
569
570 //
571 // Send/Recv at most MaxComp components at a time to cut down memory usage.
572 //
573 int NCompLeft = ncomp;
574 int SC = scomp, DC = dcomp, NC;
575
576 for (int ipass = 0; ipass < ncomp; )
577 {
578 pcd = std::make_unique<PCData<FAB>>();
579 pcd->cpc = &thecpc;
580 pcd->src = &src;
581 pcd->op = op;
582 pcd->tag = tag;
583 // Deterministic GPU unpacking is currently implemented only for ADD.
584 pcd->deterministic = deterministic && (op == FabArrayBase::ADD);
585
586 NC = std::min(NCompLeft,FabArrayBase::MaxComp);
587 const bool last_iter = (NCompLeft == NC);
588
589 pcd->SC = SC;
590 pcd->DC = DC;
591 pcd->NC = NC;
592
593 //
594 // Post rcvs. Allocate one chunk of space to hold'm all.
595 //
596 pcd->the_recv_data = nullptr;
597
598 pcd->actual_n_rcvs = 0;
599 if (N_rcvs > 0) {
600 PostRcvs(*thecpc.m_RcvTags, pcd->the_recv_data,
601 pcd->recv_data, pcd->recv_size, pcd->recv_from, pcd->recv_reqs, NC, pcd->tag);
602 pcd->actual_n_rcvs = N_rcvs - std::count(pcd->recv_size.begin(), pcd->recv_size.end(), 0);
603 }
604
605 //
606 // Post send's
607 //
608 Vector<char*> send_data;
609 Vector<std::size_t> send_size;
610 Vector<int> send_rank;
612
613 if (N_snds > 0)
614 {
615 src.PrepareSendBuffers(*thecpc.m_SndTags, pcd->the_send_data, send_data, send_size,
616 send_rank, pcd->send_reqs, send_cctc, NC);
617
618#ifdef AMREX_USE_GPU
620 {
621 pack_send_buffer_gpu(src, SC, NC, send_data, send_size, send_cctc, thecpc.m_id);
622 }
623 else
624#endif
625 {
626 pack_send_buffer_cpu(src, SC, NC, send_data, send_size, send_cctc);
627 }
628
629 AMREX_ASSERT(pcd->send_reqs.size() == N_snds);
630 FabArray<FAB>::PostSnds(send_data, send_size, send_rank, pcd->send_reqs, pcd->tag);
631 }
632
633 //
634 // Do the local work. Hope for a bit of communication/computation overlap.
635 //
636 if (N_locs > 0)
637 {
638#ifdef AMREX_USE_GPU
640 {
641 PC_local_gpu(thecpc, src, SC, DC, NC, op, deterministic);
642 }
643 else
644#endif
645 {
646 PC_local_cpu(thecpc, src, SC, DC, NC, op);
647 }
648 }
649
650 if (!last_iter)
651 {
652 ParallelCopy_finish();
653
654 SC += NC;
655 DC += NC;
656 }
657
658 ipass += NC;
659 NCompLeft -= NC;
660 }
661
662#endif /*BL_USE_MPI*/
663
664#ifndef AMREX_USE_GPU
665 amrex::ignore_unused(deterministic);
666#endif
667}
668
669template <class FAB>
670void
672{
673
674#ifdef BL_USE_MPI
675
676 BL_PROFILE("FabArray::ParallelCopy_finish()");
678
679 if (!pcd) { return; }
680
681 const CPC* thecpc = pcd->cpc;
682
683 const auto N_snds = static_cast<int>(thecpc->m_SndTags->size());
684 const auto N_rcvs = static_cast<int>(thecpc->m_RcvTags->size());
685
686 if (N_rcvs > 0)
687 {
688 Vector<const CopyComTagsContainer*> recv_cctc(N_rcvs,nullptr);
689 for (int k = 0; k < N_rcvs; ++k)
690 {
691 if (pcd->recv_size[k] > 0)
692 {
693 auto const& cctc = thecpc->m_RcvTags->at(pcd->recv_from[k]);
694 recv_cctc[k] = &cctc;
695 }
696 }
697
698 if (pcd->actual_n_rcvs > 0) {
699 Vector<MPI_Status> stats(N_rcvs);
700 ParallelDescriptor::Waitall(pcd->recv_reqs, stats);
701#ifdef AMREX_DEBUG
702 if (!CheckRcvStats(stats, pcd->recv_size, pcd->tag))
703 {
704 amrex::Abort("ParallelCopy failed with wrong message size");
705 }
706#endif
707 }
708
709 bool is_thread_safe = thecpc->m_threadsafe_rcv;
710
711#ifdef AMREX_USE_GPU
713 {
714 unpack_recv_buffer_gpu(*this, pcd->DC, pcd->NC, pcd->recv_data, pcd->recv_size,
715 recv_cctc, pcd->op, is_thread_safe, thecpc->m_id,
716 pcd->deterministic);
717 }
718 else
719#endif
720 {
721 unpack_recv_buffer_cpu(*this, pcd->DC, pcd->NC, pcd->recv_data, pcd->recv_size,
722 recv_cctc, pcd->op, is_thread_safe);
723 }
724
725 if (pcd->the_recv_data)
726 {
727 amrex::The_Comms_Arena()->free(pcd->the_recv_data);
728 pcd->the_recv_data = nullptr;
729 }
730 }
731
732 if (N_snds > 0) {
733 if (! thecpc->m_SndTags->empty()) {
734 Vector<MPI_Status> stats(pcd->send_reqs.size());
735 ParallelDescriptor::Waitall(pcd->send_reqs, stats);
736 }
737 amrex::The_Comms_Arena()->free(pcd->the_send_data);
738 pcd->the_send_data = nullptr;
739 }
740
741 pcd.reset();
742
743#endif /*BL_USE_MPI*/
744}
745
746template <class FAB>
747void
748FabArray<FAB>::copyTo (FAB& dest, int scomp, int dcomp, int ncomp, int nghost) const
749{
750 BL_PROFILE("FabArray::copy(fab)");
751
752 BL_ASSERT(dcomp + ncomp <= dest.nComp());
753 BL_ASSERT(IntVect(nghost).allLE(nGrowVect()));
754
755 int root_proc = this->DistributionMap()[0];
756
757 BoxArray ba(dest.box());
758 DistributionMapping dm(Vector<int>{root_proc});
759 FabArray<FAB> destmf(ba, dm, ncomp, 0, MFInfo().SetAlloc(false));
760 if (ParallelDescriptor::MyProc() == root_proc) {
761 destmf.setFab(0, FAB(dest, amrex::make_alias, dcomp, ncomp));
762 }
763
764 destmf.ParallelCopy(*this, scomp, 0, ncomp, nghost, 0);
765
766#ifdef BL_USE_MPI
767 using T = typename FAB::value_type;
768 if (ParallelContext::NProcsSub() > 1) {
769 Long count = dest.numPts()*ncomp;
770 T* const p0 = dest.dataPtr(dcomp);
771 T* pb = p0;
772#ifdef AMREX_USE_GPU
773 if (dest.arena()->isDevice() && !ParallelDescriptor::UseGpuAwareMpi()) {
774 pb = (T*)The_Pinned_Arena()->alloc(sizeof(T)*count);
775 Gpu::dtoh_memcpy_async(pb, p0, sizeof(T)*count);
777 }
778#endif
781#ifdef AMREX_USE_GPU
782 if (pb != p0) {
783 Gpu::htod_memcpy_async(p0, pb, sizeof(T)*count);
785 The_Pinned_Arena()->free(pb);
786 }
787#endif
788 }
789#endif
790}
791
792#ifdef BL_USE_MPI
793template <class FAB>
794template <typename BUF>
796FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,
797 Vector<char*>& send_data,
798 Vector<std::size_t>& send_size,
799 Vector<int>& send_rank,
800 Vector<MPI_Request>& send_reqs,
802 int ncomp)
803{
804 char* pointer = nullptr;
805 PrepareSendBuffers<BUF>(SndTags, pointer, send_data, send_size, send_rank, send_reqs, send_cctc, ncomp);
806 return TheFaArenaPointer(pointer);
807}
808
809template <class FAB>
810template <typename BUF>
811void
812FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,
813 char*& the_send_data,
814 Vector<char*>& send_data,
815 Vector<std::size_t>& send_size,
816 Vector<int>& send_rank,
817 Vector<MPI_Request>& send_reqs,
818 Vector<const CopyComTagsContainer*>& send_cctc,
819 int ncomp)
820{
821 send_data.clear();
822 send_size.clear();
823 send_rank.clear();
824 send_reqs.clear();
825 send_cctc.clear();
826 const auto N_snds = SndTags.size();
827 if (N_snds == 0) { return; }
828 send_data.reserve(N_snds);
829 send_size.reserve(N_snds);
830 send_rank.reserve(N_snds);
831 send_reqs.reserve(N_snds);
832 send_cctc.reserve(N_snds);
833
834 Vector<std::size_t> offset; offset.reserve(N_snds);
835 std::size_t total_volume = 0;
836 for (auto const& kv : SndTags)
837 {
838 auto const& cctc = kv.second;
839
840 std::size_t nbytes = 0;
841 for (auto const& cct : kv.second)
842 {
843 nbytes += cct.sbox.numPts() * ncomp * sizeof(BUF);
844 }
845
846 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
847 nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned
848
849 // Also need to align the offset properly
850 total_volume = amrex::aligned_size(std::max(alignof(BUF), acd),
851 total_volume);
852
853 offset.push_back(total_volume);
854 total_volume += nbytes;
855
856 send_data.push_back(nullptr);
857 send_size.push_back(nbytes);
858 send_rank.push_back(kv.first);
859 send_reqs.push_back(MPI_REQUEST_NULL);
860 send_cctc.push_back(&cctc);
861 }
862
863 if (total_volume > 0)
864 {
865 the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume));
866 for (int i = 0, N = static_cast<int>(send_size.size()); i < N; ++i) {
867 send_data[i] = the_send_data + offset[i];
868 }
869 } else {
870 the_send_data = nullptr;
871 }
872}
873
874template <class FAB>
875void
876FabArray<FAB>::PostSnds (Vector<char*> const& send_data,
877 Vector<std::size_t> const& send_size,
878 Vector<int> const& send_rank,
879 Vector<MPI_Request>& send_reqs,
880 int SeqNum)
881{
883
884 const auto N_snds = static_cast<int>(send_reqs.size());
885 for (int j = 0; j < N_snds; ++j)
886 {
887 if (send_size[j] > 0) {
888 const int rank = ParallelContext::global_to_local_rank(send_rank[j]);
889 send_reqs[j] = ParallelDescriptor::Asend
890 (send_data[j], send_size[j], rank, SeqNum, comm).req();
891 }
892 }
893}
894
895template <class FAB>
896template <typename BUF>
897TheFaArenaPointer FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,
898 Vector<char*>& recv_data,
899 Vector<std::size_t>& recv_size,
900 Vector<int>& recv_from,
901 Vector<MPI_Request>& recv_reqs,
902 int ncomp,
903 int SeqNum)
904{
905 char* pointer = nullptr;
906 PostRcvs(RcvTags, pointer, recv_data, recv_size, recv_from, recv_reqs, ncomp, SeqNum);
907 return TheFaArenaPointer(pointer);
908}
909
910template <class FAB>
911template <typename BUF>
912void
913FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,
914 char*& the_recv_data,
915 Vector<char*>& recv_data,
916 Vector<std::size_t>& recv_size,
917 Vector<int>& recv_from,
918 Vector<MPI_Request>& recv_reqs,
919 int ncomp,
920 int SeqNum)
921{
922 recv_data.clear();
923 recv_size.clear();
924 recv_from.clear();
925 recv_reqs.clear();
926
927 Vector<std::size_t> offset;
928 std::size_t TotalRcvsVolume = 0;
929 for (const auto& kv : RcvTags) // loop over senders
930 {
931 std::size_t nbytes = 0;
932 for (auto const& cct : kv.second)
933 {
934 nbytes += cct.dbox.numPts() * ncomp * sizeof(BUF);
935 }
936
937 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
938 nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned
939
940 // Also need to align the offset properly
941 TotalRcvsVolume = amrex::aligned_size(std::max(alignof(BUF),acd),
942 TotalRcvsVolume);
943
944 offset.push_back(TotalRcvsVolume);
945 TotalRcvsVolume += nbytes;
946
947 recv_data.push_back(nullptr);
948 recv_size.push_back(nbytes);
949 recv_from.push_back(kv.first);
950 recv_reqs.push_back(MPI_REQUEST_NULL);
951 }
952
953 const auto nrecv = static_cast<int>(recv_from.size());
954
956
957 if (TotalRcvsVolume == 0)
958 {
959 the_recv_data = nullptr;
960 }
961 else
962 {
963 the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
965 for (int i = 0; i < nrecv; ++i)
966 {
967 recv_data[i] = the_recv_data + offset[i];
968 if (recv_size[i] > 0)
969 {
970 const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
971 recv_reqs[i] = ParallelDescriptor::Arecv
972 (recv_data[i], recv_size[i], rank, SeqNum, comm).req();
973 }
974 }
975 }
976}
977#endif
978
979template <class FAB>
980void
982 int scomp,
983 int dcomp,
984 int ncomp,
985 const IntVect& nghost)
986{
988 "FabArray::Redistribute: must have the same BoxArray");
989
991 {
992 Copy(*this, src, scomp, dcomp, ncomp, nghost);
993 return;
994 }
995
996#ifdef BL_USE_MPI
997
999
1000 ParallelCopy(src, scomp, dcomp, ncomp, nghost, nghost, Periodicity::NonPeriodic(),
1001 FabArrayBase::COPY, &cpc);
1002
1003#endif
1004}
1005
1006template <class FAB>
1007void
1009{
1010#if defined(AMREX_USE_MPI) && !defined(AMREX_DEBUG)
1011 // We only test if no DEBUG because in DEBUG we check the status later.
1012 // If Test is done here, the status check will fail.
1013 int flag;
1014 ParallelDescriptor::Test(fbd->recv_reqs, flag, fbd->recv_stat);
1015#endif
1016}
1017
1019namespace detail {
1020template <class TagT>
1021void fbv_copy (Vector<TagT> const& tags)
1022{
1023 const int N = tags.size();
1024 if (N == 0) { return; }
1025#ifdef AMREX_USE_GPU
1026 if (Gpu::inLaunchRegion()) {
1027 ParallelFor(tags, 1,
1028 [=] AMREX_GPU_DEVICE (int i, int j, int k, int, TagT const& tag) noexcept
1029 {
1030 const int ncomp = tag.dfab.nComp();
1031 for (int n = 0; n < ncomp; ++n) {
1032 tag.dfab(i,j,k,n) = tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n);
1034 });
1035 } else
1036#endif
1037 {
1038#ifdef AMREX_USE_OMP
1039#pragma omp parallel for
1040#endif
1041 for (int itag = 0; itag < N; ++itag) {
1042 auto const& tag = tags[itag];
1043 const int ncomp = tag.dfab.nComp();
1044 AMREX_LOOP_4D(tag.dbox, ncomp, i, j, k, n,
1045 {
1046 tag.dfab(i,j,k,n) = tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n);
1047 });
1048 }
1050}
1051}
1053
1067template <FabArrayType MF>
1068void
1070 Vector<int> const& ncomp, Vector<IntVect> const& nghost,
1071 Vector<Periodicity> const& period,
1072 Vector<int> const& cross = {})
1073{
1074 BL_PROFILE("FillBoundary_nowait(Vector)");
1075 const int N = mf.size();
1076 for (int i = 0; i < N; ++i) {
1077 mf[i]->FillBoundary_nowait(scomp[i], ncomp[i], nghost[i], period[i],
1078 cross.empty() ? 0 : cross[i]);
1080}
1081
1090template <FabArrayType MF>
1091void
1093 const Periodicity& a_period = Periodicity::NonPeriodic())
1094{
1095 Vector<int> scomp(mf.size(), 0);
1096 Vector<int> ncomp;
1097 Vector<IntVect> nghost;
1098 Vector<Periodicity> period(mf.size(), a_period);
1099 ncomp.reserve(mf.size());
1100 nghost.reserve(mf.size());
1101 for (auto const& x : mf) {
1102 ncomp.push_back(x->nComp());
1103 nghost.push_back(x->nGrowVect());
1104 }
1105 FillBoundary_nowait(mf, scomp, ncomp, nghost, period);
1107
1113template <FabArrayType MF>
1114void
1116{
1117 BL_PROFILE("FillBoundary_finish(Vector)");
1118 const int N = mf.size();
1119 for (int i = 0; i < N; ++i) {
1120 mf[i]->FillBoundary_finish();
1121 }
1122}
1123
1136template <FabArrayType MF>
1137void
1139 Vector<int> const& ncomp, Vector<IntVect> const& nghost,
1140 Vector<Periodicity> const& period)
1141{
1142 BL_PROFILE("FillBoundaryAndSync_nowait(Vector)");
1143 const int N = mf.size();
1144 for (int i = 0; i < N; ++i) {
1145 mf[i]->FillBoundaryAndSync_nowait(scomp[i], ncomp[i], nghost[i], period[i]);
1146 }
1147}
1148
1155template <FabArrayType MF>
1156void
1158 const Periodicity& a_period = Periodicity::NonPeriodic())
1159{
1160 Vector<int> scomp(mf.size(), 0);
1161 Vector<int> ncomp;
1162 Vector<IntVect> nghost;
1163 Vector<Periodicity> period(mf.size(), a_period);
1164 ncomp.reserve(mf.size());
1165 nghost.reserve(mf.size());
1166 for (auto const& x : mf) {
1167 ncomp.push_back(x->nComp());
1168 nghost.push_back(x->nGrowVect());
1169 }
1170 FillBoundaryAndSync_nowait(mf, scomp, ncomp, nghost, period);
1171}
1172
1178template <FabArrayType MF>
1179void
1181{
1182 BL_PROFILE("FillBoundaryAndSync_finish(Vector)");
1183 const int N = mf.size();
1184 for (int i = 0; i < N; ++i) {
1185 mf[i]->FillBoundaryAndSync_finish();
1186 }
1187}
1188
1204template <FabArrayType MF>
1205void
1206FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,
1207 Vector<int> const& ncomp, Vector<IntVect> const& nghost,
1208 Vector<Periodicity> const& period, Vector<int> const& cross = {})
1209{
1210 BL_PROFILE("FillBoundary(Vector)");
1211#if 1
1212 const int N = mf.size();
1213 for (int i = 0; i < N; ++i) {
1214 mf[i]->FillBoundary_nowait(scomp[i], ncomp[i], nghost[i], period[i],
1215 cross.empty() ? 0 : cross[i]);
1216 }
1217 for (int i = 0; i < N; ++i) {
1218 mf[i]->FillBoundary_finish();
1219 }
1220
1221#else
1222 using FAB = typename MF::FABType::value_type;
1223 using T = typename FAB::value_type;
1224
1225 const int nmfs = mf.size();
1226 Vector<FabArrayBase::CommMetaData const*> cmds;
1227 int N_locs = 0;
1228 int N_rcvs = 0;
1229 int N_snds = 0;
1230 for (int imf = 0; imf < nmfs; ++imf) {
1231 if (nghost[imf].max() > 0) {
1232 auto const& TheFB = mf[imf]->getFB(nghost[imf], period[imf],
1233 cross.empty() ? 0 : cross[imf]);
1234 // The FB is cached. Therefore it's safe take its address for later use.
1235 cmds.push_back(static_cast<FabArrayBase::CommMetaData const*>(&TheFB));
1236 N_locs += TheFB.m_LocTags->size();
1237 N_rcvs += TheFB.m_RcvTags->size();
1238 N_snds += TheFB.m_SndTags->size();
1239 } else {
1240 cmds.push_back(nullptr);
1241 }
1242 }
1243
1244 using TagT = Array4CopyTag<T>;
1245 Vector<TagT> local_tags;
1246 local_tags.reserve(N_locs);
1247 static_assert(amrex::IsStoreAtomic<T>::value, "FillBoundary(Vector): storing T is not atomic");
1248 for (int imf = 0; imf < nmfs; ++imf) {
1249 if (cmds[imf]) {
1250 auto const& tags = *(cmds[imf]->m_LocTags);
1251 for (auto const& tag : tags) {
1252 local_tags.push_back(TagT{.dfab = (*mf[imf])[tag.dstIndex].array(scomp[imf],ncomp[imf]),
1253 .dindex = tag.dstIndex,
1254 .sfab = (*mf[imf])[tag.srcIndex].const_array(scomp[imf],ncomp[imf]),
1255 .dbox = tag.dbox,
1256 .offset = (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
1257 }
1258 }
1259 }
1260
1261 if (ParallelContext::NProcsSub() == 1) {
1262 detail::fbv_copy(local_tags);
1263 return;
1264 }
1265
1266#ifdef AMREX_USE_MPI
1267 //
1268 // Do this before prematurely exiting if running in parallel.
1269 // Otherwise sequence numbers will not match across MPI processes.
1270 //
1273
1274 if (N_locs == 0 && N_rcvs == 0 && N_snds == 0) { return; } // No work to do
1275
1276 char* the_recv_data = nullptr;
1277 Vector<int> recv_from;
1278 Vector<std::size_t> recv_size;
1279 Vector<MPI_Request> recv_reqs;
1280 Vector<MPI_Status> recv_stat;
1281 Vector<TagT> recv_tags;
1282
1283 if (N_rcvs > 0) {
1284
1285 for (int imf = 0; imf < nmfs; ++imf) {
1286 if (cmds[imf]) {
1287 auto const& tags = *(cmds[imf]->m_RcvTags);
1288 for (const auto& kv : tags) {
1289 recv_from.push_back(kv.first);
1290 }
1291 }
1292 }
1293 amrex::RemoveDuplicates(recv_from);
1294 const int nrecv = recv_from.size();
1295
1296 recv_reqs.resize(nrecv, MPI_REQUEST_NULL);
1297 recv_stat.resize(nrecv);
1298
1299 recv_tags.reserve(N_rcvs);
1300
1301 Vector<Vector<std::size_t> > recv_offset(nrecv);
1302 Vector<std::size_t> offset;
1303 recv_size.reserve(nrecv);
1304 offset.reserve(nrecv);
1305 std::size_t TotalRcvsVolume = 0;
1306 for (int i = 0; i < nrecv; ++i) {
1307 std::size_t nbytes = 0;
1308 for (int imf = 0; imf < nmfs; ++imf) {
1309 if (cmds[imf]) {
1310 auto const& tags = *(cmds[imf]->m_RcvTags);
1311 auto it = tags.find(recv_from[i]);
1312 if (it != tags.end()) {
1313 for (auto const& cct : it->second) {
1314 auto& dfab = (*mf[imf])[cct.dstIndex];
1315 recv_offset[i].push_back(nbytes);
1316 recv_tags.push_back(TagT{
1317 .dfab = dfab.array(scomp[imf],ncomp[imf]),
1318 .dindex = cct.dstIndex,
1319 .sfab = makeArray4<T const>(nullptr,cct.dbox,ncomp[imf]),
1320 .dbox = cct.dbox,
1321 .offset = Dim3{.x = 0, .y = 0, .z = 0}
1322 });
1323 nbytes += dfab.nBytes(cct.dbox,ncomp[imf]);
1324 }
1325 }
1326 }
1327 }
1328
1329 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
1330 nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned
1331
1332 // Also need to align the offset properly
1333 TotalRcvsVolume = amrex::aligned_size(std::max(alignof(T),acd), TotalRcvsVolume);
1334
1335 offset.push_back(TotalRcvsVolume);
1336 TotalRcvsVolume += nbytes;
1337
1338 recv_size.push_back(nbytes);
1339 }
1340
1341 the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
1342
1343 int k = 0;
1344 for (int i = 0; i < nrecv; ++i) {
1345 char* p = the_recv_data + offset[i];
1346 const int rank = ParallelContext::global_to_local_rank(recv_from[i]);
1347 recv_reqs[i] = ParallelDescriptor::Arecv
1348 (p, recv_size[i], rank, SeqNum, comm).req();
1349 for (int j = 0, nj = recv_offset[i].size(); j < nj; ++j) {
1350 recv_tags[k++].sfab.p = (T const*)(p + recv_offset[i][j]);
1351 }
1352 }
1353 }
1354
1355 char* the_send_data = nullptr;
1356 Vector<int> send_rank;
1357 Vector<char*> send_data;
1358 Vector<std::size_t> send_size;
1359 Vector<MPI_Request> send_reqs;
1360 if (N_snds > 0) {
1361 for (int imf = 0; imf < nmfs; ++imf) {
1362 if (cmds[imf]) {
1363 auto const& tags = *(cmds[imf]->m_SndTags);
1364 for (auto const& kv : tags) {
1365 send_rank.push_back(kv.first);
1366 }
1367 }
1368 }
1369 amrex::RemoveDuplicates(send_rank);
1370 const int nsend = send_rank.size();
1371
1372 send_data.resize(nsend, nullptr);
1373 send_reqs.resize(nsend, MPI_REQUEST_NULL);
1374
1375 Vector<TagT> send_tags;
1376 send_tags.reserve(N_snds);
1377
1378 Vector<Vector<std::size_t> > send_offset(nsend);
1380 send_size.reserve(nsend);
1381 offset.reserve(nsend);
1382 std::size_t TotalSndsVolume = 0;
1383 for (int i = 0; i < nsend; ++i) {
1384 std::size_t nbytes = 0;
1385 for (int imf = 0; imf < nmfs; ++imf) {
1386 if (cmds[imf]) {
1387 auto const& tags = *(cmds[imf]->m_SndTags);
1388 auto it = tags.find(send_rank[i]);
1389 if (it != tags.end()) {
1390 for (auto const& cct : it->second) {
1391 auto const& sfab = (*mf[imf])[cct.srcIndex];
1392 send_offset[i].push_back(nbytes);
1393 send_tags.push_back(TagT{
1394 .dfab = amrex::makeArray4<T>(nullptr,cct.sbox,ncomp[imf]),
1395 .dindex = cct.dstIndex,
1396 .sfab = sfab.const_array(scomp[imf],ncomp[imf]),
1397 .dbox = cct.sbox,
1398 .offset = Dim3{.x = 0, .y = 0, .z = 0}
1399 });
1400 nbytes += sfab.nBytes(cct.sbox,ncomp[imf]);
1401 }
1402 }
1403 }
1404 }
1405
1406 std::size_t acd = ParallelDescriptor::sizeof_selected_comm_data_type(nbytes);
1407 nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned
1408
1409 // Also need to align the offset properly
1410 TotalSndsVolume = amrex::aligned_size(std::max(alignof(T),acd), TotalSndsVolume);
1411
1412 offset.push_back(TotalSndsVolume);
1413 TotalSndsVolume += nbytes;
1414
1415 send_size.push_back(nbytes);
1416 }
1417
1418 the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalSndsVolume));
1419 int k = 0;
1420 for (int i = 0; i < nsend; ++i) {
1421 send_data[i] = the_send_data + offset[i];
1422 for (int j = 0, nj = send_offset[i].size(); j < nj; ++j) {
1423 send_tags[k++].dfab.p = (T*)(send_data[i] + send_offset[i][j]);
1424 }
1425 }
1426
1427 detail::fbv_copy(send_tags);
1428
1429 FabArray<FAB>::PostSnds(send_data, send_size, send_rank, send_reqs, SeqNum);
1430 }
1431
1432#if !defined(AMREX_DEBUG)
1433 int recv_flag;
1434 ParallelDescriptor::Test(recv_reqs, recv_flag, recv_stat);
1435#endif
1436
1437 if (N_locs > 0) {
1438 detail::fbv_copy(local_tags);
1439#if !defined(AMREX_DEBUG)
1440 ParallelDescriptor::Test(recv_reqs, recv_flag, recv_stat);
1441#endif
1442 }
1443
1444 if (N_rcvs > 0) {
1445 ParallelDescriptor::Waitall(recv_reqs, recv_stat);
1446#ifdef AMREX_DEBUG
1447 if (!FabArrayBase::CheckRcvStats(recv_stat, recv_size, SeqNum)) {
1448 amrex::Abort("FillBoundary(vector) failed with wrong message size");
1449 }
1450#endif
1451
1452 detail::fbv_copy(recv_tags);
1453
1454 amrex::The_Comms_Arena()->free(the_recv_data);
1455 }
1456
1457 if (N_snds > 0) {
1458 Vector<MPI_Status> stats(send_reqs.size());
1459 ParallelDescriptor::Waitall(send_reqs, stats);
1460 amrex::The_Comms_Arena()->free(the_send_data);
1461 }
1462
1463#endif // #ifdef AMREX_USE_MPI
1464#endif // #if 1 #else
1465}
1466
1479template <FabArrayType MF>
1480void
1482 Vector<int> const& ncomp, Vector<IntVect> const& nghost,
1483 Vector<Periodicity> const& period)
1484{
1485 BL_PROFILE("FillBoundaryAndSync(Vector)");
1486 const int N = mf.size();
1487 for (int i = 0; i < N; ++i) {
1488 mf[i]->FillBoundaryAndSync_nowait(scomp[i], ncomp[i], nghost[i], period[i]);
1489 }
1490 for (int i = 0; i < N; ++i) {
1491 mf[i]->FillBoundaryAndSync_finish();
1492 }
1493}
1494
1496template <FabArrayType MF>
1497void
1499{
1500 FillBoundary_nowait(mf, a_period);
1502}
1503
1505template <FabArrayType MF>
1506void
1512
1513}
#define BL_PROFILE(a)
Definition AMReX_BLProfiler.H:551
#define BL_PROFILE_SYNC_STOP()
Definition AMReX_BLProfiler.H:645
#define BL_PROFILE_SYNC_START_TIMED(fname)
Definition AMReX_BLProfiler.H:644
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define BL_ASSERT(EX)
Definition AMReX_BLassert.H:39
#define AMREX_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:37
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_NODISCARD
Definition AMReX_Extension.H:252
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
#define AMREX_LOOP_4D(bx, ncomp, i, j, k, n, block)
Definition AMReX_Loop.nolint.H:16
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
A collection of Boxes stored in an Array.
Definition AMReX_BoxArray.H:564
IndexType ixType() const noexcept
Return index type of this BoxArray.
Definition AMReX_BoxArray.H:854
Calculates the distribution of FABs to MPI processes.
Definition AMReX_DistributionMapping.H:43
IntVect nGrowVect() const noexcept
Definition AMReX_FabArrayBase.H:80
int size() const noexcept
Return the number of FABs in the FabArray.
Definition AMReX_FabArrayBase.H:110
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition AMReX_FabArrayBase.H:131
bool empty() const noexcept
Definition AMReX_FabArrayBase.H:89
CpOp
parallel copy or add
Definition AMReX_FabArrayBase.H:394
@ ADD
Definition AMReX_FabArrayBase.H:394
@ COPY
Definition AMReX_FabArrayBase.H:394
Box box(int K) const noexcept
Return the Kth Box in the BoxArray. That is, the valid region of the Kth grid.
Definition AMReX_FabArrayBase.H:101
DistributionMapping distributionMap
Definition AMReX_FabArrayBase.H:445
static int MaxComp
The maximum number of components to copy() at a time.
Definition AMReX_FabArrayBase.H:292
BoxArray boxarray
Definition AMReX_FabArrayBase.H:444
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition AMReX_FabArrayBase.H:95
An Array of FortranArrayBox(FAB)-like Objects.
Definition AMReX_FabArray.H:344
void ParallelCopyToGhost_finish()
Definition AMReX_FabArrayCommI.H:383
void ParallelCopy(const FabArray< FAB > &src, const Periodicity &period=Periodicity::NonPeriodic(), CpOp op=FabArrayBase::COPY)
Definition AMReX_FabArray.H:873
void Redistribute(const FabArray< FAB > &src, int scomp, int dcomp, int ncomp, const IntVect &nghost)
Copy from src to this. this and src have the same BoxArray, but different DistributionMapping.
Definition AMReX_FabArrayCommI.H:981
void ParallelCopyToGhost(const FabArray< FAB > &src, int scomp, int dcomp, int ncomp, const IntVect &snghost, const IntVect &dnghost, const Periodicity &period=Periodicity::NonPeriodic())
Definition AMReX_FabArrayCommI.H:352
void ParallelCopy_finish()
Definition AMReX_FabArrayCommI.H:671
void ParallelAdd(const FabArray< FAB > &src, const Periodicity &period=Periodicity::NonPeriodic())
This function copies data from src to this FabArray. Each FAB in fa is intersected with all FABs in t...
Definition AMReX_FabArray.H:870
void ParallelCopyToGhost_nowait(const FabArray< FAB > &src, int scomp, int dcomp, int ncomp, const IntVect &snghost, const IntVect &dnghost, const Periodicity &period=Periodicity::NonPeriodic())
Definition AMReX_FabArrayCommI.H:369
void FillBoundary_test()
Definition AMReX_FabArrayCommI.H:1008
void copyTo(FAB &dest, int nghost=0) const
Copy the values contained in the intersection of the valid + nghost region of this FabArray with the ...
Definition AMReX_FabArray.H:2696
void ParallelCopy_nowait(const FabArray< FAB > &src, const Periodicity &period=Periodicity::NonPeriodic(), CpOp op=FabArrayBase::COPY)
Definition AMReX_FabArray.H:887
Array4< typename FabArray< FAB >::value_type const > const_array(const MFIter &mfi) const noexcept
Definition AMReX_FabArray.H:585
__host__ __device__ bool cellCentered() const noexcept
True if the IndexTypeND is CELL based in all directions.
Definition AMReX_IndexType.H:102
__host__ __device__ constexpr bool allGE(const IntVectND< dim > &rhs) const noexcept
Returns true if this is greater than or equal to argument for all components. NOTE: This is NOT a str...
Definition AMReX_IntVect.H:542
__host__ static __device__ constexpr IntVectND< dim > TheZeroVector() noexcept
This static member function returns a reference to a constant IntVectND object, all of whose dim argu...
Definition AMReX_IntVect.H:771
__host__ __device__ constexpr int max() const noexcept
maximum (no absolute values) value
Definition AMReX_IntVect.H:313
MPI_Request req() const
Definition AMReX_ParallelDescriptor.H:74
This provides length of period for periodic domains. 0 means it is not periodic in that direction....
Definition AMReX_Periodicity.H:17
static const Periodicity & NonPeriodic() noexcept
Definition AMReX_Periodicity.cpp:52
bool isAnyPeriodic() const noexcept
Definition AMReX_Periodicity.H:22
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:29
Long size() const noexcept
Definition AMReX_Vector.H:54
Checks if a type is derived from amrex::BaseFab.
Definition AMReX_Concepts.H:13
amrex_long Long
Definition AMReX_INT.H:30
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1359
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1289
Arena * The_Comms_Arena()
Definition AMReX_Arena.cpp:880
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
int NProcs() noexcept
Definition AMReX_ParallelDescriptor.H:255
bool inGraphRegion()
Definition AMReX_GpuControl.H:117
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
void dtoh_memcpy_async(void *p_h, const void *p_d, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:435
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:88
bool inNoSyncRegion() noexcept
Definition AMReX_GpuControl.H:148
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:421
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int global_to_local_rank(int rank) noexcept
Definition AMReX_ParallelContext.H:98
int NProcsSub() noexcept
number of ranks in current frame
Definition AMReX_ParallelContext.H:74
void Test(MPI_Request &, int &, MPI_Status &)
Definition AMReX_ParallelDescriptor.cpp:1220
Message Asend(const T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1154
bool UseGpuAwareMpi()
Definition AMReX_ParallelDescriptor.H:113
void Waitall(Vector< MPI_Request > &, Vector< MPI_Status > &)
Definition AMReX_ParallelDescriptor.cpp:1308
void Bcast(void *, int, MPI_Datatype, int, MPI_Comm)
Definition AMReX_ParallelDescriptor.cpp:1295
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition AMReX_ParallelDescriptor.H:678
Message Arecv(T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1196
int MPI_Comm
Definition AMReX_ccse-mpi.H:51
static constexpr int MPI_REQUEST_NULL
Definition AMReX_ccse-mpi.H:57
Definition AMReX_Amr.cpp:50
@ make_alias
Definition AMReX_MakeType.H:7
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
void FillBoundary_finish(Vector< MF * > const &mf)
Wait for outstanding FillBoundary_nowait operations launched with the vector helper to complete.
Definition AMReX_FabArrayCommI.H:1115
void Copy(FabArray< DFAB > &dst, FabArray< SFAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition AMReX_FabArray.H:180
void Add(FabArray< FAB > &dst, FabArray< FAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition AMReX_FabArray.H:239
std::unique_ptr< char, TheFaArenaDeleter > TheFaArenaPointer
Definition AMReX_FabArray.H:106
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2867
IntVect nGrowVect(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2857
void FillBoundary_nowait(Vector< MF * > const &mf, Vector< int > const &scomp, Vector< int > const &ncomp, Vector< IntVect > const &nghost, Vector< Periodicity > const &period, Vector< int > const &cross={})
Launch FillBoundary_nowait across a vector of FabArrays.
Definition AMReX_FabArrayCommI.H:1069
void FillBoundaryAndSync_nowait(Vector< MF * > const &mf, Vector< int > const &scomp, Vector< int > const &ncomp, Vector< IntVect > const &nghost, Vector< Periodicity > const &period)
Launch FillBoundaryAndSync_nowait across a vector of FabArrays.
Definition AMReX_FabArrayCommI.H:1138
void FillBoundaryAndSync(Vector< MF * > const &mf, Vector< int > const &scomp, Vector< int > const &ncomp, Vector< IntVect > const &nghost, Vector< Periodicity > const &period)
Perform FillBoundaryAndSync on a batch of FabArrays (e.g., MultiFabs).
Definition AMReX_FabArrayCommI.H:1481
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202
double second() noexcept
Definition AMReX_Utility.cpp:940
void ParallelCopy(MF &dst, MF const &src, int scomp, int dcomp, int ncomp, IntVect const &ng_src=IntVect(0), IntVect const &ng_dst=IntVect(0), Periodicity const &period=Periodicity::NonPeriodic())
dst = src w/ MPI communication
Definition AMReX_FabArrayUtility.H:1951
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:45
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
std::size_t aligned_size(std::size_t align_requirement, std::size_t size) noexcept
Given a minimum required size in bytes, this returns the smallest size greater or equal to size that ...
Definition AMReX_Arena.H:33
void RemoveDuplicates(Vector< T > &vec)
Definition AMReX_Vector.H:210
void FillBoundaryAndSync_finish(Vector< MF * > const &mf)
Wait for outstanding FillBoundaryAndSync_nowait operations launched with the vector helper to complet...
Definition AMReX_FabArrayCommI.H:1180
void FillBoundary(Vector< MF * > const &mf, Vector< int > const &scomp, Vector< int > const &ncomp, Vector< IntVect > const &nghost, Vector< Periodicity > const &period, Vector< int > const &cross={})
Perform FillBoundary on a batch of FabArrays (e.g., MultiFabs).
Definition AMReX_FabArrayCommI.H:1206
BoxArray const & boxArray(FabArrayBase const &fa)
Definition AMReX_FabArrayBase.cpp:2862
Definition AMReX_Dim3.H:13
int x
Definition AMReX_Dim3.H:13
parallel copy or add
Definition AMReX_FabArrayBase.H:538
std::uint64_t m_id
Definition AMReX_FabArrayBase.H:553
bool m_threadsafe_rcv
Definition AMReX_FabArrayBase.H:475
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition AMReX_FabArrayBase.H:478
std::unique_ptr< MapOfCopyComTagContainers > m_SndTags
Definition AMReX_FabArrayBase.H:477
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition AMReX_FabArrayBase.H:476
FillBoundary.
Definition AMReX_FabArrayBase.H:488
std::uint64_t m_id
Definition AMReX_FabArrayBase.H:494
IntVect m_sb_snghost
Definition AMReX_FabArrayBase.H:498
Definition AMReX_TypeTraits.H:61
Definition AMReX_TypeTraits.H:277
FabArray memory allocation information.
Definition AMReX_FabArray.H:68