20 template <
class T0,
class T1>
26 *d =
static_cast<T0
>(s);
30 template <
class T0,
class T1>
36 *d +=
static_cast<T0
>(s);
40 template <
class T0,
class T1>
43 template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,
int> = 0>
51 template <
class T0,
class T1,
class F>
53 fab_to_fab (Vector<Array4CopyTag<T0, T1> >
const& copy_tags,
int scomp,
int dcomp,
int ncomp,
59 sycl::nd_item<1>
const& ,
61 int icell,
int ncells,
int i,
int j,
int k, Array4CopyTag<T0, T1>
const tag) noexcept
64 for (
int n = 0; n < ncomp; ++n) {
65 f(&(tag.dfab(i,j,k,n+dcomp)),
66 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
72 template <
class T0,
class T1,
class F>
74 fab_to_fab (Vector<Array4CopyTag<T0, T1> >
const& copy_tags,
int scomp,
int dcomp,
75 int ncomp, F &&
f, Vector<Array4Tag<int> >
const& masks)
77 using TagType = Array4MaskCopyTag<T0, T1>;
79 const int N = copy_tags.size();
81 for (
int i = 0; i < N; ++i) {
82 tags.push_back(TagType{copy_tags[i].dfab, copy_tags[i].sfab, masks[i].dfab,
83 copy_tags[i].dbox, copy_tags[i].offset});
86 amrex::Abort(
"xxxxx TODO This function still has a bug. Even if we fix the bug, it should still be avoided because it is slow due to the lack of atomic operations for this type.");
91 sycl::nd_item<1>
const& item,
93 int icell,
int ncells,
int i,
int j,
int k, TagType
const& tag) noexcept
96 int g_tid = item.get_global_id(0);
97 int g_wid = g_tid / Gpu::Device::warp_size;
99 int* m = (icell < ncells) ? tag.mask.ptr(i,j,k) :
nullptr;
100 int mypriority = g_wid+1;
104 if (sycl::all_of_group(item.get_sub_group(), msk == 0)) {
107 if (sycl::any_of_group(item.get_sub_group(), msk > mypriority)) {
109 sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
117 if (icell < ncells) {
118 for (
int n = 0; n < ncomp; ++n) {
119 f(&(tag.dfab(i,j,k,n+dcomp)),
120 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
128 int g_tid = blockDim.x*blockIdx.x + threadIdx.x;
129 int g_wid = g_tid / Gpu::Device::warp_size;
131 int* m = (icell < ncells) ? tag.mask.ptr(i,j,k) :
nullptr;
132 int mypriority = g_wid+1;
135 int msk = (m && to_try) ? atomicCAS(m, 0, mypriority) : 0;
136 #ifdef AMREX_USE_CUDA
137 if (__all_sync(0xffffffff, msk == 0)) {
138 #elif defined(AMREX_USE_HIP)
139 if (__all(msk == 0)) {
143 #ifdef AMREX_USE_CUDA
144 if (__any_sync(0xffffffff, msk > mypriority)) {
145 #elif defined(AMREX_USE_HIP)
146 if (__any(msk > mypriority)) {
157 if (icell < ncells) {
158 for (
int n = 0; n < ncomp; ++n) {
159 f(&(tag.dfab(i,j,k,n+dcomp)),
160 tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
169 template <
typename T0,
typename T1,
170 std::enable_if_t<amrex::IsStoreAtomic<T0>::value,
int> = 0>
173 int dcomp,
int ncomp, Vector<Array4Tag<int> >
const&)
178 template <
typename T0,
typename T1,
179 std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,
int> = 0>
182 int dcomp,
int ncomp, Vector<Array4Tag<int> >
const& masks)
184 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
187 template <
typename T0,
typename T1,
188 std::enable_if_t<amrex::HasAtomicAdd<T0>::value,
int> = 0>
191 int dcomp,
int ncomp, Vector<Array4Tag<int> >
const&)
196 template <
typename T0,
typename T1,
197 std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,
int> = 0>
200 int dcomp,
int ncomp, Vector<Array4Tag<int> >
const& masks)
202 fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
213 auto const& LocTags = *(TheFB.
m_LocTags);
214 auto N_locs =
static_cast<int>(LocTags.size());
215 if (N_locs == 0) {
return; }
220 #pragma omp parallel for
222 for (
int i = 0; i < N_locs; ++i)
231 dfab->template copy<RunOn::Host>(*sfab, tag.
sbox, scomp, tag.
dbox, scomp, ncomp);
237 for (
int i = 0; i < N_locs; ++i)
244 loc_copy_tags[tag.
dstIndex].push_back
252 const auto& tags = loc_copy_tags[mfi];
253 auto dfab = this->array(mfi);
254 for (
auto const & tag : tags)
256 auto const sfab = tag.sfab->array();
257 const auto offset = tag.offset.dim3();
259 [=] (
int i,
int j,
int k,
int n) noexcept
261 dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
274 auto const& LocTags = *(TheFB.
m_LocTags);
275 int N_locs = LocTags.
size();
276 if (N_locs == 0) {
return; }
281 loc_copy_tags.reserve(N_locs);
287 maskfabs.resize(this->local_size());
288 masks.reserve(N_locs);
291 for (
int i = 0; i < N_locs; ++i)
298 int li = this->localindex(tag.
dstIndex);
299 loc_copy_tags.push_back
300 ({this->atLocalIdx(li).array(),
301 this->fabPtr(tag.
srcIndex)->const_array(),
305 if (maskfabs.
size() > 0) {
306 if (!maskfabs[li].isAllocated()) {
307 maskfabs[li].resize(this->atLocalIdx(li).box());
313 if (maskfabs.
size() > 0) {
321 if (is_thread_safe) {
322 detail::fab_to_fab<value_type, value_type>(loc_copy_tags, scomp, scomp,
325 detail::fab_to_fab_atomic_cpy<value_type, value_type>(
326 loc_copy_tags, scomp, scomp, ncomp, masks);
335 auto const& LocTags = *(thecmd.
m_LocTags);
336 int N_locs = LocTags.
size();
337 if (N_locs == 0) {
return; }
342 loc_setval_tags.reserve(N_locs);
346 for (
int i = 0; i < N_locs; ++i)
350 loc_setval_tags.push_back({this->array(tag.
dstIndex), tag.
dbox});
354 [
x,scomp]
AMREX_GPU_DEVICE (
int i,
int j,
int k,
int n, TagType
const& tag) noexcept
356 tag.dfab(i,j,k,n+scomp) =
x;
365 auto const& RcvTags = *(thecmd.
m_RcvTags);
371 for (
auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
372 for (
auto const& tag: it->second) {
373 rcv_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
377 if (rcv_setval_tags.empty()) {
return; }
382 [
x,scomp]
AMREX_GPU_DEVICE (
int i,
int j,
int k,
int n, TagType
const& tag) noexcept
384 tag.dfab(i,j,k,n+scomp) =
x;
388 #if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
393 const int N_locs = (*TheFB.m_LocTags).
size();
395 for (
int i = 0; i < N_locs; ++i)
397 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
402 loc_copy_tags[tag.dstIndex].push_back
403 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
407 if ( !(TheFB.m_localCopy.ready()) )
409 const_cast<FB&
>(TheFB).m_localCopy.resize(N_locs);
413 for (MFIter mfi(*
this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
415 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
416 const_cast<FB&
>(TheFB).m_localCopy.getHostPtr(0),
417 (TheFB).m_localCopy.getDevicePtr(0),
418 std::size_t(
sizeof(CopyMemory)*N_locs) );
420 const auto& tags = loc_copy_tags[mfi];
421 for (
auto const & tag : tags)
423 const auto offset = tag.offset.dim3();
424 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
428 auto const dst = cmem->getDst<value_type>();
429 auto const src = cmem->getSrc<value_type>();
430 for (int n = 0; n < cmem->ncomp; ++n) {
431 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
436 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
437 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
438 if (last_iter) {
const_cast<FB&
>(TheFB).m_localCopy.setGraph( graphExec ); }
447 for (MFIter mfi(*
this); mfi.isValid(); ++mfi)
449 auto const dst_array = this->array(mfi);
450 const auto& tags = loc_copy_tags[mfi];
451 for (
auto const & tag : tags)
453 const_cast<FB&
>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
460 TheFB.m_localCopy.executeGraph();
466 FabArray<FAB>::FB_local_copy_cuda_graph_n (
const FB& TheFB,
int scomp,
int ncomp)
468 const int N_locs = TheFB.m_LocTags->size();
472 for (
int i = 0; i < N_locs; ++i)
474 const CopyComTag& tag = (*TheFB.m_LocTags)[i];
481 loc_copy_tags[tag.dstIndex].push_back
482 ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
489 if ( !(TheFB.m_localCopy.ready()) )
491 const_cast<FB&
>(TheFB).m_localCopy.resize(launches);
495 for (MFIter mfi(*
this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
497 const auto& tags = loc_copy_tags[mfi];
498 for (
int t = 0; t<tags.size(); ++t)
500 Gpu::Device::setStreamIndex(cuda_stream++);
501 amrex::Gpu::Device::startGraphRecording( (idx == 0),
502 const_cast<FB&
>(TheFB).m_localCopy.getHostPtr(0),
503 (TheFB).m_localCopy.getDevicePtr(0),
504 std::size_t(
sizeof(CopyMemory)*launches) );
506 const auto& tag = tags[t];
507 const Dim3
offset = tag.offset.dim3();
509 CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
512 auto const dst = cmem->getDst<value_type>();
513 auto const src = cmem->getSrc<value_type>();
514 for (int n = 0; n < cmem->ncomp; ++n) {
515 dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
519 bool last_iter = idx == launches;
520 cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
521 if (last_iter) {
const_cast<FB&
>(TheFB).m_localCopy.setGraph( graphExec ); }
529 for (MFIter mfi(*
this); mfi.isValid(); ++mfi)
531 const auto& dst_array = this->array(mfi);
532 const auto& tags = loc_copy_tags[mfi];
533 for (
auto const & tag : tags)
535 const_cast<FB&
>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
542 TheFB.m_localCopy.executeGraph(
false);
554 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
558 FabArray<FAB>::FB_pack_send_buffer_cuda_graph (
const FB& TheFB,
int scomp,
int ncomp,
559 Vector<char*>& send_data,
560 Vector<std::size_t>
const& send_size,
561 Vector<
typename FabArray<FAB>::CopyComTagsContainer
const*>
const& send_cctc)
563 const int N_snds = send_data.size();
564 if (N_snds == 0) {
return; }
566 if ( !(TheFB.m_copyToBuffer.ready()) )
571 for (
int send = 0; send < N_snds; ++send) {
572 if (send_size[send] > 0) {
573 launches += send_cctc[send]->size();
576 const_cast<FB&
>(TheFB).m_copyToBuffer.resize(launches);
580 for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
581 sit.isValid(); ++sit)
583 amrex::Gpu::Device::startGraphRecording( (sit() == 0),
584 const_cast<FB&
>(TheFB).m_copyToBuffer.getHostPtr(0),
585 (TheFB).m_copyToBuffer.getDevicePtr(0),
586 std::size_t(
sizeof(CopyMemory)*launches) );
589 if (send_size[j] > 0)
591 auto const& cctc = *send_cctc[j];
592 for (
auto const& tag : cctc)
594 const Box& bx = tag.sbox;
595 CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
598 auto const pfab = cmem->getDst<value_type>();
599 auto const sfab = cmem->getSrc<value_type>();
600 for (
int n = 0; n < cmem->ncomp; ++n)
602 pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
608 bool last_iter = sit() == (N_snds-1);
609 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
610 if (last_iter) {
const_cast<FB&
>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
616 for (
int send = 0; send < N_snds; ++send)
619 if (send_size[j] > 0)
621 char* dptr = send_data[j];
622 auto const& cctc = *send_cctc[j];
623 for (
auto const& tag : cctc)
625 const_cast<FB&
>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
631 dptr += (tag.sbox.numPts() * ncomp *
sizeof(value_type));
634 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
639 TheFB.m_copyToBuffer.executeGraph();
644 FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (
const FB& TheFB,
int dcomp,
int ncomp,
645 Vector<char*>
const& recv_data,
646 Vector<std::size_t>
const& recv_size,
647 Vector<CopyComTagsContainer const*>
const& recv_cctc,
650 const int N_rcvs = recv_cctc.size();
651 if (N_rcvs == 0) {
return; }
655 for (
int k = 0; k < N_rcvs; ++k)
657 if (recv_size[k] > 0)
659 const char* dptr = recv_data[k];
660 auto const& cctc = *recv_cctc[k];
661 for (
auto const& tag : cctc)
663 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
664 dptr += tag.dbox.numPts() * ncomp *
sizeof(value_type);
668 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
672 if ( !(TheFB.m_copyFromBuffer.ready()) )
674 const_cast<FB&
>(TheFB).m_copyFromBuffer.resize(launches);
677 for (MFIter mfi(*
this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
679 amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
680 const_cast<FB&
>(TheFB).m_copyFromBuffer.getHostPtr(0),
681 (TheFB).m_copyFromBuffer.getDevicePtr(0),
682 std::size_t(
sizeof(CopyMemory)*launches) );
684 const auto& tags = recv_copy_tags[mfi];
685 for (
auto const & tag : tags)
687 CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
690 auto const pfab = cmem->getSrc<value_type>();
691 auto const dfab = cmem->getDst<value_type>();
692 for (int n = 0; n < cmem->ncomp; ++n)
694 dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
699 bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
700 cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
701 if (last_iter) {
const_cast<FB&
>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
707 for (MFIter mfi(*
this); mfi.isValid(); ++mfi)
709 auto dst_array = this->array(mfi);
710 const auto & tags = recv_copy_tags[mfi];
711 for (
auto const & tag : tags)
713 const_cast<FB&
>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(
amrex::makeArray4((value_type*)(tag.p),
722 TheFB.m_copyFromBuffer.executeGraph();
728 template <
typename BUF>
737 const int N_snds = send_data.
size();
738 if (N_snds == 0) {
return; }
740 char* pbuffer = send_data[0];
741 std::size_t szbuffer = 0;
747 szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1];
754 for (
int j = 0; j < N_snds; ++j)
756 if (send_size[j] > 0)
758 std::size_t
offset = send_data[j]-send_data[0];
759 char* dptr = pbuffer +
offset;
760 auto const& cctc = *send_cctc[j];
761 for (
auto const& tag : cctc)
763 snd_copy_tags.emplace_back(TagType{
765 src.
array(tag.srcIndex),
769 dptr += (tag.sbox.numPts() * ncomp *
sizeof(BUF));
775 detail::fab_to_fab<BUF, value_type>(snd_copy_tags, scomp, 0, ncomp,
780 if (pbuffer != send_data[0]) {
788 template <
typename BUF>
794 CpOp op,
bool is_thread_safe)
798 const int N_rcvs = recv_cctc.
size();
799 if (N_rcvs == 0) {
return; }
801 char* pbuffer = recv_data[0];
803 std::size_t szbuffer = 0;
808 szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1];
817 recv_copy_tags.reserve(N_rcvs);
830 for (
int k = 0; k < N_rcvs; ++k)
832 if (recv_size[k] > 0)
834 std::size_t
offset = recv_data[k]-recv_data[0];
835 const char* dptr = pbuffer +
offset;
836 auto const& cctc = *recv_cctc[k];
837 for (
auto const& tag : cctc)
840 recv_copy_tags.emplace_back(TagType{
846 dptr += tag.dbox.numPts() * ncomp *
sizeof(BUF);
848 if (maskfabs.
size() > 0) {
849 if (!maskfabs[li].isAllocated()) {
850 maskfabs[li].resize(dst.
atLocalIdx(li).box());
859 if (maskfabs.
size() > 0) {
867 if (op == FabArrayBase::COPY)
869 if (is_thread_safe) {
870 detail::fab_to_fab<value_type, BUF>(
873 detail::fab_to_fab_atomic_cpy<value_type, BUF>(
874 recv_copy_tags, 0, dcomp, ncomp, masks);
879 if (is_thread_safe) {
880 detail::fab_to_fab<value_type, BUF>(
883 detail::fab_to_fab_atomic_add<value_type, BUF>(
884 recv_copy_tags, 0, dcomp, ncomp, masks);
890 if (pbuffer != recv_data[0]) {
898 template <
typename BUF>
907 auto const N_snds =
static_cast<int>(send_data.
size());
908 if (N_snds == 0) {
return; }
911 #pragma omp parallel for
913 for (
int j = 0; j < N_snds; ++j)
915 if (send_size[j] > 0)
917 char* dptr = send_data[j];
918 auto const& cctc = *send_cctc[j];
919 for (
auto const& tag : cctc)
921 const Box& bx = tag.sbox;
922 auto const sfab = src.
array(tag.srcIndex);
925 [=] (
int ii,
int jj,
int kk,
int n) noexcept
927 pfab(ii,jj,kk,n) =
static_cast<BUF
>(sfab(ii,jj,kk,n+scomp));
929 dptr += (bx.
numPts() * ncomp *
sizeof(BUF));
931 BL_ASSERT(dptr <= send_data[j] + send_size[j]);
937 template <
typename BUF>
943 CpOp op,
bool is_thread_safe)
947 auto const N_rcvs =
static_cast<int>(recv_cctc.
size());
948 if (N_rcvs == 0) {
return; }
953 #pragma omp parallel for
955 for (
int k = 0; k < N_rcvs; ++k)
957 if (recv_size[k] > 0)
959 const char* dptr = recv_data[k];
960 auto const& cctc = *recv_cctc[k];
961 for (
auto const& tag : cctc)
963 const Box& bx = tag.dbox;
964 FAB& dfab = dst[tag.dstIndex];
965 if (op == FabArrayBase::COPY)
967 dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
971 dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
973 dptr += bx.
numPts() * ncomp *
sizeof(BUF);
975 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
983 for (
int k = 0; k < N_rcvs; ++k)
985 if (recv_size[k] > 0)
987 const char* dptr = recv_data[k];
988 auto const& cctc = *recv_cctc[k];
989 for (
auto const& tag : cctc)
991 recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
992 dptr += tag.dbox.numPts() * ncomp *
sizeof(BUF);
994 BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
1003 const auto& tags = recv_copy_tags[mfi];
1004 auto dfab = dst.
array(mfi);
1005 for (
auto const & tag : tags)
1008 if (op == FabArrayBase::COPY)
1011 [=] (
int i,
int j,
int k,
int n) noexcept
1013 dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1019 [=] (
int i,
int j,
int k,
int n) noexcept
1021 dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
#define BL_ASSERT(EX)
Definition: AMReX_BLassert.H:39
#define AMREX_ALWAYS_ASSERT(EX)
Definition: AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:119
#define AMREX_HOST_DEVICE_FOR_3D(...)
Definition: AMReX_GpuLaunch.nolint.H:50
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition: AMReX_HypreMLABecLap.cpp:1089
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
AMREX_GPU_HOST_DEVICE const IntVectND< dim > & smallEnd() const &noexcept
Get the smallend of the BoxND.
Definition: AMReX_Box.H:105
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
Returns the number of points contained in the BoxND.
Definition: AMReX_Box.H:346
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition: AMReX_FabArrayBase.H:94
int size() const noexcept
Return the number of FABs in the FabArray.
Definition: AMReX_FabArrayBase.H:109
int localindex(int K) const noexcept
Return local index in the vector of FABs.
Definition: AMReX_FabArrayBase.H:118
int local_size() const noexcept
Return the number of local FABs in the FabArray.
Definition: AMReX_FabArrayBase.H:112
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition: AMReX_FabArrayBase.H:130
CpOp
parallel copy or add
Definition: AMReX_FabArrayBase.H:393
An Array of FortranArrayBox(FAB)-like Objects.
Definition: AMReX_FabArray.H:344
typename std::conditional_t< IsBaseFab< FAB >::value, FAB, FABType >::value_type value_type
Definition: AMReX_FabArray.H:355
Array4< typename FabArray< FAB >::value_type const > array(const MFIter &mfi) const noexcept
Definition: AMReX_FabArray.H:1561
FAB & atLocalIdx(int L) noexcept
Return a reference to the FAB associated with local index L.
Definition: AMReX_FabArray.H:530
a one-thingy-per-box distributed object
Definition: AMReX_LayoutData.H:13
void define(const BoxArray &a_grids, const DistributionMapping &a_dm)
Definition: AMReX_LayoutData.H:25
Definition: AMReX_MFIter.H:57
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition: AMReX_MFIter.H:141
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition: AMReX_Vector.H:27
Long size() const noexcept
Definition: AMReX_Vector.H:50
@ FAB
Definition: AMReX_AmrvisConstants.H:86
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition: AMReX_GpuAtomic.H:511
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition: AMReX_GpuAtomic.H:281
AMREX_GPU_HOST_DEVICE Long size(T const &b) noexcept
integer version
Definition: AMReX_GpuRange.H:26
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition: AMReX_GpuContainers.H:233
static constexpr DeviceToHost deviceToHost
Definition: AMReX_GpuContainers.H:99
static constexpr HostToDevice hostToDevice
Definition: AMReX_GpuContainers.H:98
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
int MyProc()
Definition: AMReX_MPMD.cpp:117
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> unpack_recv_buffer_cpu(FabArray< FAB > &mf, int dcomp, int ncomp, Vector< char * > const &recv_data, Vector< std::size_t > const &recv_size, Vector< FabArrayBase::CopyComTagsContainer const * > const &recv_cctc, DTOS const &dtos=DTOS{}, Proj const &proj=Proj{}) noexcept
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> unpack_recv_buffer_gpu(FabArray< FAB > &mf, int scomp, int ncomp, Vector< char * > const &recv_data, Vector< std::size_t > const &recv_size, Vector< FabArrayBase::CopyComTagsContainer const * > const &recv_cctc, DTOS const &dtos=DTOS{}, Proj const &proj=Proj{})
bool sameTeam(int rank) noexcept
Definition: AMReX_ParallelDescriptor.H:329
bool UseGpuAwareMpi()
Definition: AMReX_ParallelDescriptor.H:111
static int f(amrex::Real t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:44
void ParallelFor_doit(Vector< TagType > const &tags, F &&f)
Definition: AMReX_TagParallelFor.H:170
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
BoxND< AMREX_SPACEDIM > Box
Definition: AMReX_BaseFwd.H:27
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4< T > makeArray4(T *p, Box const &bx, int ncomp) noexcept
Definition: AMReX_BaseFab.H:87
AMREX_ATTRIBUTE_FLATTEN_FOR void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition: AMReX_Loop.H:378
constexpr AMREX_GPU_HOST_DEVICE GpuTupleElement< I, GpuTuple< Ts... > >::type & get(GpuTuple< Ts... > &tup) noexcept
Definition: AMReX_Tuple.H:179
BoxArray const & boxArray(FabArrayBase const &fa)
IntVectND< AMREX_SPACEDIM > IntVect
Definition: AMReX_BaseFwd.H:30
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition: AMReX.H:111
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition: AMReX.cpp:225
Arena * The_Arena()
Definition: AMReX_Arena.cpp:609
Definition: AMReX_FabArrayCommI.H:896
void fab_to_fab(Vector< Array4CopyTag< T0, T1 > > const ©_tags, int scomp, int dcomp, int ncomp, F &&f)
Definition: AMReX_FBI.H:53
void fab_to_fab_atomic_add(Vector< Array4CopyTag< T0, T1 > > const ©_tags, int scomp, int dcomp, int ncomp, Vector< Array4Tag< int > > const &)
Definition: AMReX_FBI.H:190
void fab_to_fab_atomic_cpy(Vector< Array4CopyTag< T0, T1 > > const ©_tags, int scomp, int dcomp, int ncomp, Vector< Array4Tag< int > > const &)
Definition: AMReX_FBI.H:172
Definition: AMReX_FBI.H:5
IntVect offset
Definition: AMReX_FBI.H:8
FAB const * sfab
Definition: AMReX_FBI.H:6
Box dbox
Definition: AMReX_FBI.H:7
Definition: AMReX_FBI.H:11
char const * p
Definition: AMReX_FBI.H:12
Box dbox
Definition: AMReX_FBI.H:13
Definition: AMReX_TagParallelFor.H:57
Definition: AMReX_TagParallelFor.H:26
Definition: AMReX_TagParallelFor.H:49
Array4< T > dfab
Definition: AMReX_TagParallelFor.H:50
Definition: AMReX_Dim3.H:12
Used by a bunch of routines when communicating via MPI.
Definition: AMReX_FabArrayBase.H:194
Box sbox
Definition: AMReX_FabArrayBase.H:196
int srcIndex
Definition: AMReX_FabArrayBase.H:198
Box dbox
Definition: AMReX_FabArrayBase.H:195
int dstIndex
Definition: AMReX_FabArrayBase.H:197
FillBoundary.
Definition: AMReX_FabArrayBase.H:487
Definition: AMReX_TypeTraits.H:56
Definition: AMReX_TypeTraits.H:266
Definition: AMReX_FBI.H:32
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()(T0 *d, T1 s) const noexcept
Definition: AMReX_FBI.H:34
Definition: AMReX_FBI.H:42
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()(U0 *d, T1 s) const noexcept
Definition: AMReX_FBI.H:45
Definition: AMReX_FBI.H:22
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()(T0 *d, T1 s) const noexcept
Definition: AMReX_FBI.H:24