Block-Structured AMR Software Framework
AMReX_FBI.H
Go to the documentation of this file.
1 #ifndef AMREX_FBI_H_
2 #define AMREX_FBI_H_
3 
4 template <class FAB>
5 struct FabCopyTag {
6  FAB const* sfab;
8  IntVect offset; // sbox.smallEnd() - dbox.smallEnd()
9 };
10 
11 struct VoidCopyTag {
12  char const* p;
14 };
15 
16 namespace detail {
17 
18 #ifdef AMREX_USE_GPU
19 
20 template <class T0, class T1>
21 struct CellStore
22 {
24  operator() (T0* d, T1 s) const noexcept
25  {
26  *d = static_cast<T0>(s);
27  }
28 };
29 
30 template <class T0, class T1>
31 struct CellAdd
32 {
34  operator() (T0* d, T1 s) const noexcept
35  {
36  *d += static_cast<T0>(s);
37  }
38 };
39 
40 template <class T0, class T1>
42 {
43  template<class U0=T0, std::enable_if_t<amrex::HasAtomicAdd<U0>::value,int> = 0>
45  operator() (U0* d, T1 s) const noexcept
46  {
47  Gpu::Atomic::AddNoRet(d, static_cast<U0>(s));
48  }
49 };
50 
51 template <class T0, class T1, class F>
52 void
53 fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp, int ncomp,
54  F && f)
55 {
56  detail::ParallelFor_doit(copy_tags,
57  [=] AMREX_GPU_DEVICE (
58 #ifdef AMREX_USE_SYCL
59  sycl::nd_item<1> const& /*item*/,
60 #endif
61  int icell, int ncells, int i, int j, int k, Array4CopyTag<T0, T1> const tag) noexcept
62  {
63  if (icell < ncells) {
64  for (int n = 0; n < ncomp; ++n) {
65  f(&(tag.dfab(i,j,k,n+dcomp)),
66  tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
67  }
68  }
69  });
70 }
71 
72 template <class T0, class T1, class F>
73 void
74 fab_to_fab (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp, int dcomp,
75  int ncomp, F && f, Vector<Array4Tag<int> > const& masks)
76 {
77  using TagType = Array4MaskCopyTag<T0, T1>;
78  Vector<TagType> tags;
79  const int N = copy_tags.size();
80  tags.reserve(N);
81  for (int i = 0; i < N; ++i) {
82  tags.push_back(TagType{copy_tags[i].dfab, copy_tags[i].sfab, masks[i].dfab,
83  copy_tags[i].dbox, copy_tags[i].offset});
84  }
85 
86  amrex::Abort("xxxxx TODO This function still has a bug. Even if we fix the bug, it should still be avoided because it is slow due to the lack of atomic operations for this type.");
87 
89  [=] AMREX_GPU_DEVICE (
90 #ifdef AMREX_USE_SYCL
91  sycl::nd_item<1> const& item,
92 #endif
93  int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept
94  {
95 #ifdef AMREX_USE_SYCL
96  int g_tid = item.get_global_id(0);
97  int g_wid = g_tid / Gpu::Device::warp_size;
98 
99  int* m = (icell < ncells) ? tag.mask.ptr(i,j,k) : nullptr;
100  int mypriority = g_wid+1;
101  int to_try = 1;
102  while (true) {
103  int msk = (m && to_try) ? Gpu::Atomic::CAS(m, 0, mypriority) : 0;
104  if (sycl::all_of_group(item.get_sub_group(), msk == 0)) { // 0 means lock acquired
105  break; // all threads have acquired.
106  } else {
107  if (sycl::any_of_group(item.get_sub_group(), msk > mypriority)) {
108  if (m) { *m = 0; } // yield
109  sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
110  to_try = 1;
111  } else {
112  to_try = (msk > 0); // hold on to my lock
113  }
114  }
115  };
116 
117  if (icell < ncells) {
118  for (int n = 0; n < ncomp; ++n) {
119  f(&(tag.dfab(i,j,k,n+dcomp)),
120  tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
121  }
122  }
123 
124  if (m) *m = 0;
125 
126 #else
127 
128  int g_tid = blockDim.x*blockIdx.x + threadIdx.x;
129  int g_wid = g_tid / Gpu::Device::warp_size;
130 
131  int* m = (icell < ncells) ? tag.mask.ptr(i,j,k) : nullptr;
132  int mypriority = g_wid+1;
133  int to_try = 1;
134  while (true) {
135  int msk = (m && to_try) ? atomicCAS(m, 0, mypriority) : 0;
136 #ifdef AMREX_USE_CUDA
137  if (__all_sync(0xffffffff, msk == 0)) { // 0 means lock acquired
138 #elif defined(AMREX_USE_HIP)
139  if (__all(msk == 0)) {
140 #endif
141  break; // all threads have acquired.
142  } else {
143 #ifdef AMREX_USE_CUDA
144  if (__any_sync(0xffffffff, msk > mypriority)) {
145 #elif defined(AMREX_USE_HIP)
146  if (__any(msk > mypriority)) {
147 #endif
148  if (m) *m = 0; // yield
149  __threadfence();
150  to_try = 1;
151  } else {
152  to_try = (msk > 0); // hold on to my lock
153  }
154  }
155  };
156 
157  if (icell < ncells) {
158  for (int n = 0; n < ncomp; ++n) {
159  f(&(tag.dfab(i,j,k,n+dcomp)),
160  tag.sfab(i+tag.offset.x,j+tag.offset.y,k+tag.offset.z,n+scomp));
161  }
162  }
163 
164  if (m) *m = 0;
165 #endif
166  });
167 }
168 
169 template <typename T0, typename T1,
170  std::enable_if_t<amrex::IsStoreAtomic<T0>::value,int> = 0>
171 void
172 fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
173  int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
174 {
175  fab_to_fab<T0, T1>(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>());
176 }
177 
178 template <typename T0, typename T1,
179  std::enable_if_t<!amrex::IsStoreAtomic<T0>::value,int> = 0>
180 void
181 fab_to_fab_atomic_cpy (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
182  int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
183 {
184  fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellStore<T0, T1>(), masks);
185 }
186 
187 template <typename T0, typename T1,
188  std::enable_if_t<amrex::HasAtomicAdd<T0>::value,int> = 0>
189 void
190 fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
191  int dcomp, int ncomp, Vector<Array4Tag<int> > const&)
192 {
193  fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAtomicAdd<T0, T1>());
194 }
195 
196 template <typename T0, typename T1,
197  std::enable_if_t<!amrex::HasAtomicAdd<T0>::value,int> = 0>
198 void
199 fab_to_fab_atomic_add (Vector<Array4CopyTag<T0, T1> > const& copy_tags, int scomp,
200  int dcomp, int ncomp, Vector<Array4Tag<int> > const& masks)
201 {
202  fab_to_fab(copy_tags, scomp, dcomp, ncomp, CellAdd<T0, T1>(), masks);
203 }
204 
205 #endif /* AMREX_USE_GPU */
206 
207 }
208 
209 template <class FAB>
210 void
211 FabArray<FAB>::FB_local_copy_cpu (const FB& TheFB, int scomp, int ncomp)
212 {
213  auto const& LocTags = *(TheFB.m_LocTags);
214  auto N_locs = static_cast<int>(LocTags.size());
215  if (N_locs == 0) { return; }
216  bool is_thread_safe = TheFB.m_threadsafe_loc;
217  if (is_thread_safe)
218  {
219 #ifdef AMREX_USE_OMP
220 #pragma omp parallel for
221 #endif
222  for (int i = 0; i < N_locs; ++i)
223  {
224  const CopyComTag& tag = LocTags[i];
225 
226  BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
227  BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
228 
229  const FAB* sfab = &(get(tag.srcIndex));
230  FAB* dfab = &(get(tag.dstIndex));
231  dfab->template copy<RunOn::Host>(*sfab, tag.sbox, scomp, tag.dbox, scomp, ncomp);
232  }
233  }
234  else
235  {
237  for (int i = 0; i < N_locs; ++i)
238  {
239  const CopyComTag& tag = LocTags[i];
240 
241  BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
242  BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
243 
244  loc_copy_tags[tag.dstIndex].push_back
245  ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
246  }
247 #ifdef AMREX_USE_OMP
248 #pragma omp parallel
249 #endif
250  for (MFIter mfi(*this); mfi.isValid(); ++mfi)
251  {
252  const auto& tags = loc_copy_tags[mfi];
253  auto dfab = this->array(mfi);
254  for (auto const & tag : tags)
255  {
256  auto const sfab = tag.sfab->array();
257  const auto offset = tag.offset.dim3();
258  amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
259  [=] (int i, int j, int k, int n) noexcept
260  {
261  dfab(i,j,k,n+scomp) = sfab(i+offset.x,j+offset.y,k+offset.z,n+scomp);
262  });
263  }
264  }
265  }
266 }
267 
268 #ifdef AMREX_USE_GPU
269 
270 template <class FAB>
271 void
272 FabArray<FAB>::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp)
273 {
274  auto const& LocTags = *(TheFB.m_LocTags);
275  int N_locs = LocTags.size();
276  if (N_locs == 0) { return; }
277  bool is_thread_safe = TheFB.m_threadsafe_loc;
278 
279  using TagType = Array4CopyTag<value_type>;
280  Vector<TagType> loc_copy_tags;
281  loc_copy_tags.reserve(N_locs);
282 
283  Vector<BaseFab<int> > maskfabs;
284  Vector<Array4Tag<int> > masks;
285  if (!amrex::IsStoreAtomic<value_type>::value && !is_thread_safe)
286  {
287  maskfabs.resize(this->local_size());
288  masks.reserve(N_locs);
289  }
290 
291  for (int i = 0; i < N_locs; ++i)
292  {
293  const CopyComTag& tag = LocTags[i];
294 
295  BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
296  BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
297 
298  int li = this->localindex(tag.dstIndex);
299  loc_copy_tags.push_back
300  ({this->atLocalIdx(li).array(),
301  this->fabPtr(tag.srcIndex)->const_array(),
302  tag.dbox,
303  (tag.sbox.smallEnd()-tag.dbox.smallEnd()).dim3()});
304 
305  if (maskfabs.size() > 0) {
306  if (!maskfabs[li].isAllocated()) {
307  maskfabs[li].resize(this->atLocalIdx(li).box());
308  }
309  masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
310  }
311  }
312 
313  if (maskfabs.size() > 0) {
314  amrex::ParallelFor(masks,
315  [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
316  {
317  msk.dfab(i,j,k) = 0;
318  });
319  }
320 
321  if (is_thread_safe) {
322  detail::fab_to_fab<value_type, value_type>(loc_copy_tags, scomp, scomp,
324  } else {
325  detail::fab_to_fab_atomic_cpy<value_type, value_type>(
326  loc_copy_tags, scomp, scomp, ncomp, masks);
327  }
328 }
329 
330 template <class FAB>
331 void
333  const CommMetaData& thecmd, int scomp, int ncomp)
334 {
335  auto const& LocTags = *(thecmd.m_LocTags);
336  int N_locs = LocTags.size();
337  if (N_locs == 0) { return; }
338  bool is_thread_safe = thecmd.m_threadsafe_loc;
339 
340  using TagType = Array4BoxTag<value_type>;
341  Vector<TagType> loc_setval_tags;
342  loc_setval_tags.reserve(N_locs);
343 
345 
346  for (int i = 0; i < N_locs; ++i)
347  {
348  const CopyComTag& tag = LocTags[i];
349  BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
350  loc_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
351  }
352 
353  amrex::ParallelFor(loc_setval_tags, ncomp,
354  [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
355  {
356  tag.dfab(i,j,k,n+scomp) = x;
357  });
358 }
359 
360 template <class FAB>
361 void
363  const CommMetaData& thecmd, int scomp, int ncomp)
364 {
365  auto const& RcvTags = *(thecmd.m_RcvTags);
366  bool is_thread_safe = thecmd.m_threadsafe_rcv;
367 
368  using TagType = Array4BoxTag<value_type>;
369  Vector<TagType> rcv_setval_tags;
370 
371  for (auto it = RcvTags.begin(); it != RcvTags.end(); ++it) {
372  for (auto const& tag: it->second) {
373  rcv_setval_tags.push_back({this->array(tag.dstIndex), tag.dbox});
374  }
375  }
376 
377  if (rcv_setval_tags.empty()) { return; }
378 
380 
381  amrex::ParallelFor(rcv_setval_tags, ncomp,
382  [x,scomp] AMREX_GPU_DEVICE (int i, int j, int k, int n, TagType const& tag) noexcept
383  {
384  tag.dfab(i,j,k,n+scomp) = x;
385  });
386 }
387 
388 #if defined(__CUDACC__) && defined (AMREX_USE_CUDA)
389 template <class FAB>
390 void
391 FabArray<FAB>::FB_local_copy_cuda_graph_1 (const FB& TheFB, int scomp, int ncomp)
392 {
393  const int N_locs = (*TheFB.m_LocTags).size();
395  for (int i = 0; i < N_locs; ++i)
396  {
397  const CopyComTag& tag = (*TheFB.m_LocTags)[i];
398 
399  BL_ASSERT(distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc());
400  BL_ASSERT(distributionMap[tag.srcIndex] == ParallelDescriptor::MyProc());
401 
402  loc_copy_tags[tag.dstIndex].push_back
403  ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
404  }
405 
406  // Create Graph if one is needed.
407  if ( !(TheFB.m_localCopy.ready()) )
408  {
409  const_cast<FB&>(TheFB).m_localCopy.resize(N_locs);
410 
411  int idx = 0;
412  // Record the graph.
413  for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
414  {
415  amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
416  const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
417  (TheFB).m_localCopy.getDevicePtr(0),
418  std::size_t(sizeof(CopyMemory)*N_locs) );
419 
420  const auto& tags = loc_copy_tags[mfi];
421  for (auto const & tag : tags)
422  {
423  const auto offset = tag.offset.dim3();
424  CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
425  AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
426  {
427  // Build the Array4's.
428  auto const dst = cmem->getDst<value_type>();
429  auto const src = cmem->getSrc<value_type>();
430  for (int n = 0; n < cmem->ncomp; ++n) {
431  dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
432  }
433  });
434  }
435 
436  bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
437  cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
438  if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
439  }
440  }
441 
442  // Setup Launch Parameters
443  // This is perfectly threadable, right?
444  // Additional optimization -> Check to see whether values need to be reset?
445  // Can then remove this setup and memcpy from CudaGraph::executeGraph.
446  int idx = 0;
447  for (MFIter mfi(*this); mfi.isValid(); ++mfi)
448  {
449  auto const dst_array = this->array(mfi);
450  const auto& tags = loc_copy_tags[mfi];
451  for (auto const & tag : tags)
452  {
453  const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
454  dst_array,
455  scomp, ncomp));
456  }
457  }
458 
459  // Launch Graph
460  TheFB.m_localCopy.executeGraph();
461 }
462 
463 #ifdef AMREX_USE_MPI
464 template <class FAB>
465 void
466 FabArray<FAB>::FB_local_copy_cuda_graph_n (const FB& TheFB, int scomp, int ncomp)
467 {
468  const int N_locs = TheFB.m_LocTags->size();
469 
470  int launches = 0; // Used for graphs only.
471  LayoutData<Vector<FabCopyTag<FAB> > > loc_copy_tags(boxArray(),DistributionMap());
472  for (int i = 0; i < N_locs; ++i)
473  {
474  const CopyComTag& tag = (*TheFB.m_LocTags)[i];
475 
476  BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.dstIndex]));
477  BL_ASSERT(ParallelDescriptor::sameTeam(distributionMap[tag.srcIndex]));
478 
479  if (distributionMap[tag.dstIndex] == ParallelDescriptor::MyProc())
480  {
481  loc_copy_tags[tag.dstIndex].push_back
482  ({this->fabPtr(tag.srcIndex), tag.dbox, tag.sbox.smallEnd()-tag.dbox.smallEnd()});
483  launches++;
484  }
485  }
486 
487  FillBoundary_test();
488 
489  if ( !(TheFB.m_localCopy.ready()) )
490  {
491  const_cast<FB&>(TheFB).m_localCopy.resize(launches);
492 
493  int idx = 0;
494  int cuda_stream = 0;
495  for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
496  {
497  const auto& tags = loc_copy_tags[mfi];
498  for (int t = 0; t<tags.size(); ++t)
499  {
500  Gpu::Device::setStreamIndex(cuda_stream++);
501  amrex::Gpu::Device::startGraphRecording( (idx == 0),
502  const_cast<FB&>(TheFB).m_localCopy.getHostPtr(0),
503  (TheFB).m_localCopy.getDevicePtr(0),
504  std::size_t(sizeof(CopyMemory)*launches) );
505 
506  const auto& tag = tags[t];
507  const Dim3 offset = tag.offset.dim3();
508 
509  CopyMemory* cmem = TheFB.m_localCopy.getDevicePtr(idx++);
510  AMREX_HOST_DEVICE_FOR_3D(tag.dbox, i, j, k,
511  {
512  auto const dst = cmem->getDst<value_type>();
513  auto const src = cmem->getSrc<value_type>();
514  for (int n = 0; n < cmem->ncomp; ++n) {
515  dst(i,j,k,(cmem->scomp)+n) = src(i+offset.x,j+offset.y,k+offset.z,(cmem->scomp)+n);
516  }
517  });
518 
519  bool last_iter = idx == launches;
520  cudaGraphExec_t graphExec = Gpu::Device::stopGraphRecording(last_iter);
521  if (last_iter) { const_cast<FB&>(TheFB).m_localCopy.setGraph( graphExec ); }
522  }
523  }
524  }
525 
526  // Setup Launch Parameters
527  // This is perfectly threadable, right?
528  int idx = 0;
529  for (MFIter mfi(*this); mfi.isValid(); ++mfi)
530  {
531  const auto& dst_array = this->array(mfi);
532  const auto& tags = loc_copy_tags[mfi];
533  for (auto const & tag : tags)
534  {
535  const_cast<FB&>(TheFB).m_localCopy.setParams(idx++, makeCopyMemory(tag.sfab->array(),
536  dst_array,
537  scomp, ncomp));
538  }
539  }
540 
541  // Launch Graph without synch. Local work is entirely independent.
542  TheFB.m_localCopy.executeGraph(false);
543 }
544 #endif /* AMREX_USE_MPI */
545 
546 #endif /* __CUDACC__ */
547 
548 #endif /* AMREX_USE_GPU */
549 
550 #ifdef AMREX_USE_MPI
551 
552 #ifdef AMREX_USE_GPU
553 
554 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)
555 
556 template <class FAB>
557 void
558 FabArray<FAB>::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int ncomp,
559  Vector<char*>& send_data,
560  Vector<std::size_t> const& send_size,
561  Vector<typename FabArray<FAB>::CopyComTagsContainer const*> const& send_cctc)
562 {
563  const int N_snds = send_data.size();
564  if (N_snds == 0) { return; }
565 
566  if ( !(TheFB.m_copyToBuffer.ready()) )
567  {
568  // Set size of CudaGraph buffer.
569  // Is the conditional ever expected false?
570  int launches = 0;
571  for (int send = 0; send < N_snds; ++send) {
572  if (send_size[send] > 0) {
573  launches += send_cctc[send]->size();
574  }
575  }
576  const_cast<FB&>(TheFB).m_copyToBuffer.resize(launches);
577 
578  // Record the graph.
579  int idx = 0;
580  for (Gpu::StreamIter sit(N_snds,Gpu::StreamItInfo().DisableDeviceSync());
581  sit.isValid(); ++sit)
582  {
583  amrex::Gpu::Device::startGraphRecording( (sit() == 0),
584  const_cast<FB&>(TheFB).m_copyToBuffer.getHostPtr(0),
585  (TheFB).m_copyToBuffer.getDevicePtr(0),
586  std::size_t(sizeof(CopyMemory)*launches) );
587 
588  const int j = sit();
589  if (send_size[j] > 0)
590  {
591  auto const& cctc = *send_cctc[j];
592  for (auto const& tag : cctc)
593  {
594  const Box& bx = tag.sbox;
595  CopyMemory* cmem = TheFB.m_copyToBuffer.getDevicePtr(idx++);
596  AMREX_HOST_DEVICE_FOR_3D (bx, ii, jj, kk,
597  {
598  auto const pfab = cmem->getDst<value_type>();
599  auto const sfab = cmem->getSrc<value_type>();
600  for (int n = 0; n < cmem->ncomp; ++n)
601  {
602  pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+(cmem->scomp));
603  }
604  });
605  }
606  }
607 
608  bool last_iter = sit() == (N_snds-1);
609  cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
610  if (last_iter) { const_cast<FB&>(TheFB).m_copyToBuffer.setGraph( graphExec ); }
611  }
612  }
613 
614  // Setup Launch Parameters
615  int idx = 0;
616  for (int send = 0; send < N_snds; ++send)
617  {
618  const int j = send;
619  if (send_size[j] > 0)
620  {
621  char* dptr = send_data[j];
622  auto const& cctc = *send_cctc[j];
623  for (auto const& tag : cctc)
624  {
625  const_cast<FB&>(TheFB).m_copyToBuffer.setParams(idx++, makeCopyMemory(this->array(tag.srcIndex),
626  amrex::makeArray4((value_type*)(dptr),
627  tag.sbox,
628  ncomp),
629  scomp, ncomp));
630 
631  dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type));
632  }
633  amrex::ignore_unused(send_size);
634  BL_ASSERT(dptr <= send_data[j] + send_size[j]);
635  }
636  }
637 
638  // Launch Graph synched, so copyToBuffer is complete prior to posting sends.
639  TheFB.m_copyToBuffer.executeGraph();
640 }
641 
642 template <class FAB>
643 void
644 FabArray<FAB>::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int ncomp,
645  Vector<char*> const& recv_data,
646  Vector<std::size_t> const& recv_size,
647  Vector<CopyComTagsContainer const*> const& recv_cctc,
648  bool /*is_thread_safe*/)
649 {
650  const int N_rcvs = recv_cctc.size();
651  if (N_rcvs == 0) { return; }
652 
653  int launches = 0;
654  LayoutData<Vector<VoidCopyTag> > recv_copy_tags(boxArray(),DistributionMap());
655  for (int k = 0; k < N_rcvs; ++k)
656  {
657  if (recv_size[k] > 0)
658  {
659  const char* dptr = recv_data[k];
660  auto const& cctc = *recv_cctc[k];
661  for (auto const& tag : cctc)
662  {
663  recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
664  dptr += tag.dbox.numPts() * ncomp * sizeof(value_type);
665  launches++;
666  }
667  amrex::ignore_unused(recv_size);
668  BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
669  }
670  }
671 
672  if ( !(TheFB.m_copyFromBuffer.ready()) )
673  {
674  const_cast<FB&>(TheFB).m_copyFromBuffer.resize(launches);
675 
676  int idx = 0;
677  for (MFIter mfi(*this, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi)
678  {
679  amrex::Gpu::Device::startGraphRecording( (mfi.LocalIndex() == 0),
680  const_cast<FB&>(TheFB).m_copyFromBuffer.getHostPtr(0),
681  (TheFB).m_copyFromBuffer.getDevicePtr(0),
682  std::size_t(sizeof(CopyMemory)*launches) );
683 
684  const auto& tags = recv_copy_tags[mfi];
685  for (auto const & tag : tags)
686  {
687  CopyMemory* cmem = TheFB.m_copyFromBuffer.getDevicePtr(idx++);
688  AMREX_HOST_DEVICE_FOR_3D (tag.dbox, i, j, k,
689  {
690  auto const pfab = cmem->getSrc<value_type>();
691  auto const dfab = cmem->getDst<value_type>();
692  for (int n = 0; n < cmem->ncomp; ++n)
693  {
694  dfab(i,j,k,n+(cmem->scomp)) = pfab(i,j,k,n);
695  }
696  });
697  }
698 
699  bool last_iter = mfi.LocalIndex() == (this->local_size()-1);
700  cudaGraphExec_t graphExec = amrex::Gpu::Device::stopGraphRecording(last_iter);
701  if (last_iter) { const_cast<FB&>(TheFB).m_copyFromBuffer.setGraph( graphExec ); }
702  }
703  }
704 
705  // Setup graph.
706  int idx = 0;
707  for (MFIter mfi(*this); mfi.isValid(); ++mfi)
708  {
709  auto dst_array = this->array(mfi);
710  const auto & tags = recv_copy_tags[mfi];
711  for (auto const & tag : tags)
712  {
713  const_cast<FB&>(TheFB).m_copyFromBuffer.setParams(idx++, makeCopyMemory(amrex::makeArray4((value_type*)(tag.p),
714  tag.dbox,
715  ncomp),
716  dst_array,
717  dcomp, ncomp));
718  }
719  }
720 
721  // Launch Graph - synced because next action is freeing recv buffer.
722  TheFB.m_copyFromBuffer.executeGraph();
723 }
724 
725 #endif /* __CUDACC__ */
726 
727 template <class FAB>
728 template <typename BUF>
729 void
730 FabArray<FAB>::pack_send_buffer_gpu (FabArray<FAB> const& src, int scomp, int ncomp,
731  Vector<char*> const& send_data,
732  Vector<std::size_t> const& send_size,
733  Vector<CopyComTagsContainer const*> const& send_cctc)
734 {
735  amrex::ignore_unused(send_size);
736 
737  const int N_snds = send_data.size();
738  if (N_snds == 0) { return; }
739 
740  char* pbuffer = send_data[0];
741  std::size_t szbuffer = 0;
742 #if 0
743  // For linear solver test on summit, this is slower than writing to
744  // pinned memory directly on device.
746  // Memory in send_data is pinned.
747  szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1];
748  pbuffer = (char*)The_Arena()->alloc(szbuffer);
749  }
750 #endif
751 
752  using TagType = Array4CopyTag<BUF, value_type>;
753  Vector<TagType> snd_copy_tags;
754  for (int j = 0; j < N_snds; ++j)
755  {
756  if (send_size[j] > 0)
757  {
758  std::size_t offset = send_data[j]-send_data[0];
759  char* dptr = pbuffer + offset;
760  auto const& cctc = *send_cctc[j];
761  for (auto const& tag : cctc)
762  {
763  snd_copy_tags.emplace_back(TagType{
764  amrex::makeArray4((BUF*)(dptr), tag.sbox, ncomp),
765  src.array(tag.srcIndex),
766  tag.sbox,
767  Dim3{0,0,0}
768  });
769  dptr += (tag.sbox.numPts() * ncomp * sizeof(BUF));
770  }
771  BL_ASSERT(dptr <= pbuffer + offset + send_size[j]);
772  }
773  }
774 
775  detail::fab_to_fab<BUF, value_type>(snd_copy_tags, scomp, 0, ncomp,
777 
778  // There is Gpu::streamSynchronize in fab_to_fab.
779 
780  if (pbuffer != send_data[0]) {
781  Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]);
783  The_Arena()->free(pbuffer);
784  }
785 }
786 
787 template <class FAB>
788 template <typename BUF>
789 void
791  Vector<char*> const& recv_data,
792  Vector<std::size_t> const& recv_size,
793  Vector<CopyComTagsContainer const*> const& recv_cctc,
794  CpOp op, bool is_thread_safe)
795 {
796  amrex::ignore_unused(recv_size);
797 
798  const int N_rcvs = recv_cctc.size();
799  if (N_rcvs == 0) { return; }
800 
801  char* pbuffer = recv_data[0];
802 #if 0
803  std::size_t szbuffer = 0;
804  // For linear solver test on summit, this is slower than writing to
805  // pinned memory directly on device.
807  // Memory in recv_data is pinned.
808  szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1];
809  pbuffer = (char*)The_Arena()->alloc(szbuffer);
810  Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer);
812  }
813 #endif
814 
815  using TagType = Array4CopyTag<value_type, BUF>;
816  Vector<TagType> recv_copy_tags;
817  recv_copy_tags.reserve(N_rcvs);
818 
819  Vector<BaseFab<int> > maskfabs;
820  Vector<Array4Tag<int> > masks;
821  if (!is_thread_safe)
822  {
823  if ((op == FabArrayBase::COPY && !amrex::IsStoreAtomic<value_type>::value) ||
824  (op == FabArrayBase::ADD && !amrex::HasAtomicAdd <value_type>::value))
825  {
826  maskfabs.resize(dst.local_size());
827  }
828  }
829 
830  for (int k = 0; k < N_rcvs; ++k)
831  {
832  if (recv_size[k] > 0)
833  {
834  std::size_t offset = recv_data[k]-recv_data[0];
835  const char* dptr = pbuffer + offset;
836  auto const& cctc = *recv_cctc[k];
837  for (auto const& tag : cctc)
838  {
839  const int li = dst.localindex(tag.dstIndex);
840  recv_copy_tags.emplace_back(TagType{
841  dst.atLocalIdx(li).array(),
842  amrex::makeArray4((BUF const*)(dptr), tag.dbox, ncomp),
843  tag.dbox,
844  Dim3{0,0,0}
845  });
846  dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
847 
848  if (maskfabs.size() > 0) {
849  if (!maskfabs[li].isAllocated()) {
850  maskfabs[li].resize(dst.atLocalIdx(li).box());
851  }
852  masks.emplace_back(Array4Tag<int>{maskfabs[li].array()});
853  }
854  }
855  BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]);
856  }
857  }
858 
859  if (maskfabs.size() > 0) {
860  amrex::ParallelFor(masks,
861  [=] AMREX_GPU_DEVICE (int i, int j, int k, Array4Tag<int> const& msk) noexcept
862  {
863  msk.dfab(i,j,k) = 0;
864  });
865  }
866 
867  if (op == FabArrayBase::COPY)
868  {
869  if (is_thread_safe) {
870  detail::fab_to_fab<value_type, BUF>(
871  recv_copy_tags, 0, dcomp, ncomp, detail::CellStore<value_type, BUF>());
872  } else {
873  detail::fab_to_fab_atomic_cpy<value_type, BUF>(
874  recv_copy_tags, 0, dcomp, ncomp, masks);
875  }
876  }
877  else
878  {
879  if (is_thread_safe) {
880  detail::fab_to_fab<value_type, BUF>(
881  recv_copy_tags, 0, dcomp, ncomp, detail::CellAdd<value_type, BUF>());
882  } else {
883  detail::fab_to_fab_atomic_add<value_type, BUF>(
884  recv_copy_tags, 0, dcomp, ncomp, masks);
885  }
886  }
887 
888  // There is Gpu::streamSynchronize in fab_to_fab.
889 
890  if (pbuffer != recv_data[0]) {
891  The_Arena()->free(pbuffer);
892  }
893 }
894 
895 #endif /* AMREX_USE_GPU */
896 
897 template <class FAB>
898 template <typename BUF>
899 void
900 FabArray<FAB>::pack_send_buffer_cpu (FabArray<FAB> const& src, int scomp, int ncomp,
901  Vector<char*> const& send_data,
902  Vector<std::size_t> const& send_size,
903  Vector<CopyComTagsContainer const*> const& send_cctc)
904 {
905  amrex::ignore_unused(send_size);
906 
907  auto const N_snds = static_cast<int>(send_data.size());
908  if (N_snds == 0) { return; }
909 
910 #ifdef AMREX_USE_OMP
911 #pragma omp parallel for
912 #endif
913  for (int j = 0; j < N_snds; ++j)
914  {
915  if (send_size[j] > 0)
916  {
917  char* dptr = send_data[j];
918  auto const& cctc = *send_cctc[j];
919  for (auto const& tag : cctc)
920  {
921  const Box& bx = tag.sbox;
922  auto const sfab = src.array(tag.srcIndex);
923  auto pfab = amrex::makeArray4((BUF*)(dptr),bx,ncomp);
924  amrex::LoopConcurrentOnCpu( bx, ncomp,
925  [=] (int ii, int jj, int kk, int n) noexcept
926  {
927  pfab(ii,jj,kk,n) = static_cast<BUF>(sfab(ii,jj,kk,n+scomp));
928  });
929  dptr += (bx.numPts() * ncomp * sizeof(BUF));
930  }
931  BL_ASSERT(dptr <= send_data[j] + send_size[j]);
932  }
933  }
934 }
935 
936 template <class FAB>
937 template <typename BUF>
938 void
940  Vector<char*> const& recv_data,
941  Vector<std::size_t> const& recv_size,
942  Vector<CopyComTagsContainer const*> const& recv_cctc,
943  CpOp op, bool is_thread_safe)
944 {
945  amrex::ignore_unused(recv_size);
946 
947  auto const N_rcvs = static_cast<int>(recv_cctc.size());
948  if (N_rcvs == 0) { return; }
949 
950  if (is_thread_safe)
951  {
952 #ifdef AMREX_USE_OMP
953 #pragma omp parallel for
954 #endif
955  for (int k = 0; k < N_rcvs; ++k)
956  {
957  if (recv_size[k] > 0)
958  {
959  const char* dptr = recv_data[k];
960  auto const& cctc = *recv_cctc[k];
961  for (auto const& tag : cctc)
962  {
963  const Box& bx = tag.dbox;
964  FAB& dfab = dst[tag.dstIndex];
965  if (op == FabArrayBase::COPY)
966  {
967  dfab.template copyFromMem<RunOn::Host, BUF>(bx, dcomp, ncomp, dptr);
968  }
969  else
970  {
971  dfab.template addFromMem<RunOn::Host, BUF>(tag.dbox, dcomp, ncomp, dptr);
972  }
973  dptr += bx.numPts() * ncomp * sizeof(BUF);
974  }
975  BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
976  }
977  }
978  }
979  else
980  {
981  LayoutData<Vector<VoidCopyTag> > recv_copy_tags;
982  recv_copy_tags.define(dst.boxArray(),dst.DistributionMap());
983  for (int k = 0; k < N_rcvs; ++k)
984  {
985  if (recv_size[k] > 0)
986  {
987  const char* dptr = recv_data[k];
988  auto const& cctc = *recv_cctc[k];
989  for (auto const& tag : cctc)
990  {
991  recv_copy_tags[tag.dstIndex].push_back({dptr,tag.dbox});
992  dptr += tag.dbox.numPts() * ncomp * sizeof(BUF);
993  }
994  BL_ASSERT(dptr <= recv_data[k] + recv_size[k]);
995  }
996  }
997 
998 #ifdef AMREX_USE_OMP
999 #pragma omp parallel
1000 #endif
1001  for (MFIter mfi(dst); mfi.isValid(); ++mfi)
1002  {
1003  const auto& tags = recv_copy_tags[mfi];
1004  auto dfab = dst.array(mfi);
1005  for (auto const & tag : tags)
1006  {
1007  auto pfab = amrex::makeArray4((BUF*)(tag.p), tag.dbox, ncomp);
1008  if (op == FabArrayBase::COPY)
1009  {
1010  amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1011  [=] (int i, int j, int k, int n) noexcept
1012  {
1013  dfab(i,j,k,n+dcomp) = pfab(i,j,k,n);
1014  });
1015  }
1016  else
1017  {
1018  amrex::LoopConcurrentOnCpu(tag.dbox, ncomp,
1019  [=] (int i, int j, int k, int n) noexcept
1020  {
1021  dfab(i,j,k,n+dcomp) += pfab(i,j,k,n);
1022  });
1023  }
1024  }
1025  }
1026  }
1027 }
1028 
1029 #endif /* AMREX_USE_MPI */
1030 
1031 #endif
#define BL_ASSERT(EX)
Definition: AMReX_BLassert.H:39
#define AMREX_ALWAYS_ASSERT(EX)
Definition: AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:119
#define AMREX_HOST_DEVICE_FOR_3D(...)
Definition: AMReX_GpuLaunch.nolint.H:50
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition: AMReX_HypreMLABecLap.cpp:1089
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
AMREX_GPU_HOST_DEVICE const IntVectND< dim > & smallEnd() const &noexcept
Get the smallend of the BoxND.
Definition: AMReX_Box.H:105
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
Returns the number of points contained in the BoxND.
Definition: AMReX_Box.H:346
const BoxArray & boxArray() const noexcept
Return a constant reference to the BoxArray that defines the valid region associated with this FabArr...
Definition: AMReX_FabArrayBase.H:94
int size() const noexcept
Return the number of FABs in the FabArray.
Definition: AMReX_FabArrayBase.H:109
int localindex(int K) const noexcept
Return local index in the vector of FABs.
Definition: AMReX_FabArrayBase.H:118
int local_size() const noexcept
Return the number of local FABs in the FabArray.
Definition: AMReX_FabArrayBase.H:112
const DistributionMapping & DistributionMap() const noexcept
Return constant reference to associated DistributionMapping.
Definition: AMReX_FabArrayBase.H:130
CpOp
parallel copy or add
Definition: AMReX_FabArrayBase.H:393
An Array of FortranArrayBox(FAB)-like Objects.
Definition: AMReX_FabArray.H:344
typename std::conditional_t< IsBaseFab< FAB >::value, FAB, FABType >::value_type value_type
Definition: AMReX_FabArray.H:355
Array4< typename FabArray< FAB >::value_type const > array(const MFIter &mfi) const noexcept
Definition: AMReX_FabArray.H:1561
FAB & atLocalIdx(int L) noexcept
Return a reference to the FAB associated with local index L.
Definition: AMReX_FabArray.H:530
a one-thingy-per-box distributed object
Definition: AMReX_LayoutData.H:13
void define(const BoxArray &a_grids, const DistributionMapping &a_dm)
Definition: AMReX_LayoutData.H:25
Definition: AMReX_MFIter.H:57
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition: AMReX_MFIter.H:141
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition: AMReX_Vector.H:27
Long size() const noexcept
Definition: AMReX_Vector.H:50
@ FAB
Definition: AMReX_AmrvisConstants.H:86
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS(T *const address, T compare, T const val) noexcept
Definition: AMReX_GpuAtomic.H:511
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition: AMReX_GpuAtomic.H:281
AMREX_GPU_HOST_DEVICE Long size(T const &b) noexcept
integer version
Definition: AMReX_GpuRange.H:26
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition: AMReX_GpuContainers.H:233
static constexpr DeviceToHost deviceToHost
Definition: AMReX_GpuContainers.H:99
static constexpr HostToDevice hostToDevice
Definition: AMReX_GpuContainers.H:98
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
int MyProc()
Definition: AMReX_MPMD.cpp:117
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> unpack_recv_buffer_cpu(FabArray< FAB > &mf, int dcomp, int ncomp, Vector< char * > const &recv_data, Vector< std::size_t > const &recv_size, Vector< FabArrayBase::CopyComTagsContainer const * > const &recv_cctc, DTOS const &dtos=DTOS{}, Proj const &proj=Proj{}) noexcept
std::enable_if_t< IsBaseFab< FAB >) &&IsCallableR< Dim3, DTOS, Dim3 >) &&IsFabProjection< Proj, FAB >)> unpack_recv_buffer_gpu(FabArray< FAB > &mf, int scomp, int ncomp, Vector< char * > const &recv_data, Vector< std::size_t > const &recv_size, Vector< FabArrayBase::CopyComTagsContainer const * > const &recv_cctc, DTOS const &dtos=DTOS{}, Proj const &proj=Proj{})
bool sameTeam(int rank) noexcept
Definition: AMReX_ParallelDescriptor.H:329
bool UseGpuAwareMpi()
Definition: AMReX_ParallelDescriptor.H:111
static int f(amrex::Real t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:44
void ParallelFor_doit(Vector< TagType > const &tags, F &&f)
Definition: AMReX_TagParallelFor.H:170
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200
DistributionMapping const & DistributionMap(FabArrayBase const &fa)
BoxND< AMREX_SPACEDIM > Box
Definition: AMReX_BaseFwd.H:27
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4< T > makeArray4(T *p, Box const &bx, int ncomp) noexcept
Definition: AMReX_BaseFab.H:87
AMREX_ATTRIBUTE_FLATTEN_FOR void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition: AMReX_Loop.H:378
constexpr AMREX_GPU_HOST_DEVICE GpuTupleElement< I, GpuTuple< Ts... > >::type & get(GpuTuple< Ts... > &tup) noexcept
Definition: AMReX_Tuple.H:179
BoxArray const & boxArray(FabArrayBase const &fa)
IntVectND< AMREX_SPACEDIM > IntVect
Definition: AMReX_BaseFwd.H:30
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition: AMReX.H:111
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition: AMReX.cpp:225
Arena * The_Arena()
Definition: AMReX_Arena.cpp:609
Definition: AMReX_FabArrayCommI.H:896
void fab_to_fab(Vector< Array4CopyTag< T0, T1 > > const &copy_tags, int scomp, int dcomp, int ncomp, F &&f)
Definition: AMReX_FBI.H:53
void fab_to_fab_atomic_add(Vector< Array4CopyTag< T0, T1 > > const &copy_tags, int scomp, int dcomp, int ncomp, Vector< Array4Tag< int > > const &)
Definition: AMReX_FBI.H:190
void fab_to_fab_atomic_cpy(Vector< Array4CopyTag< T0, T1 > > const &copy_tags, int scomp, int dcomp, int ncomp, Vector< Array4Tag< int > > const &)
Definition: AMReX_FBI.H:172
Definition: AMReX_FBI.H:5
IntVect offset
Definition: AMReX_FBI.H:8
FAB const * sfab
Definition: AMReX_FBI.H:6
Box dbox
Definition: AMReX_FBI.H:7
Definition: AMReX_FBI.H:11
char const * p
Definition: AMReX_FBI.H:12
Box dbox
Definition: AMReX_FBI.H:13
Definition: AMReX_TagParallelFor.H:57
Definition: AMReX_TagParallelFor.H:26
Definition: AMReX_TagParallelFor.H:49
Array4< T > dfab
Definition: AMReX_TagParallelFor.H:50
Definition: AMReX_Dim3.H:12
Definition: AMReX_FabArrayBase.H:471
bool m_threadsafe_loc
Definition: AMReX_FabArrayBase.H:473
bool m_threadsafe_rcv
Definition: AMReX_FabArrayBase.H:474
std::unique_ptr< MapOfCopyComTagContainers > m_RcvTags
Definition: AMReX_FabArrayBase.H:477
std::unique_ptr< CopyComTagsContainer > m_LocTags
Definition: AMReX_FabArrayBase.H:475
Used by a bunch of routines when communicating via MPI.
Definition: AMReX_FabArrayBase.H:194
Box sbox
Definition: AMReX_FabArrayBase.H:196
int srcIndex
Definition: AMReX_FabArrayBase.H:198
Box dbox
Definition: AMReX_FabArrayBase.H:195
int dstIndex
Definition: AMReX_FabArrayBase.H:197
FillBoundary.
Definition: AMReX_FabArrayBase.H:487
Definition: AMReX_TypeTraits.H:56
Definition: AMReX_TypeTraits.H:266
Definition: AMReX_FBI.H:32
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()(T0 *d, T1 s) const noexcept
Definition: AMReX_FBI.H:34
Definition: AMReX_FBI.H:42
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()(U0 *d, T1 s) const noexcept
Definition: AMReX_FBI.H:45
Definition: AMReX_FBI.H:22
AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()(T0 *d, T1 s) const noexcept
Definition: AMReX_FBI.H:24