Block-Structured AMR Software Framework
AMReX_ParticleUtil.H
Go to the documentation of this file.
1 #ifndef AMREX_PARTICLEUTIL_H_
2 #define AMREX_PARTICLEUTIL_H_
3 #include <AMReX_Config.H>
4 
5 #include <AMReX_IntVect.H>
6 #include <AMReX_Box.H>
7 #include <AMReX_Gpu.H>
8 #include <AMReX_Print.H>
9 #include <AMReX_MakeParticle.H>
10 #include <AMReX_Math.H>
11 #include <AMReX_MFIter.H>
12 #include <AMReX_ParGDB.H>
13 #include <AMReX_ParticleTile.H>
15 #include <AMReX_TypeTraits.H>
16 #include <AMReX_Scan.H>
17 
18 #include <limits>
19 
20 namespace amrex {
21 
32 template <class Iterator, std::enable_if_t<IsParticleIterator<Iterator>::value, int> foo = 0>
33 int
34 numParticlesOutOfRange (Iterator const& pti, int nGrow)
35 {
36  return numParticlesOutOfRange(pti,
37  IntVect(AMREX_D_DECL(nGrow, nGrow, nGrow)));
38 }
39 
50 template <class Iterator, std::enable_if_t<IsParticleIterator<Iterator>::value && !Iterator::ContainerType::ParticleType::is_soa_particle, int> foo = 0>
51 int
52 numParticlesOutOfRange (Iterator const& pti, IntVect nGrow)
53 {
54  using ParticleType = typename Iterator::ContainerType::ParticleType;
55 
56  const auto& tile = pti.GetParticleTile();
57  const auto np = tile.numParticles();
58  const auto& ptd = tile.getConstParticleTileData();
59  const auto& geom = pti.Geom(pti.GetLevel());
60 
61  const auto& domain = geom.Domain();
62  const auto& plo = geom.ProbLoArray();
63  const auto& dxi = geom.InvCellSizeArray();
64 
65  Box box = pti.tilebox();
66  box.grow(nGrow);
67 
68  ReduceOps<ReduceOpSum> reduce_op;
69  ReduceData<int> reduce_data(reduce_op);
70  using ReduceTuple = typename decltype(reduce_data)::Type;
71 
72  reduce_op.eval(np, reduce_data,
73  [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple
74  {
75  auto p = make_particle<ParticleType>{}(ptd,i);
76  if ((p.id() < 0)) { return false; }
77  using AssignorType = typename Iterator::CellAssignor;
78  AssignorType assignor;
79  IntVect iv = assignor(p, plo, dxi, domain);
80  return !box.contains(iv);
81  });
82  int hv = amrex::get<0>(reduce_data.value(reduce_op));
83  return hv;
84 }
85 
86 template <class Iterator, std::enable_if_t<IsParticleIterator<Iterator>::value && Iterator::ContainerType::ParticleType::is_soa_particle, int> foo = 0>
87 int
88 numParticlesOutOfRange (Iterator const& pti, IntVect nGrow)
89 {
90  using ParticleType = typename Iterator::ContainerType::ConstParticleType;
91 
92  const auto& tile = pti.GetParticleTile();
93  const auto tile_data = tile.getConstParticleTileData();
94  const auto np = tile.numParticles();
95  const auto& geom = pti.Geom(pti.GetLevel());
96 
97  const auto domain = geom.Domain();
98  const auto plo = geom.ProbLoArray();
99  const auto dxi = geom.InvCellSizeArray();
100 
101  Box box = pti.tilebox();
102  box.grow(nGrow);
103 
104  ReduceOps<ReduceOpSum> reduce_op;
105  ReduceData<int> reduce_data(reduce_op);
106  using ReduceTuple = typename decltype(reduce_data)::Type;
107 
108  reduce_op.eval(np, reduce_data,
109  [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple
110  {
111  ParticleType p(tile_data,i);
112  if ((p.id() < 0)) { return false; }
113  using AssignorType = typename Iterator::CellAssignor;
114  AssignorType assignor;
115  IntVect iv = assignor(p, plo, dxi, domain);
116  return !box.contains(iv);
117  });
118  int hv = amrex::get<0>(reduce_data.value(reduce_op));
119  return hv;
120 }
121 
134 template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
135 int
136 numParticlesOutOfRange (PC const& pc, int nGrow)
137 {
138  return numParticlesOutOfRange(pc, 0, pc.finestLevel(), nGrow);
139 }
140 
153 template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
154 int
155 numParticlesOutOfRange (PC const& pc, IntVect nGrow)
156 {
157  return numParticlesOutOfRange(pc, 0, pc.finestLevel(), nGrow);
158 }
159 
174 template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
175 int
176 numParticlesOutOfRange (PC const& pc, int lev_min, int lev_max, int nGrow)
177 {
178  BL_PROFILE("numParticlesOutOfRange()");
179 
180  return numParticlesOutOfRange(pc, lev_min, lev_max,
181  IntVect(AMREX_D_DECL(nGrow, nGrow, nGrow)));
182 }
183 
198 template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
199 int
200 numParticlesOutOfRange (PC const& pc, int lev_min, int lev_max, IntVect nGrow)
201 {
202  BL_PROFILE("numParticlesOutOfRange()");
203 
204  using ParIter = typename PC::ParConstIterType;
205  int num_wrong = 0;
206  for (int lev = lev_min; lev <= lev_max; ++lev)
207  {
208 #ifdef AMREX_USE_OMP
209 #pragma omp parallel if (Gpu::notInLaunchRegion() && !system::regtest_reduction) reduction(+:num_wrong)
210 #endif
211  for(ParIter pti(pc, lev); pti.isValid(); ++pti)
212  {
213  num_wrong += numParticlesOutOfRange(pti, nGrow);
214  }
215  }
217 
218  return num_wrong;
219 }
220 
222 int getTileIndex (const IntVect& iv, const Box& box, const bool a_do_tiling,
223  const IntVect& a_tile_size, Box& tbx)
224 {
225  if (a_do_tiling == false) {
226  tbx = box;
227  return 0;
228  } else {
229  //
230  // This function must be consistent with FabArrayBase::buildTileArray function!!!
231  //
232  auto tiling_1d = [](int i, int lo, int hi, int tilesize,
233  int& ntile, int& tileidx, int& tlo, int& thi) {
234  int ncells = hi-lo+1;
235  ntile = amrex::max(ncells/tilesize, 1);
236  int ts_right = ncells/ntile;
237  int ts_left = ts_right+1;
238  int nleft = ncells - ntile*ts_right;
239  int ii = i - lo;
240  int nbndry = nleft*ts_left;
241  if (ii < nbndry) {
242  tileidx = ii / ts_left; // tiles on the left of nbndry have size of ts_left
243  tlo = lo + tileidx * ts_left;
244  thi = tlo + ts_left - 1;
245  } else {
246  tileidx = nleft + (ii-nbndry) / ts_right; // tiles on the right: ts_right
247  tlo = lo + tileidx * ts_right + nleft;
248  thi = tlo + ts_right - 1;
249  }
250  };
251  const IntVect& small = box.smallEnd();
252  const IntVect& big = box.bigEnd();
253  IntVect ntiles, ivIndex, tilelo, tilehi;
254 
255  AMREX_D_TERM(int iv0 = amrex::min(amrex::max(iv[0], small[0]), big[0]);,
256  int iv1 = amrex::min(amrex::max(iv[1], small[1]), big[1]);,
257  int iv2 = amrex::min(amrex::max(iv[2], small[2]), big[2]););
258 
259  AMREX_D_TERM(tiling_1d(iv0, small[0], big[0], a_tile_size[0], ntiles[0], ivIndex[0], tilelo[0], tilehi[0]);,
260  tiling_1d(iv1, small[1], big[1], a_tile_size[1], ntiles[1], ivIndex[1], tilelo[1], tilehi[1]);,
261  tiling_1d(iv2, small[2], big[2], a_tile_size[2], ntiles[2], ivIndex[2], tilelo[2], tilehi[2]););
262 
263  tbx = Box(tilelo, tilehi);
264 
265  return AMREX_D_TERM(ivIndex[0], + ntiles[0]*ivIndex[1], + ntiles[0]*ntiles[1]*ivIndex[2]);
266  }
267 }
268 
270 int numTilesInBox (const Box& box, const bool a_do_tiling, const IntVect& a_tile_size)
271 {
272  if (a_do_tiling == false) {
273  return 1;
274  } else {
275  //
276  // This function must be consistent with FabArrayBase::buildTileArray function!!!
277  //
278  auto tiling_1d = [](int lo, int hi, int tilesize, int& ntile) {
279  int ncells = hi-lo+1;
280  ntile = amrex::max(ncells/tilesize, 1);
281  };
282 
283  const IntVect& small = box.smallEnd();
284  const IntVect& big = box.bigEnd();
285  IntVect ntiles;
286 
287  AMREX_D_TERM(tiling_1d(small[0], big[0], a_tile_size[0], ntiles[0]);,
288  tiling_1d(small[1], big[1], a_tile_size[1], ntiles[1]);,
289  tiling_1d(small[2], big[2], a_tile_size[2], ntiles[2]););
290 
291  return AMREX_D_TERM(ntiles[0], *=ntiles[1], *=ntiles[2]);
292  }
293 }
294 
295 struct BinMapper
296 {
297  BinMapper(const int* off_bins_p,
298  const GpuArray<Real,AMREX_SPACEDIM>* dxi_p,
299  const GpuArray<Real,AMREX_SPACEDIM>* plo_p,
300  const Dim3* lo_p,
301  const Dim3* hi_p,
302  int* bin_type_array=nullptr)
303  : m_off_bins_p(off_bins_p), m_dxi_p(dxi_p), m_plo_p(plo_p) ,
304  m_lo_p(lo_p) , m_hi_p(hi_p) , m_bin_type_array(bin_type_array) {}
305 
306  template <typename T>
308  unsigned int operator() (const T& ptd, int i) const
309  {
310  auto p = ptd[i];
311  int type = (m_bin_type_array) ? m_bin_type_array[i] : 0;
312  int offset = m_off_bins_p[type];
313 
314  AMREX_D_TERM(AMREX_ASSERT((p.pos(0)-m_plo_p[type][0])*m_dxi_p[type][0] - m_lo_p[type].x >= 0.0);,
315  AMREX_ASSERT((p.pos(1)-m_plo_p[type][1])*m_dxi_p[type][1] - m_lo_p[type].y >= 0.0);,
316  AMREX_ASSERT((p.pos(2)-m_plo_p[type][2])*m_dxi_p[type][2] - m_lo_p[type].z >= 0.0));
317 
318  auto iv = IntVect(AMREX_D_DECL(static_cast<int>(amrex::Math::floor((p.pos(0)-m_plo_p[type][0])*m_dxi_p[type][0])) - m_lo_p[type].x,
319  static_cast<int>(amrex::Math::floor((p.pos(1)-m_plo_p[type][1])*m_dxi_p[type][1])) - m_lo_p[type].y,
320  static_cast<int>(amrex::Math::floor((p.pos(2)-m_plo_p[type][2])*m_dxi_p[type][2])) - m_lo_p[type].z));
321  auto iv3 = iv.dim3();
322  int nx = m_hi_p[type].x-m_lo_p[type].x+1;
323  int ny = m_hi_p[type].y-m_lo_p[type].y+1;
324  int nz = m_hi_p[type].z-m_lo_p[type].z+1;
325  int uix = amrex::min(nx-1,amrex::max(0,iv3.x));
326  int uiy = amrex::min(ny-1,amrex::max(0,iv3.y));
327  int uiz = amrex::min(nz-1,amrex::max(0,iv3.z));
328  return static_cast<unsigned int>( (uix * ny + uiy) * nz + uiz + offset );
329  }
330 
331 private:
332  const int* m_off_bins_p;
335  const Dim3* m_lo_p;
336  const Dim3* m_hi_p;
338 };
339 
341 {
347 
348  template <typename ParticleType>
350  unsigned int operator() (const ParticleType& p) const noexcept
351  {
352  Box tbx;
353  auto iv = getParticleCell(p, plo, dxi, domain);
354  auto tid = getTileIndex(iv, box, true, bin_size, tbx);
355  return static_cast<unsigned int>(tid);
356  }
357 };
358 
372 template <typename P>
377 {
378  IntVect iv(
379  AMREX_D_DECL(int(amrex::Math::floor((p.pos(0)-plo[0])*dxi[0])),
380  int(amrex::Math::floor((p.pos(1)-plo[1])*dxi[1])),
381  int(amrex::Math::floor((p.pos(2)-plo[2])*dxi[2]))));
382  return iv;
383 }
384 
399 template <typename P>
404  const Box& domain) noexcept
405 {
406  IntVect iv = getParticleCell(p, plo, dxi);
407  iv += domain.smallEnd();
408  return iv;
409 }
410 
411 template <typename PTD>
413 IntVect getParticleCell (PTD const& ptd, int i,
416  const Box& domain) noexcept
417 {
418  if constexpr (PTD::ParticleType::is_soa_particle)
419  {
420  IntVect iv(
421  AMREX_D_DECL(int(amrex::Math::floor((ptd.m_rdata[0][i]-plo[0])*dxi[0])),
422  int(amrex::Math::floor((ptd.m_rdata[1][i]-plo[1])*dxi[1])),
423  int(amrex::Math::floor((ptd.m_rdata[2][i]-plo[2])*dxi[2]))));
424  iv += domain.smallEnd();
425  return iv;
426  } else {
427  return getParticleCell(ptd.m_aos[i], plo, dxi, domain);;
428  }
429 }
430 
432 {
433 
434  template <typename P>
439  const Box& domain) const noexcept
440  {
441  return getParticleCell(p, plo, dxi, domain);
442  }
443 };
444 
445 template <typename P>
447 int getParticleGrid (P const& p, amrex::Array4<int> const& mask,
450  const Box& domain) noexcept
451 {
452  if (p.id() < 0) { return -1; }
453  IntVect iv = getParticleCell(p, plo, dxi, domain);
454  return mask(iv);
455 }
456 
457 template <typename P>
464  amrex::GpuArray<int,AMREX_SPACEDIM> const& is_per) noexcept
465 {
466  bool shifted = false;
467  for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
468  {
469  if (! is_per[idim]) { continue; }
470  if (p.pos(idim) > rhi[idim]) {
471  while (p.pos(idim) > rhi[idim]) {
472  p.pos(idim) -= static_cast<ParticleReal>(phi[idim] - plo[idim]);
473  }
474  // clamp to avoid precision issues;
475  if (p.pos(idim) < rlo[idim]) {
476  p.pos(idim) = rlo[idim];
477  }
478  shifted = true;
479  }
480  else if (p.pos(idim) < rlo[idim]) {
481  while (p.pos(idim) < rlo[idim]) {
482  p.pos(idim) += static_cast<ParticleReal>(phi[idim] - plo[idim]);
483  }
484  // clamp to avoid precision issues;
485  if (p.pos(idim) > rhi[idim]) {
486  p.pos(idim) = rhi[idim];
487  }
488  shifted = true;
489  }
490  AMREX_ASSERT( (p.pos(idim) >= rlo[idim] ) && ( p.pos(idim) <= rhi[idim] ));
491  }
492 
493  return shifted;
494 }
495 
508 template <typename PTile, typename ParFunc>
509 int
510 partitionParticles (PTile& ptile, ParFunc const& is_left)
511 {
512  const int np = ptile.numParticles();
513  if (np == 0) { return 0; }
514 
515  auto ptd = ptile.getParticleTileData();
516 
517  const int num_left = Reduce::Sum<int>(np,
518  [=] AMREX_GPU_DEVICE (int i) -> int
519  {
520  return int(is_left(ptd, i));
521  });
522 
523  // The ptile will be partitioned into left [0, num_left-1] and right [num_left, np-1].
524  //
525  // Note that currently the number of particles in [0, num_left-1] that should belong to the
526  // right partition is equal to the number of particles in [num_left, np-1] that should belong
527  // in the left partition. We will define num_swaps to be this number. This is the minimum
528  // number of swaps that need to be performed to partition the ptile in place for any algorithm.
529  //
530  // From this it is easy to see that
531  // max_num_swaps = min(size([0, num_left-1]), size([num_left, np-1]))
532  // is an upper bound for num_swaps.
533 
534  const int max_num_swaps = std::min(num_left, np - num_left);
535  if (max_num_swaps == 0) { return num_left; }
536 
537  Gpu::DeviceVector<int> index_left(max_num_swaps);
538  Gpu::DeviceVector<int> index_right(max_num_swaps);
539  int * const p_index_left = index_left.dataPtr();
540  int * const p_index_right = index_right.dataPtr();
541 
542  // The num_swaps particles that are in [0, num_left-1] but should be moved to the right
543  // partition are at the same time the first num_swaps particles for which is_left is false
544  // in all the ptile.
545  // Similarly, the num_swaps particles in [num_left, np-1] that should be moved to the left
546  // partition are the last num_swaps particles of the ptile for which is_left is true.
547  //
548  // The PrefixSum is used to find exactly these particles and store their indexes in
549  // index_left and index_right. Since num_swaps is not known, the first max_num_swaps
550  // particles are stored instead. Here, dst = num_left-1-(i-s) is used to effectively reverse
551  // the PrefixSum to store the last particles for which is_left is true.
552  //
553  // This way, all indexes in index_right are in ascending order, and all indexes in
554  // index_left are in descending order.
555 
556  Scan::PrefixSum<int>(np,
557  [=] AMREX_GPU_DEVICE (int i) -> int
558  {
559  return int(!is_left(ptd, i));
560  },
561  [=] AMREX_GPU_DEVICE (int i, int const& s)
562  {
563  if (!is_left(ptd, i)) {
564  int dst = s;
565  if (dst < max_num_swaps) {
566  p_index_right[dst] = i;
567  }
568  } else {
569  int dst = num_left-1-(i-s); // avoid integer overflow
570  if (dst < max_num_swaps) {
571  p_index_left[dst] = i;
572  }
573  }
574  },
576 
577  // Finally, the particles are swapped. Since max_num_swaps is only an upper bound for num_swaps,
578  // some swaps should not be performed and need to be skipped. This is the case if the index
579  // in index_left[i] is already in the left partition or the index in index_right[i] is already
580  // in the right partition. These two cases coincide for the same i because index_right is in
581  // ascending order and index_left in descending order. This means for both index_left and
582  // index_right the first num_swaps particles need to be swapped, and the particles after that
583  // should be skipped.
584  //
585  // The check right_i < left_i makes sure that the particle going to the right partition is
586  // actually coming from the left partition, which has a lower index than the other particle and
587  // visa-versa.
588  //
589  // Since exactly num_swaps swap operations are performed in the end, which is the smallest
590  // number possible, this algorithm is optimal in the number of swap operations.
591  // This results in good performance in practice if the size of a particle is large enough that
592  // it compensates for the extra kernel launches and evaluations of is_left which this
593  // algorithm needs.
594 
595  ParallelFor(max_num_swaps,
596  [=] AMREX_GPU_DEVICE (int i)
597  {
598  int left_i = p_index_left[i];
599  int right_i = p_index_right[i];
600  if (right_i < left_i) {
601  swapParticle(ptd, ptd, left_i, right_i);
602  }
603  });
604 
605  Gpu::streamSynchronize(); // for index_left and index_right deallocation
606 
607  return num_left;
608 }
609 
610 template <typename PTile>
611 void
613 {
614  const int new_size = partitionParticles(ptile,
615  [=] AMREX_GPU_DEVICE (auto& ptd, int i) {
616  return ptd.id(i).is_valid();
617  });
618  ptile.resize(new_size);
619 }
620 
621 #if defined(AMREX_USE_GPU)
622 
623 template <typename PTile, typename PLocator, typename CellAssignor>
624 int
625 partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const& assignor,
626  const ParticleBufferMap& pmap,
631  const GpuArray<int ,AMREX_SPACEDIM>& is_per,
632  int lev, int gid, int /*tid*/,
633  int lev_min, int lev_max, int nGrow, bool remove_negative)
634 {
635  auto getPID = pmap.getPIDFunctor();
636  int pid = ParallelContext::MyProcSub();
637 
638  Gpu::DeviceVector<uint8_t> particle_stays(ptile.numParticles());
639  uint8_t * const p_particle_stays = particle_stays.dataPtr();
640  auto ptd = ptile.getParticleTileData();
641 
642  // the function for determining if a particle stays on this grid is very slow,
643  // so we cache it in particle_stays to avoid evaluating it multiple times.
644  ParallelFor(ptile.numParticles(),
645  [=] AMREX_GPU_DEVICE (int i)
646  {
647  int assigned_grid;
648  int assigned_lev;
649 
650  if (!ptd.id(i).is_valid())
651  {
652  assigned_grid = -1;
653  assigned_lev = -1;
654  }
655  else
656  {
657  auto p_prime = ptd.getSuperParticle(i);
658  enforcePeriodic(p_prime, plo, phi, rlo, rhi, is_per);
659  auto tup_prime = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
660  assigned_grid = amrex::get<0>(tup_prime);
661  assigned_lev = amrex::get<1>(tup_prime);
662  if (assigned_grid >= 0)
663  {
664  AMREX_D_TERM(ptd.pos(0, i) = p_prime.pos(0);,
665  ptd.pos(1, i) = p_prime.pos(1);,
666  ptd.pos(2, i) = p_prime.pos(2););
667  }
668  else if (lev_min > 0)
669  {
670  AMREX_D_TERM(p_prime.pos(0) = ptd.pos(0, i);,
671  p_prime.pos(1) = ptd.pos(1, i);,
672  p_prime.pos(2) = ptd.pos(2, i););
673  auto tup = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
674  assigned_grid = amrex::get<0>(tup);
675  assigned_lev = amrex::get<1>(tup);
676  }
677  }
678 
679  p_particle_stays[i] = uint8_t(
680  ((remove_negative == false) && (!ptd.id(i).is_valid())) ||
681  ((assigned_grid == gid) && (assigned_lev == lev) && (getPID(lev, gid) == pid)));
682  });
683 
684  return partitionParticles(ptile,
685  [=] AMREX_GPU_DEVICE (auto& /* ptd */, int i) -> bool {
686  return p_particle_stays[i];
687  });
688 }
689 
690 #endif
691 
692 template <class PC1, class PC2>
693 bool SameIteratorsOK (const PC1& pc1, const PC2& pc2) {
694  if (pc1.numLevels() != pc2.numLevels()) {return false;}
695  if (pc1.do_tiling != pc2.do_tiling) {return false;}
696  if (pc1.tile_size != pc2.tile_size) {return false;}
697  for (int lev = 0; lev < pc1.numLevels(); ++lev) {
698  if (pc1.ParticleBoxArray(lev) != pc2.ParticleBoxArray(lev)) {return false;}
699  if (pc1.ParticleDistributionMap(lev) != pc2.ParticleDistributionMap(lev)) {return false;}
700  }
701  return true;
702 }
703 
704 template <class PC>
705 void EnsureThreadSafeTiles(PC& pc) {
706  using Iter = typename PC::ParIterType;
707  for (int lev = 0; lev < pc.numLevels(); ++lev) {
708  for (Iter pti(pc, lev); pti.isValid(); ++pti) {
709  pc.DefineAndReturnParticleTile(lev, pti);
710  }
711  }
712 }
713 
714 IntVect computeRefFac (const ParGDBBase* a_gdb, int src_lev, int lev);
715 
716 Vector<int> computeNeighborProcs (const ParGDBBase* a_gdb, int ngrow);
717 
718 namespace particle_detail
719 {
720 template <typename C>
721 void clearEmptyEntries (C& c)
722 {
723  for (auto c_it = c.begin(); c_it != c.end(); /* no ++ */)
724  {
725  if (c_it->second.empty()) { c.erase(c_it++); }
726  else { ++c_it; }
727  }
728 }
729 }
730 
731 template <class index_type, typename F>
733  index_type nbins, F const& f)
734 {
735  BL_PROFILE("PermutationForDeposition()");
736 
737 #if defined(AMREX_USE_HIP)
738  // MI250X has a small L2 cache and is more tolerant of atomic add contention,
739  // so we use a small block size of 64 and the compressed layout.
740  static constexpr index_type gpu_block_size = 64;
741  static constexpr bool compressed_layout = true;
742 #else
743  // A100 has a larger L2 cache and is very sensitive to atomic add contention,
744  // so we use a large bock size of 1024 and not the compressed layout.
745  static constexpr index_type gpu_block_size = 1024;
746  static constexpr bool compressed_layout = false;
747 #endif
748 
749  static constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
750  static constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
751 
752  // round up to gpu_block_size
753  nbins = (nbins + gpu_block_size_m1) / gpu_block_size * gpu_block_size;
754 
755  Gpu::DeviceVector<index_type> llist_start(nbins, llist_guard);
756  Gpu::DeviceVector<index_type> llist_next(nitems);
757  perm.resize(nitems);
758  Gpu::DeviceScalar<index_type> global_idx(0);
759 
760  index_type* pllist_start = llist_start.dataPtr();
761  index_type* pllist_next = llist_next.dataPtr();
762  index_type* pperm = perm.dataPtr();
763  index_type* pglobal_idx = global_idx.dataPtr();
764 
765  amrex::ParallelFor(nitems, [=] AMREX_GPU_DEVICE (index_type i) noexcept
766  {
767  i = nitems - i - 1;
768  pllist_next[i] = Gpu::Atomic::Exch(pllist_start + f(i), i);
769  });
770 
771 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
772  amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
773  [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
774  __shared__ index_type sdata[gpu_block_size];
775  __shared__ index_type global_idx_start;
776  __shared__ index_type idx_start;
777 
778  index_type current_idx = 0;
779 
780  if constexpr (compressed_layout) {
781  // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
782  // are put right next to each other, while without the compressed layout,
783  // there can be other particle sweeps from different locations between them.
784  current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
785 
786  index_type num_particles_thread = 0;
787  while (current_idx != llist_guard) {
788  ++num_particles_thread;
789  current_idx = pllist_next[current_idx];
790  }
791 
792  index_type num_particles_block =
793  Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
794 
795  if (threadIdx.x == 0) {
796  global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
797  }
798  }
799 
800  current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
801 
802  while (true) {
803  sdata[threadIdx.x] = index_type(current_idx != llist_guard);
804  index_type x = 0;
805 
806  // simple block wide prefix sum
807  for (index_type i = 1; i<gpu_block_size; i*=2) {
808  __syncthreads();
809  if (threadIdx.x >= i) {
810  x = sdata[threadIdx.x - i];
811  }
812  __syncthreads();
813  if (threadIdx.x >= i) {
814  sdata[threadIdx.x] += x;
815  }
816  }
817  __syncthreads();
818  if (sdata[gpu_block_size_m1] == 0) {
819  break;
820  }
821  if (threadIdx.x == gpu_block_size_m1) {
822  if constexpr (compressed_layout) {
823  idx_start = global_idx_start;
824  global_idx_start += sdata[gpu_block_size_m1];
825  } else {
826  idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
827  }
828  }
829  __syncthreads();
830  sdata[threadIdx.x] += idx_start;
831  if (current_idx != llist_guard) {
832  pperm[sdata[threadIdx.x] - 1] = current_idx;
833  current_idx = pllist_next[current_idx];
834  }
835  }
836  });
837 #else
838  amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
839  Abort("PermutationForDeposition only implemented for CUDA and HIP");
840 #endif
841 
843 }
844 
845 template <class index_type, class PTile>
847  const PTile& ptile, Box bx, Geometry geom, const IntVect idx_type)
848 {
849  AMREX_ALWAYS_ASSERT(idx_type.allGE(0) && idx_type.allLE(2));
850 
851  const IntVect refine_vect = max(idx_type, IntVect(1)).min(IntVect(2));
852  const IntVect type_vect = idx_type - idx_type / 2 * 2;
853 
854  geom.refine(refine_vect);
855 
856  Box domain = geom.Domain();
857 
858  bx.convert(type_vect);
859  domain.convert(type_vect);
860 
861  const RealVect dxi(geom.InvCellSize());
862  const RealVect pos_offset = Real(0.5) * (RealVect(geom.ProbLo()) + RealVect(geom.ProbHi())
863  - RealVect(geom.CellSize()) * RealVect(domain.smallEnd() + domain.bigEnd()));
864 
865  const int ref_product = AMREX_D_TERM(refine_vect[0], * refine_vect[1], * refine_vect[2]);
866  const IntVect ref_offset(AMREX_D_DECL(1, refine_vect[0], refine_vect[0] * refine_vect[1]));
867 
868  auto ptd = ptile.getConstParticleTileData();
869  using ParticleType = typename PTile::ParticleType::ConstType;
870  PermutationForDeposition<index_type>(perm, nitems, bx.numPts() * ref_product,
871  [=] AMREX_GPU_DEVICE (index_type idx) noexcept
872  {
873  const auto& p = make_particle<ParticleType>{}(ptd,idx);
874 
875  IntVect iv = ((p.pos() - pos_offset) * dxi).round();
876 
877  IntVect iv_coarse = iv / refine_vect;
878  IntVect iv_remainder = iv - iv_coarse * refine_vect;
879 
880  iv_coarse = iv_coarse.max(bx.smallEnd());
881  iv_coarse = iv_coarse.min(bx.bigEnd());
882  return bx.index(iv_coarse) + bx.numPts() * (iv_remainder * ref_offset).sum();
883  });
884 }
885 
886 
887 #ifdef AMREX_USE_HDF5_ASYNC
888 void async_vol_es_wait_particle();
889 void async_vol_es_wait_close_particle();
890 #endif
891 }
892 
893 #endif // include guard
#define BL_PROFILE(a)
Definition: AMReX_BLProfiler.H:551
#define AMREX_ASSERT(EX)
Definition: AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition: AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition: AMReX_Extension.H:119
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition: AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition: AMReX_HypreMLABecLap.cpp:1089
Array4< int const > mask
Definition: AMReX_InterpFaceRegister.cpp:93
#define AMREX_D_TERM(a, b, c)
Definition: AMReX_SPACE.H:129
#define AMREX_D_DECL(a, b, c)
Definition: AMReX_SPACE.H:104
if(!(yy_init))
Definition: amrex_iparser.lex.nolint.H:935
AMREX_GPU_HOST_DEVICE const IntVectND< dim > & smallEnd() const &noexcept
Get the smallend of the BoxND.
Definition: AMReX_Box.H:105
AMREX_GPU_HOST_DEVICE BoxND & grow(int i) noexcept
Definition: AMReX_Box.H:627
AMREX_GPU_HOST_DEVICE Long index(const IntVectND< dim > &v) const noexcept
Returns offset of point from smallend; i.e. index(smallend) -> 0, bigend would return numPts()-1....
Definition: AMReX_Box.H:993
AMREX_GPU_HOST_DEVICE BoxND & convert(IndexTypeND< dim > typ) noexcept
Convert the BoxND from the current type into the argument type. This may change the BoxND coordinates...
Definition: AMReX_Box.H:912
AMREX_GPU_HOST_DEVICE const IntVectND< dim > & bigEnd() const &noexcept
Get the bigend.
Definition: AMReX_Box.H:116
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
Returns the number of points contained in the BoxND.
Definition: AMReX_Box.H:346
AMREX_GPU_HOST_DEVICE bool contains(const IntVectND< dim > &p) const noexcept
Returns true if argument is contained within BoxND.
Definition: AMReX_Box.H:204
const Real * InvCellSize() const noexcept
Returns the inverse cellsize for each coordinate direction.
Definition: AMReX_CoordSys.H:82
const Real * CellSize() const noexcept
Returns the cellsize for each coordinate direction.
Definition: AMReX_CoordSys.H:71
Rectangular problem domain geometry.
Definition: AMReX_Geometry.H:73
const Real * ProbLo() const noexcept
Returns the lo end of the problem domain in each dimension.
Definition: AMReX_Geometry.H:178
void refine(IntVect const &rr)
Definition: AMReX_Geometry.H:411
const Box & Domain() const noexcept
Returns our rectangular domain.
Definition: AMReX_Geometry.H:210
const Real * ProbHi() const noexcept
Returns the hi end of the problem domain in each dimension.
Definition: AMReX_Geometry.H:180
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool allGE(const IntVectND< dim > &rhs) const noexcept
Returns true if this is greater than or equal to argument for all components. NOTE: This is NOT a str...
Definition: AMReX_IntVect.H:443
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool allLE(const IntVectND< dim > &rhs) const noexcept
Returns true if this is less than or equal to argument for all components. NOTE: This is NOT a strict...
Definition: AMReX_IntVect.H:393
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int min() const noexcept
minimum (no absolute values) value
Definition: AMReX_IntVect.H:225
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int max() const noexcept
maximum (no absolute values) value
Definition: AMReX_IntVect.H:214
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition: AMReX_MFIter.H:141
Definition: AMReX_PODVector.H:246
void resize(size_type a_new_size)
Definition: AMReX_PODVector.H:625
T * dataPtr() noexcept
Definition: AMReX_PODVector.H:597
Definition: AMReX_ParIter.H:113
Definition: AMReX_ParticleBufferMap.H:53
GetPID getPIDFunctor() const noexcept
Definition: AMReX_ParticleBufferMap.H:156
A Real vector in SpaceDim-dimensional space.
Definition: AMReX_RealVect.H:32
Definition: AMReX_Reduce.H:249
Type value()
Definition: AMReX_Reduce.H:281
Definition: AMReX_Reduce.H:364
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition: AMReX_Reduce.H:441
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Exch(T *address, T val) noexcept
Definition: AMReX_GpuAtomic.H:485
void streamSynchronize() noexcept
Definition: AMReX_GpuDevice.H:237
gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:218
void Sum(T &v, MPI_Comm comm)
Definition: AMReX_ParallelReduce.H:204
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition: AMReX_ParallelContext.H:70
int MyProcSub() noexcept
my sub-rank in current frame
Definition: AMReX_ParallelContext.H:76
static constexpr struct amrex::Scan::Type::Exclusive exclusive
static constexpr RetSum noRetSum
Definition: AMReX_Scan.H:30
static int f(amrex::Real t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:44
@ min
Definition: AMReX_ParallelReduce.H:18
@ max
Definition: AMReX_ParallelReduce.H:17
@ sum
Definition: AMReX_ParallelReduce.H:19
static constexpr int P
Definition: AMReX_OpenBC.H:14
void clearEmptyEntries(C &c)
Definition: AMReX_ParticleUtil.H:721
Definition: AMReX_Amr.cpp:49
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void swapParticle(const ParticleTileData< T_ParticleType, NAR, NAI > &dst, const ParticleTileData< T_ParticleType, NAR, NAI > &src, int src_i, int dst_i) noexcept
A general single particle swapping routine that can run on the GPU.
Definition: AMReX_ParticleTransformation.H:119
BoxND< AMREX_SPACEDIM > Box
Definition: AMReX_BaseFwd.H:27
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE IntVect getParticleCell(P const &p, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &plo, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &dxi) noexcept
Returns the cell index for a given particle using the provided lower bounds and cell sizes.
Definition: AMReX_ParticleUtil.H:374
int partitionParticlesByDest(PTile &ptile, const PLocator &ploc, CellAssignor const &assignor, const ParticleBufferMap &pmap, const GpuArray< Real, AMREX_SPACEDIM > &plo, const GpuArray< Real, AMREX_SPACEDIM > &phi, const GpuArray< ParticleReal, AMREX_SPACEDIM > &rlo, const GpuArray< ParticleReal, AMREX_SPACEDIM > &rhi, const GpuArray< int, AMREX_SPACEDIM > &is_per, int lev, int gid, int, int lev_min, int lev_max, int nGrow, bool remove_negative)
Definition: AMReX_ParticleUtil.H:625
void EnsureThreadSafeTiles(PC &pc)
Definition: AMReX_ParticleUtil.H:705
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE const T & max(const T &a, const T &b) noexcept
Definition: AMReX_Algorithm.H:35
AMREX_GPU_HOST_DEVICE constexpr AMREX_FORCE_INLINE const T & min(const T &a, const T &b) noexcept
Definition: AMReX_Algorithm.H:21
void removeInvalidParticles(PTile &ptile)
Definition: AMReX_ParticleUtil.H:612
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int getParticleGrid(P const &p, amrex::Array4< int > const &mask, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &plo, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &dxi, const Box &domain) noexcept
Definition: AMReX_ParticleUtil.H:447
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int numTilesInBox(const Box &box, const bool a_do_tiling, const IntVect &a_tile_size)
Definition: AMReX_ParticleUtil.H:270
IntVectND< AMREX_SPACEDIM > IntVect
Definition: AMReX_BaseFwd.H:30
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool enforcePeriodic(P &p, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &plo, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &phi, amrex::GpuArray< amrex::ParticleReal, AMREX_SPACEDIM > const &rlo, amrex::GpuArray< amrex::ParticleReal, AMREX_SPACEDIM > const &rhi, amrex::GpuArray< int, AMREX_SPACEDIM > const &is_per) noexcept
Definition: AMReX_ParticleUtil.H:459
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition: AMReX.H:111
void Add(FabArray< FAB > &dst, FabArray< FAB > const &src, int srccomp, int dstcomp, int numcomp, int nghost)
Definition: AMReX_FabArray.H:239
Vector< int > computeNeighborProcs(const ParGDBBase *a_gdb, int ngrow)
Definition: AMReX_ParticleUtil.cpp:22
bool SameIteratorsOK(const PC1 &pc1, const PC2 &pc2)
Definition: AMReX_ParticleUtil.H:693
int partitionParticles(PTile &ptile, ParFunc const &is_left)
Reorders the ParticleTile into two partitions left [0, num_left-1] and right [num_left,...
Definition: AMReX_ParticleUtil.H:510
int numParticlesOutOfRange(Iterator const &pti, int nGrow)
Returns the number of particles that are more than nGrow cells from the box correspond to the input i...
Definition: AMReX_ParticleUtil.H:34
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition: AMReX.cpp:221
const int[]
Definition: AMReX_BLProfiler.cpp:1664
IntVect computeRefFac(const ParGDBBase *a_gdb, int src_lev, int lev)
Definition: AMReX_ParticleUtil.cpp:6
void PermutationForDeposition(Gpu::DeviceVector< index_type > &perm, index_type nitems, const PTile &ptile, Box bx, Geometry geom, const IntVect idx_type)
Definition: AMReX_ParticleUtil.H:846
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int getTileIndex(const IntVect &iv, const Box &box, const bool a_do_tiling, const IntVect &a_tile_size, Box &tbx)
Definition: AMReX_ParticleUtil.H:222
Definition: AMReX_WriteBinaryParticleData.H:19
Definition: AMReX_ParticleUtil.H:296
const Dim3 * m_hi_p
Definition: AMReX_ParticleUtil.H:336
const Dim3 * m_lo_p
Definition: AMReX_ParticleUtil.H:335
BinMapper(const int *off_bins_p, const GpuArray< Real, AMREX_SPACEDIM > *dxi_p, const GpuArray< Real, AMREX_SPACEDIM > *plo_p, const Dim3 *lo_p, const Dim3 *hi_p, int *bin_type_array=nullptr)
Definition: AMReX_ParticleUtil.H:297
int * m_bin_type_array
Definition: AMReX_ParticleUtil.H:337
AMREX_GPU_HOST_DEVICE unsigned int operator()(const T &ptd, int i) const
Definition: AMReX_ParticleUtil.H:308
const int * m_off_bins_p
Definition: AMReX_ParticleUtil.H:332
const GpuArray< Real, AMREX_SPACEDIM > * m_plo_p
Definition: AMReX_ParticleUtil.H:334
const GpuArray< Real, AMREX_SPACEDIM > * m_dxi_p
Definition: AMReX_ParticleUtil.H:333
Definition: AMReX_ParticleUtil.H:432
AMREX_GPU_HOST_DEVICE IntVect operator()(P const &p, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &plo, amrex::GpuArray< amrex::Real, AMREX_SPACEDIM > const &dxi, const Box &domain) const noexcept
Definition: AMReX_ParticleUtil.H:436
Definition: AMReX_Dim3.H:12
int x
Definition: AMReX_Dim3.H:12
int z
Definition: AMReX_Dim3.H:12
int y
Definition: AMReX_Dim3.H:12
Definition: AMReX_ParticleUtil.H:341
IntVect bin_size
Definition: AMReX_ParticleUtil.H:345
GpuArray< Real, AMREX_SPACEDIM > dxi
Definition: AMReX_ParticleUtil.H:343
AMREX_GPU_HOST_DEVICE unsigned int operator()(const ParticleType &p) const noexcept
Definition: AMReX_ParticleUtil.H:350
Box domain
Definition: AMReX_ParticleUtil.H:344
Box box
Definition: AMReX_ParticleUtil.H:346
GpuArray< Real, AMREX_SPACEDIM > plo
Definition: AMReX_ParticleUtil.H:342
Definition: AMReX_GpuMemory.H:56
T * dataPtr()
Definition: AMReX_GpuMemory.H:90
Definition: AMReX_MakeParticle.H:16