Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_ParticleUtil.H
Go to the documentation of this file.
1#ifndef AMREX_PARTICLEUTIL_H_
2#define AMREX_PARTICLEUTIL_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_IntVect.H>
6#include <AMReX_Box.H>
7#include <AMReX_Gpu.H>
8#include <AMReX_Print.H>
10#include <AMReX_Math.H>
11#include <AMReX_MFIter.H>
12#include <AMReX_ParGDB.H>
13#include <AMReX_ParticleTile.H>
16#include <AMReX_TypeTraits.H>
17#include <AMReX_Scan.H>
18
19#include <limits>
20
21namespace amrex {
22
33template <class Iterator, std::enable_if_t<IsParticleIterator<Iterator>::value, int> foo = 0>
34int
35numParticlesOutOfRange (Iterator const& pti, int nGrow)
36{
37 return numParticlesOutOfRange(pti,
38 IntVect(AMREX_D_DECL(nGrow, nGrow, nGrow)));
39}
40
// Count the particles in the tile currently pointed to by `pti` that lie
// outside the tile box grown by `nGrow` cells. Particles with invalid ids
// are skipped. The count is computed with a device-side sum reduction.
 51template <class Iterator, std::enable_if_t<IsParticleIterator<Iterator>::value, int> foo = 0>
 52int
 53numParticlesOutOfRange (Iterator const& pti, IntVect nGrow)
 54{
 55 const auto& tile = pti.GetParticleTile();
 56 const auto np = tile.numParticles();
 57 const auto& ptd = tile.getConstParticleTileData();
 58 const auto& geom = pti.Geom(pti.GetLevel());
 59
 60 const auto& domain = geom.Domain();
 61 const auto& plo = geom.ProbLoArray();
 62 const auto& dxi = geom.InvCellSizeArray();
 63
    // Acceptance region: this iterator's tile box grown by nGrow.
 64 Box box = pti.tilebox();
 65 box.grow(nGrow);
 66
 67 ReduceOps<ReduceOpSum> reduce_op;
 68 ReduceData<int> reduce_data(reduce_op);
 69 using ReduceTuple = typename decltype(reduce_data)::Type;
 70
 71 reduce_op.eval(np, reduce_data,
 72 [=] AMREX_GPU_DEVICE (int i) -> ReduceTuple
 73 {
 74 auto p = ptd[i];
    // Invalid (removed) particles never count as out of range.
 75 if (!p.id().is_valid()) { return false; }
    // Map the particle's position to a cell with the iterator's CellAssignor.
 76 using AssignorType = typename Iterator::CellAssignor;
 77 AssignorType assignor;
 78 IntVect iv = assignor(p, plo, dxi, domain);
    // Contributes 1 to the sum when the cell is outside the grown box.
 79 return !box.contains(iv);
 80 });
 81 int hv = amrex::get<0>(reduce_data.value(reduce_op));
 82 return hv;
 83}
84
97template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
98int
99numParticlesOutOfRange (PC const& pc, int nGrow)
100{
101 return numParticlesOutOfRange(pc, 0, pc.finestLevel(), nGrow);
102}
103
116template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
117int
118numParticlesOutOfRange (PC const& pc, IntVect nGrow)
119{
120 return numParticlesOutOfRange(pc, 0, pc.finestLevel(), nGrow);
121}
122
137template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
138int
139numParticlesOutOfRange (PC const& pc, int lev_min, int lev_max, int nGrow)
140{
141 BL_PROFILE("numParticlesOutOfRange()");
142
143 return numParticlesOutOfRange(pc, lev_min, lev_max,
144 IntVect(AMREX_D_DECL(nGrow, nGrow, nGrow)));
145}
146
// Count out-of-range particles on levels [lev_min, lev_max] of the container.
// Iterates tiles with a const particle iterator; tiles are processed with an
// OpenMP `+` reduction on num_wrong when OMP is enabled and we are not inside
// a GPU launch region.
 161template <class PC, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
 162int
 163numParticlesOutOfRange (PC const& pc, int lev_min, int lev_max, IntVect nGrow)
 164{
 165 BL_PROFILE("numParticlesOutOfRange()");
 166
 167 using ParIter = typename PC::ParConstIterType;
 168 int num_wrong = 0;
 169 for (int lev = lev_min; lev <= lev_max; ++lev)
 170 {
 171#ifdef AMREX_USE_OMP
 172#pragma omp parallel if (Gpu::notInLaunchRegion() && !system::regtest_reduction) reduction(+:num_wrong)
 173#endif
 174 for(ParIter pti(pc, lev); pti.isValid(); ++pti)
 175 {
    // Per-tile count from the iterator overload above.
 176 num_wrong += numParticlesOutOfRange(pti, nGrow);
 177 }
 178 }
 180
 181 return num_wrong;
 182}
183
// Return the linear index of the tile that contains cell `iv` within `box`,
// and set `tbx` to that tile's box. With tiling disabled the whole box is a
// single tile of index 0.
 185int getTileIndex (const IntVect& iv, const Box& box, const bool a_do_tiling,
 186 const IntVect& a_tile_size, Box& tbx)
 187{
 188 if (a_do_tiling == false) {
 189 tbx = box;
 190 return 0;
 191 } else {
 192 //
 193 // This function must be consistent with FabArrayBase::buildTileArray function!!!
 194 //
    // Split [lo, hi] into ntile pieces: the first `nleft` tiles are one cell
    // larger (ts_left) than the remaining ones (ts_right), so that the tile
    // sizes sum exactly to ncells.
 195 auto tiling_1d = [](int i, int lo, int hi, int tilesize,
 196 int& ntile, int& tileidx, int& tlo, int& thi) {
 197 int ncells = hi-lo+1;
 198 ntile = amrex::max(ncells/tilesize, 1);
 199 int ts_right = ncells/ntile;
 200 int ts_left = ts_right+1;
 201 int nleft = ncells - ntile*ts_right;
 202 int ii = i - lo;
 203 int nbndry = nleft*ts_left;
 204 if (ii < nbndry) {
 205 tileidx = ii / ts_left; // tiles on the left of nbndry have size of ts_left
 206 tlo = lo + tileidx * ts_left;
 207 thi = tlo + ts_left - 1;
 208 } else {
 209 tileidx = nleft + (ii-nbndry) / ts_right; // tiles on the right: ts_right
 210 tlo = lo + tileidx * ts_right + nleft;
 211 thi = tlo + ts_right - 1;
 212 }
 213 };
 214 const IntVect& sml = box.smallEnd();
 215 const IntVect& big = box.bigEnd();
 216 IntVect ntiles, ivIndex, tilelo, tilehi;
 217
    // Clamp iv into the box so out-of-box cells map to a boundary tile.
 218 AMREX_D_TERM(int iv0 = amrex::min(amrex::max(iv[0], sml[0]), big[0]);,
 219 int iv1 = amrex::min(amrex::max(iv[1], sml[1]), big[1]);,
 220 int iv2 = amrex::min(amrex::max(iv[2], sml[2]), big[2]););
 221
 222 AMREX_D_TERM(tiling_1d(iv0, sml[0], big[0], a_tile_size[0], ntiles[0], ivIndex[0], tilelo[0], tilehi[0]);,
 223 tiling_1d(iv1, sml[1], big[1], a_tile_size[1], ntiles[1], ivIndex[1], tilelo[1], tilehi[1]);,
 224 tiling_1d(iv2, sml[2], big[2], a_tile_size[2], ntiles[2], ivIndex[2], tilelo[2], tilehi[2]););
 225
 226 tbx = Box(tilelo, tilehi);
 227
    // Flatten the per-dimension tile indices (x varies fastest).
 228 return AMREX_D_TERM(ivIndex[0], + ntiles[0]*ivIndex[1], + ntiles[0]*ntiles[1]*ivIndex[2]);
 229 }
 230}
231
// Number of tiles `box` is split into for the given tile size. Must agree
// with getTileIndex above (same per-dimension tile count formula).
 233int numTilesInBox (const Box& box, const bool a_do_tiling, const IntVect& a_tile_size)
 234{
 235 if (a_do_tiling == false) {
 236 return 1;
 237 } else {
 238 //
 239 // This function must be consistent with FabArrayBase::buildTileArray function!!!
 240 //
 241 auto tiling_1d = [](int lo, int hi, int tilesize, int& ntile) {
 242 int ncells = hi-lo+1;
 243 ntile = amrex::max(ncells/tilesize, 1);
 244 };
 245
 246 const IntVect& sml = box.smallEnd();
 247 const IntVect& big = box.bigEnd();
 248 IntVect ntiles;
 249
 250 AMREX_D_TERM(tiling_1d(sml[0], big[0], a_tile_size[0], ntiles[0]);,
 251 tiling_1d(sml[1], big[1], a_tile_size[1], ntiles[1]);,
 252 tiling_1d(sml[2], big[2], a_tile_size[2], ntiles[2]););
 253
    // AMREX_D_TERM chains the *= assignments, so this expression evaluates to
    // the product of the per-dimension tile counts.
 254 return AMREX_D_TERM(ntiles[0], *=ntiles[1], *=ntiles[2]);
 255 }
 256}
257
// NOTE(review): this extract is missing the struct's declaration line and the
// constructor parameters for m_dxi_p / m_plo_p (the inverse cell size and
// problem-lo arrays, judging by the initializer list) — confirm against the
// full header. The functor maps particle i to a flat bin index, optionally
// per "type" selected by bin_type_array, with per-type bin offsets.
 259{
 260 BinMapper(const int* off_bins_p,
 263 const Dim3* lo_p,
 264 const Dim3* hi_p,
 265 int* bin_type_array=nullptr)
 266 : m_off_bins_p(off_bins_p), m_dxi_p(dxi_p), m_plo_p(plo_p) ,
 267 m_lo_p(lo_p) , m_hi_p(hi_p) , m_bin_type_array(bin_type_array) {}
 268
 269 template <typename T>
 271 unsigned int operator() (const T& ptd, int i) const
 272 {
 273 auto p = ptd[i];
    // No type array means every particle uses type 0's geometry and offset.
 274 int type = (m_bin_type_array) ? m_bin_type_array[i] : 0;
 275 int offset = m_off_bins_p[type];
 276
 277 AMREX_D_TERM(AMREX_ASSERT((p.pos(0)-m_plo_p[type][0])*m_dxi_p[type][0] - m_lo_p[type].x >= 0.0);,
 278 AMREX_ASSERT((p.pos(1)-m_plo_p[type][1])*m_dxi_p[type][1] - m_lo_p[type].y >= 0.0);,
 279 AMREX_ASSERT((p.pos(2)-m_plo_p[type][2])*m_dxi_p[type][2] - m_lo_p[type].z >= 0.0));
 280
    // Cell index of the particle, relative to the bin region's lower corner.
 281 auto iv = IntVect(AMREX_D_DECL(static_cast<int>(amrex::Math::floor((p.pos(0)-m_plo_p[type][0])*m_dxi_p[type][0])) - m_lo_p[type].x,
 282 static_cast<int>(amrex::Math::floor((p.pos(1)-m_plo_p[type][1])*m_dxi_p[type][1])) - m_lo_p[type].y,
 283 static_cast<int>(amrex::Math::floor((p.pos(2)-m_plo_p[type][2])*m_dxi_p[type][2])) - m_lo_p[type].z));
 284 auto iv3 = iv.dim3();
 285 int nx = m_hi_p[type].x-m_lo_p[type].x+1;
 286 int ny = m_hi_p[type].y-m_lo_p[type].y+1;
 287 int nz = m_hi_p[type].z-m_lo_p[type].z+1;
    // Clamp each component into [0, n-1] so every particle lands in some bin.
 288 int uix = amrex::min(nx-1,amrex::max(0,iv3.x));
 289 int uiy = amrex::min(ny-1,amrex::max(0,iv3.y));
 290 int uiz = amrex::min(nz-1,amrex::max(0,iv3.z));
    // Row-major flatten (x slowest, z fastest) plus the per-type bin offset.
 291 return static_cast<unsigned int>( (uix * ny + uiy) * nz + uiz + offset );
 292 }
 293
 294private:
 295 const int* m_off_bins_p;
 296 const GpuArray<Real,AMREX_SPACEDIM>* m_dxi_p;
 297 const GpuArray<Real,AMREX_SPACEDIM>* m_plo_p;
 298 const Dim3* m_lo_p;
 299 const Dim3* m_hi_p;
 300 int* m_bin_type_array;
 301};
302
// NOTE(review): the struct's name and its member declarations (presumably
// plo, dxi, domain, box, bin_size, given the calls below) are missing from
// this extract — confirm against the full header. The functor maps a
// particle to the index of the tile containing its cell.
 304{
 310
 311 template <typename ParticleType>
 313 unsigned int operator() (const ParticleType& p) const noexcept
 314 {
 315 Box tbx;
    // Cell of the particle, then the tile (tiling forced on) holding it.
 316 auto iv = getParticleCell(p, plo, dxi, domain);
 317 auto tid = getTileIndex(iv, box, true, bin_size, tbx);
 318 return static_cast<unsigned int>(tid);
 319 }
 320};
321
// Cell index of particle p computed from the problem lower bound `plo` and
// inverse cell size `dxi` (floor of the scaled offset in each dimension).
// NOTE(review): the function signature line (return qualifiers and the
// plo/dxi parameters) is missing from this extract — confirm against the
// full header.
 335template <typename P>
 340{
 341 IntVect iv(
 342 AMREX_D_DECL(int(amrex::Math::floor((p.pos(0)-plo[0])*dxi[0])),
 343 int(amrex::Math::floor((p.pos(1)-plo[1])*dxi[1])),
 344 int(amrex::Math::floor((p.pos(2)-plo[2])*dxi[2]))));
 345 return iv;
 346}
347
// Cell index of particle p shifted by the domain's lower corner, so the
// result is an absolute index within `domain`.
// NOTE(review): part of the signature (the p/plo/dxi parameters) is missing
// from this extract — confirm against the full header.
 362template <typename P>
 367 const Box& domain) noexcept
 368{
 369 IntVect iv = getParticleCell(p, plo, dxi);
 370 iv += domain.smallEnd();
 371 return iv;
 372}
373
// Cell index of the i-th particle in a particle-tile-data view, shifted by
// the domain's lower corner (same mapping as the particle overload above).
// NOTE(review): the plo/dxi parameter lines are missing from this extract —
// confirm against the full header.
 374template <typename PTD>
 376IntVect getParticleCell (PTD const& ptd, int i,
 379 const Box& domain) noexcept
 380{
 381 IntVect iv(
 382 AMREX_D_DECL(int(amrex::Math::floor((ptd.pos(0, i)-plo[0])*dxi[0])),
 383 int(amrex::Math::floor((ptd.pos(1, i)-plo[1])*dxi[1])),
 384 int(amrex::Math::floor((ptd.pos(2, i)-plo[2])*dxi[2]))));
 385 iv += domain.smallEnd();
 386 return iv;
 387}
388
// Cell-assignor functor: delegates straight to getParticleCell, i.e. assigns
// each particle to the cell containing its position.
// NOTE(review): the struct's name line and part of the operator() signature
// are missing from this extract — confirm against the full header.
 390{
 391
 392 template <typename P>
 397 const Box& domain) const noexcept
 398 {
 399 return getParticleCell(p, plo, dxi, domain);
 400 }
 401};
402
// Grid id for particle p: -1 for invalid particles, otherwise the value of
// the grid `mask` at the particle's cell.
// NOTE(review): the signature lines (including the mask/plo/dxi parameters)
// are missing from this extract — confirm against the full header.
 403template <typename P>
 408 const Box& domain) noexcept
 409{
 410 if (!p.id().is_valid()) { return -1; }
 411 IntVect iv = getParticleCell(p, plo, dxi, domain);
 412 return mask(iv);
 413}
414
// Wrap particle p back into the periodic domain. For each periodic dimension
// the position is shifted by the domain length until it lies in [rlo, rhi],
// then clamped to guard against floating-point round-off. Returns true if
// any coordinate was changed.
// NOTE(review): the signature lines (p and the plo/phi/rlo/rhi parameters)
// are missing from this extract — confirm against the full header.
 415template <typename P>
 422 amrex::GpuArray<int,AMREX_SPACEDIM> const& is_per) noexcept
 423{
 424 bool shifted = false;
 425 for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
 426 {
    // Non-periodic dimensions are left untouched.
 427 if (! is_per[idim]) { continue; }
 428 if (p.pos(idim) > rhi[idim]) {
 429 while (p.pos(idim) > rhi[idim]) {
 430 p.pos(idim) -= static_cast<ParticleReal>(phi[idim] - plo[idim]);
 431 }
 432 // clamp to avoid precision issues;
 433 if (p.pos(idim) < rlo[idim]) {
 434 p.pos(idim) = rlo[idim];
 435 }
 436 shifted = true;
 437 }
 438 else if (p.pos(idim) < rlo[idim]) {
 439 while (p.pos(idim) < rlo[idim]) {
 440 p.pos(idim) += static_cast<ParticleReal>(phi[idim] - plo[idim]);
 441 }
 442 // clamp to avoid precision issues;
 443 if (p.pos(idim) > rhi[idim]) {
 444 p.pos(idim) = rhi[idim];
 445 }
 446 shifted = true;
 447 }
 448 AMREX_ASSERT( (p.pos(idim) >= rlo[idim] ) && ( p.pos(idim) <= rhi[idim] ));
 449 }
 450
 451 return shifted;
 452}
453
// In-place partition of the particle tile: particles for which is_left(ptd,i)
// is true are moved to [0, num_left-1], the rest to [num_left, np-1].
// Relative order within each side is NOT preserved. Returns num_left.
// Performs the minimum possible number of particle swaps.
 466template <typename PTile, typename ParFunc>
 467int
 468partitionParticles (PTile& ptile, ParFunc const& is_left)
 469{
 470 const int np = ptile.numParticles();
 471 if (np == 0) { return 0; }
 472
 473 auto ptd = ptile.getParticleTileData();
 474
    // First pass: count how many particles belong in the left partition.
 475 const int num_left = Reduce::Sum<int>(np,
 476 [=] AMREX_GPU_DEVICE (int i) -> int
 477 {
 478 return int(is_left(ptd, i));
 479 });
 480
 481 // The ptile will be partitioned into left [0, num_left-1] and right [num_left, np-1].
 482 //
 483 // Note that currently the number of particles in [0, num_left-1] that should belong to the
 484 // right partition is equal to the number of particles in [num_left, np-1] that should belong
 485 // in the left partition. We will define num_swaps to be this number. This is the minimum
 486 // number of swaps that need to be performed to partition the ptile in place for any algorithm.
 487 //
 488 // From this it is easy to see that
 489 // max_num_swaps = min(size([0, num_left-1]), size([num_left, np-1]))
 490 // is an upper bound for num_swaps.
 491
 492 const int max_num_swaps = std::min(num_left, np - num_left);
 493 if (max_num_swaps == 0) { return num_left; }
 494
 495 Gpu::DeviceVector<int> index_left(max_num_swaps);
 496 Gpu::DeviceVector<int> index_right(max_num_swaps);
 497 int * const p_index_left = index_left.dataPtr();
 498 int * const p_index_right = index_right.dataPtr();
 499
 500 // The num_swaps particles that are in [0, num_left-1] but should be moved to the right
 501 // partition are at the same time the first num_swaps particles for which is_left is false
 502 // in all the ptile.
 503 // Similarly, the num_swaps particles in [num_left, np-1] that should be moved to the left
 504 // partition are the last num_swaps particles of the ptile for which is_left is true.
 505 //
 506 // The PrefixSum is used to find exactly these particles and store their indexes in
 507 // index_left and index_right. Since num_swaps is not known, the first max_num_swaps
 508 // particles are stored instead. Here, dst = num_left-1-(i-s) is used to effectively reverse
 509 // the PrefixSum to store the last particles for which is_left is true.
 510 //
 511 // This way, all indexes in index_right are in ascending order, and all indexes in
 512 // index_left are in descending order.
 513
 514 Scan::PrefixSum<int>(np,
 515 [=] AMREX_GPU_DEVICE (int i) -> int
 516 {
 517 return int(!is_left(ptd, i));
 518 },
 519 [=] AMREX_GPU_DEVICE (int i, int const& s)
 520 {
 521 if (!is_left(ptd, i)) {
 522 int dst = s;
 523 if (dst < max_num_swaps) {
 524 p_index_right[dst] = i;
 525 }
 526 } else {
 527 int dst = num_left-1-(i-s); // avoid integer overflow
 528 if (dst < max_num_swaps) {
 529 p_index_left[dst] = i;
 530 }
 531 }
 532 },
    // NOTE(review): the PrefixSum call's final argument (the scan type) and
    // closing parenthesis are on a line missing from this extract.
 534
 535 // Finally, the particles are swapped. Since max_num_swaps is only an upper bound for num_swaps,
 536 // some swaps should not be performed and need to be skipped. This is the case if the index
 537 // in index_left[i] is already in the left partition or the index in index_right[i] is already
 538 // in the right partition. These two cases coincide for the same i because index_right is in
 539 // ascending order and index_left in descending order. This means for both index_left and
 540 // index_right the first num_swaps particles need to be swapped, and the particles after that
 541 // should be skipped.
 542 //
 543 // The check right_i < left_i makes sure that the particle going to the right partition is
 544 // actually coming from the left partition, which has a lower index than the other particle and
 545 // visa-versa.
 546 //
 547 // Since exactly num_swaps swap operations are performed in the end, which is the smallest
 548 // number possible, this algorithm is optimal in the number of swap operations.
 549 // This results in good performance in practice if the size of a particle is large enough that
 550 // it compensates for the extra kernel launches and evaluations of is_left which this
 551 // algorithm needs.
 552
 553 ParallelFor(max_num_swaps,
 554 [=] AMREX_GPU_DEVICE (int i)
 555 {
 556 int left_i = p_index_left[i];
 557 int right_i = p_index_right[i];
 558 if (right_i < left_i) {
 559 swapParticle(ptd, ptd, left_i, right_i);
 560 }
 561 });
 562
 563 index_left.free_async();
 564 index_right.free_async();
 565
 566 return num_left;
 567}
568
// Variant of partitionParticles for when the size of the left partition,
// num_left, is already known — skips the counting reduction and goes straight
// to the index-gathering scan and the swap pass. Same swap-minimal algorithm
// as the overload above.
 583template <typename PTile, typename ParFunc>
 584void
 585partitionParticles (PTile& ptile, int num_left, ParFunc const& is_left)
 586{
 587 const int np = ptile.numParticles();
 588 if (np == 0) { return; }
 589
 590 auto ptd = ptile.getParticleTileData();
 591
 592 const int max_num_swaps = std::min(num_left, np - num_left);
 593 if (max_num_swaps == 0) { return; }
 594
 595 Gpu::DeviceVector<int> index_left(max_num_swaps);
 596 Gpu::DeviceVector<int> index_right(max_num_swaps);
 597 int * const p_index_left = index_left.dataPtr();
 598 int * const p_index_right = index_right.dataPtr();
 599
    // Gather the first max_num_swaps "wrong side" indices on each side; see
    // the detailed commentary in the counting overload above.
 600 Scan::PrefixSum<int>(np,
 601 [=] AMREX_GPU_DEVICE (int i) -> int
 602 {
 603 return int(!is_left(ptd, i));
 604 },
 605 [=] AMREX_GPU_DEVICE (int i, int const& s)
 606 {
 607 if (!is_left(ptd, i)) {
 608 int dst = s;
 609 if (dst < max_num_swaps) {
 610 p_index_right[dst] = i;
 611 }
 612 } else {
 613 int dst = num_left-1-(i-s); // avoid integer overflow
 614 if (dst < max_num_swaps) {
 615 p_index_left[dst] = i;
 616 }
 617 }
 618 },
    // NOTE(review): the PrefixSum call's final argument (the scan type) and
    // closing parenthesis are on a line missing from this extract.
 620
 621 ParallelFor(max_num_swaps,
 622 [=] AMREX_GPU_DEVICE (int i)
 623 {
 624 int left_i = p_index_left[i];
 625 int right_i = p_index_right[i];
    // Only swap when the pair actually straddles the partition boundary.
 626 if (right_i < left_i) {
 627 swapParticle(ptd, ptd, left_i, right_i);
 628 }
 629 });
 630
 631 index_left.free_async();
 632 index_right.free_async();
 633}
634
// Compact the tile so that only particles with valid ids remain: partition
// valid particles to the front, then shrink the tile to that count.
// NOTE(review): the line carrying the function's name and parameter list is
// missing from this extract — confirm against the full header.
 635template <typename PTile>
 636void
 638{
 639 const int new_size = partitionParticles(ptile,
 640 [=] AMREX_GPU_DEVICE (auto& ptd, int i) {
 641 return ptd.id(i).is_valid();
 642 });
 643 ptile.resize(new_size);
 644}
645
646#if defined(AMREX_USE_GPU)
647
// GPU-only: partition a tile into particles that stay on this (lev, gid)
// grid on this rank versus particles that must be redistributed elsewhere.
// Positions are wrapped for periodicity as a side effect. Returns the number
// of particles that stay.
// NOTE(review): four parameter lines (presumably plo/phi/rlo/rhi, used by
// enforcePeriodic below) are missing from this extract — confirm against the
// full header.
 648template <typename PTile, typename PLocator, typename CellAssignor>
 649int
 650partitionParticlesByDest (PTile& ptile, const PLocator& ploc, CellAssignor const& assignor,
 651 const ParticleBufferMap& pmap,
 656 const GpuArray<int ,AMREX_SPACEDIM>& is_per,
 657 int lev, int gid, int /*tid*/,
 658 int lev_min, int lev_max, int nGrow, bool remove_negative)
 659{
 660 auto getPID = pmap.getPIDFunctor();
 661 int pid = ParallelContext::MyProcSub();
 662
 663 Gpu::DeviceVector<uint8_t> particle_stays(ptile.numParticles());
 664 uint8_t * const p_particle_stays = particle_stays.dataPtr();
 665 auto ptd = ptile.getParticleTileData();
 666
 667 // the function for determining if a particle stays on this grid is very slow,
 668 // so we cache it in particle_stays to avoid evaluating it multiple times.
 669 ParallelFor(ptile.numParticles(),
 670 [=] AMREX_GPU_DEVICE (int i)
 671 {
 672 int assigned_grid;
 673 int assigned_lev;
 674
 675 if (!ptd.id(i).is_valid())
 676 {
 677 assigned_grid = -1;
 678 assigned_lev = -1;
 679 }
 680 else
 681 {
    // Locate the particle using its periodically-wrapped position.
 682 auto p_prime = ptd.getSuperParticle(i);
 683 enforcePeriodic(p_prime, plo, phi, rlo, rhi, is_per);
 684 auto tup_prime = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
 685 assigned_grid = amrex::get<0>(tup_prime);
 686 assigned_lev = amrex::get<1>(tup_prime);
 687 if (assigned_grid >= 0)
 688 {
    // Found a destination: commit the wrapped position to the tile.
 689 AMREX_D_TERM(ptd.pos(0, i) = p_prime.pos(0);,
 690 ptd.pos(1, i) = p_prime.pos(1);,
 691 ptd.pos(2, i) = p_prime.pos(2););
 692 }
 693 else if (lev_min > 0)
 694 {
    // No grid found with the wrapped position; retry with the original one.
 695 AMREX_D_TERM(p_prime.pos(0) = ptd.pos(0, i);,
 696 p_prime.pos(1) = ptd.pos(1, i);,
 697 p_prime.pos(2) = ptd.pos(2, i););
 698 auto tup = ploc(p_prime, lev_min, lev_max, nGrow, assignor);
 699 assigned_grid = amrex::get<0>(tup);
 700 assigned_lev = amrex::get<1>(tup);
 701 }
 702 }
 703
    // A particle stays if it is invalid but we are keeping invalid particles,
    // or if its destination is this very grid/level on this rank.
 704 p_particle_stays[i] = uint8_t(
 705 ((remove_negative == false) && (!ptd.id(i).is_valid())) ||
 706 ((assigned_grid == gid) && (assigned_lev == lev) && (getPID(lev, gid) == pid)));
 707 });
 708
 709 return partitionParticles(ptile,
 710 [=] AMREX_GPU_DEVICE (auto& /* ptd */, int i) -> bool {
 711 return p_particle_stays[i];
 712 });
 713}
714
715#endif
716
/** Return true when two particle containers can safely be iterated with the
 *  same iterator structure: identical level counts, tiling settings, and
 *  per-level box arrays and distribution maps.
 */
template <class PC1, class PC2>
bool SameIteratorsOK (const PC1& pc1, const PC2& pc2) {
    const int nlevs = pc1.numLevels();
    if (nlevs != pc2.numLevels() ||
        pc1.do_tiling != pc2.do_tiling ||
        pc1.tile_size != pc2.tile_size)
    {
        return false;
    }
    for (int lev = 0; lev < nlevs; ++lev) {
        const bool same_ba = (pc1.ParticleBoxArray(lev) == pc2.ParticleBoxArray(lev));
        const bool same_dm = (pc1.ParticleDistributionMap(lev) == pc2.ParticleDistributionMap(lev));
        if (!same_ba || !same_dm) { return false; }
    }
    return true;
}
728
// Pre-create every particle tile the container's iterator will visit, so a
// later threaded pass never has to define tiles concurrently.
// NOTE(review): the line carrying the function's name and parameter list
// (presumably `void EnsureThreadSafeTiles (PC& pc) {`) is missing from this
// extract — confirm against the full header.
 729template <class PC>
 731 using Iter = typename PC::ParIterType;
 732 for (int lev = 0; lev < pc.numLevels(); ++lev) {
 733 for (Iter pti(pc, lev); pti.isValid(); ++pti) {
 734 pc.DefineAndReturnParticleTile(lev, pti);
 735 }
 736 }
 737}
738
739IntVect computeRefFac (const ParGDBBase* a_gdb, int src_lev, int lev);
740
741Vector<int> computeNeighborProcs (const ParGDBBase* a_gdb, int ngrow);
742
744namespace particle_detail
745{
/** Remove every entry of the map-like container `c` whose mapped value is an
 *  empty container. Entries with non-empty values are left untouched.
 */
template <typename C>
void clearEmptyEntries (C& c)
{
    auto it = c.begin();
    while (it != c.end()) {
        if (it->second.empty()) {
            // erase returns the iterator to the next element (C++11
            // associative-container semantics), so no manual increment here.
            it = c.erase(it);
        } else {
            ++it;
        }
    }
}
755}
757
// Build a permutation of [0, nitems) in `perm` that groups items by bin,
// where f(i) gives item i's bin. Implemented with per-bin GPU linked lists
// (atomicExch head insertion) followed by a custom block-scan traversal
// kernel; CUDA/HIP only — other backends Abort at runtime.
// NOTE(review): the line carrying the function's name and its first
// parameters (perm / nitems) is missing from this extract — confirm against
// the full header.
 758template <class index_type, typename F>
 760 index_type nbins, F const& f)
 761{
 762 BL_PROFILE("PermutationForDeposition()");
 763
 764#if defined(AMREX_USE_HIP)
 765 // MI250X has a small L2 cache and is more tolerant of atomic add contention,
 766 // so we use a small block size of 64 and the compressed layout.
 767 static constexpr index_type gpu_block_size = 64;
 768 static constexpr bool compressed_layout = true;
 769#else
 770 // A100 has a larger L2 cache and is very sensitive to atomic add contention,
 771 // so we use a large block size of 1024 and not the compressed layout.
 772 static constexpr index_type gpu_block_size = 1024;
 773 static constexpr bool compressed_layout = false;
 774#endif
 775
 776 static constexpr index_type gpu_block_size_m1 = gpu_block_size - 1;
 777 static constexpr index_type llist_guard = std::numeric_limits<index_type>::max();
 778
 779 // round up to gpu_block_size
 780 nbins = (nbins + gpu_block_size_m1) / gpu_block_size * gpu_block_size;
 781
    // llist_start[bin] is the head of bin's linked list; llist_next[i] links
    // item i to the previous head. llist_guard terminates each list.
 782 Gpu::DeviceVector<index_type> llist_start(nbins, llist_guard);
 783 Gpu::DeviceVector<index_type> llist_next(nitems);
 784 perm.resize(nitems);
 785 Gpu::DeviceScalar<index_type> global_idx(0);
 786
 787 index_type* pllist_start = llist_start.dataPtr();
 788 index_type* pllist_next = llist_next.dataPtr();
 789 index_type* pperm = perm.dataPtr();
 790 index_type* pglobal_idx = global_idx.dataPtr();
 791
 792 amrex::ParallelFor(nitems, [=] AMREX_GPU_DEVICE (index_type i) noexcept
 793 {
    // Insert in reverse order so each list ends up roughly index-ascending.
 794 i = nitems - i - 1;
 795 pllist_next[i] = Gpu::Atomic::Exch(pllist_start + f(i), i);
 796 });
 797
 798#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    // One thread per bin; each block walks gpu_block_size bins' lists in
    // lockstep, using a shared-memory prefix sum to pack the surviving items
    // of each sweep contiguously into perm.
 799 amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
 800 [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
 801 __shared__ index_type sdata[gpu_block_size];
 802 __shared__ index_type global_idx_start;
 803 __shared__ index_type idx_start;
 804
 805 index_type current_idx = 0;
 806
 807 if constexpr (compressed_layout) {
 808 // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
 809 // are put right next to each other, while without the compressed layout,
 810 // there can be other particle sweeps from different locations between them.
 811 current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
 812
    // Pre-count this block's total so it can reserve one contiguous output
    // range up front with a single atomic.
 813 index_type num_particles_thread = 0;
 814 while (current_idx != llist_guard) {
 815 ++num_particles_thread;
 816 current_idx = pllist_next[current_idx];
 817 }
 818
 819 index_type num_particles_block =
 820 Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
 821
 822 if (threadIdx.x == 0) {
 823 global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
 824 }
 825 }
 826
 827 current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
 828
 829 while (true) {
    // 1 if this lane's list still has an item in the current sweep.
 830 sdata[threadIdx.x] = index_type(current_idx != llist_guard);
 831 index_type x = 0;
 832
 833 // simple block wide prefix sum
 834 for (index_type i = 1; i<gpu_block_size; i*=2) {
 835 __syncthreads();
 836 if (threadIdx.x >= i) {
 837 x = sdata[threadIdx.x - i];
 838 }
 839 __syncthreads();
 840 if (threadIdx.x >= i) {
 841 sdata[threadIdx.x] += x;
 842 }
 843 }
 844 __syncthreads();
    // Last element of the inclusive scan is the sweep's item count; zero
    // means every list in this block is exhausted.
 845 if (sdata[gpu_block_size_m1] == 0) {
 846 break;
 847 }
 848 if (threadIdx.x == gpu_block_size_m1) {
 849 if constexpr (compressed_layout) {
 850 idx_start = global_idx_start;
 851 global_idx_start += sdata[gpu_block_size_m1];
 852 } else {
 853 idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
 854 }
 855 }
 856 __syncthreads();
 857 sdata[threadIdx.x] += idx_start;
 858 if (current_idx != llist_guard) {
 859 pperm[sdata[threadIdx.x] - 1] = current_idx;
 860 current_idx = pllist_next[current_idx];
 861 }
 862 }
 863 });
 864#else
 865 amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
 866 Abort("PermutationForDeposition only implemented for CUDA and HIP");
 867#endif
 868
 869 Gpu::Device::streamSynchronize();
 870}
871
// Build a deposition-friendly permutation for the particles of a tile: each
// particle is binned by its (possibly staggered, idx_type) cell within `bx`,
// with sub-cell refinement interleaved so same-offset particles are grouped.
// Delegates the actual permutation to the linked-list overload above.
// NOTE(review): the line carrying the function's name and its first
// parameters (perm / nitems) is missing from this extract — confirm against
// the full header.
 872template <class index_type, class PTile>
 874 const PTile& ptile, Box bx, Geometry geom, const IntVect idx_type)
 875{
    // idx_type per dim: 0 = cell-centered, 1 = nodal, 2 = refined binning.
 876 AMREX_ALWAYS_ASSERT(idx_type.allGE(0) && idx_type.allLE(2));
 877
 878 const IntVect refine_vect = max(idx_type, IntVect(1)).min(IntVect(2));
 879 const IntVect type_vect = idx_type - idx_type / 2 * 2;
 880
 881 geom.refine(refine_vect);
 882
 883 Box domain = geom.Domain();
 884
 885 bx.convert(type_vect);
 886 domain.convert(type_vect);
 887
 888 const RealVect dxi(geom.InvCellSize());
    // Offset such that rounding the scaled position yields the cell index on
    // the (possibly refined/staggered) index space.
 889 const RealVect pos_offset = Real(0.5) * (RealVect(geom.ProbLo()) + RealVect(geom.ProbHi())
 890 - RealVect(geom.CellSize()) * RealVect(domain.smallEnd() + domain.bigEnd()));
 891
 892 const int ref_product = AMREX_D_TERM(refine_vect[0], * refine_vect[1], * refine_vect[2]);
 893 const IntVect ref_offset(AMREX_D_DECL(1, refine_vect[0], refine_vect[0] * refine_vect[1]));
 894
 895 auto ptd = ptile.getConstParticleTileData();
 896 PermutationForDeposition<index_type>(perm, nitems, bx.numPts() * ref_product,
 897 [=] AMREX_GPU_DEVICE (index_type idx) noexcept
 898 {
 899 const auto p = ptd[idx];
 900
 901 IntVect iv = ((p.pos() - pos_offset) * dxi).round();
 902
 903 IntVect iv_coarse = iv / refine_vect;
 904 IntVect iv_remainder = iv - iv_coarse * refine_vect;
 905
    // Clamp into bx so out-of-box particles land in a boundary bin.
 906 iv_coarse = iv_coarse.max(bx.smallEnd());
 907 iv_coarse = iv_coarse.min(bx.bigEnd());
    // Bin = coarse cell index, with one full bx-sized block per sub-cell
    // remainder combination.
 908 return bx.index(iv_coarse) + bx.numPts() * (iv_remainder * ref_offset).sum();
 909 });
 910}
911
912template <typename P>
913std::string getDefaultCompNameReal (const int i) {
914 int first_r_name = 0;
915 if constexpr (P::is_soa_particle) {
916 if (i < AMREX_SPACEDIM) {
917 constexpr int x_in_ascii = 120;
918 std::string const name{char(x_in_ascii+i)};
919 return name;
920 }
921 first_r_name = AMREX_SPACEDIM;
922 }
923 std::string const name{("real_comp" + std::to_string(i-first_r_name))};
924 return name;
925}
926
/** Default name ("int_compN") for the i-th integer component of particle
 *  type P. The particle type does not affect the result.
 */
template <typename P>
std::string getDefaultCompNameInt (const int i) {
    return "int_comp" + std::to_string(i);
}
932
933
// Apply the permutation to the first np (non-neighbor) particles of the tile
// (non-runtime-SoA particle types): slot i receives the particle previously
// at permutations[i]. Neighbor particles beyond np are copied through
// unchanged. AoS data is moved in word-sized chunks; SoA id/real/int arrays
// are each reordered through a temporary buffer and swapped back.
// NOTE(review): several lines are missing from this extract — the temporary
// AoS buffer's declaration (line 961) and the ParallelFor/loop headers of the
// two chunk-copy kernels (lines 966 and 970) — confirm against the full
// header.
 943template <class PTile, class index_type,
 944 std::enable_if_t<!PTile::ParticleType::is_rtsoa_particle, int> foo = 0>
 945void
 946ReorderParticles (PTile& ptile, const index_type* permutations)
 947{
 948 const size_t np = ptile.numParticles();
 949 const size_t np_total = np + ptile.numNeighborParticles();
 950
 951#if defined(AMREX_USE_CUDA) && defined(_WIN32)
 952 if (!PTile::ParticleType::is_soa_particle)
 953#else
 954 if constexpr (!PTile::ParticleType::is_soa_particle)
 955#endif
 956 {
    // Move AoS particles as uint64/uint32 chunks rather than whole structs.
 957 static_assert(sizeof(typename PTile::ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
 958 using tmp_t = std::conditional_t<sizeof(typename PTile::ParticleType)%8 == 0,
 959 uint64_t, uint32_t>;
 960 constexpr std::size_t nchunks = sizeof(typename PTile::ParticleType) / sizeof(tmp_t);
 962 auto* ptmp = tmp.data();
 963 auto* paos = (tmp_t*)(ptile.getParticleTileData().m_aos);
 964 for (std::size_t ichunk = 0; ichunk < nchunks; ++ichunk) {
 965 // Do not need to reorder neighbor particles
 967 {
 968 ptmp[i] = paos[permutations[i]*nchunks+ichunk];
 969 });
 971 {
 972 paos[i*nchunks+ichunk] = ptmp[i];
 973 });
 974 }
 975 Gpu::streamSynchronize();
 976 } else {
    // Pure SoA particle: reorder the 64-bit id/cpu array.
 977 typename PTile::SoA::IdCPU tmp_idcpu;
 978 if constexpr (PTile::has_polymorphic_allocator) {
 979 tmp_idcpu.setArena(ptile.GetStructOfArrays().GetIdCPUData().arena());
 980 }
 981 tmp_idcpu.resize(np_total);
 982 auto src = ptile.GetStructOfArrays().GetIdCPUData().data();
 983 uint64_t* dst = tmp_idcpu.data();
 984 AMREX_HOST_DEVICE_FOR_1D( np_total, i,
 985 {
    // Permute the real particles; pass neighbor entries through unchanged.
 986 dst[i] = i < np ? src[permutations[i]] : src[i];
 987 });
 988
 989 Gpu::streamSynchronize();
 990
 991 ptile.GetStructOfArrays().GetIdCPUData().swap(tmp_idcpu);
 992 }
 993
 994 { // Create a scope for the temporary vector below
 995 typename PTile::RealVector tmp_real;
 996 if (ptile.NumRealComps() > 0) {
 997 if constexpr (PTile::has_polymorphic_allocator) {
 998 tmp_real.setArena(ptile.GetStructOfArrays().GetRealData(0).arena());
 999 }
1000 tmp_real.resize(np_total);
1001 }
1002 for (int comp = 0; comp < ptile.NumRealComps(); ++comp) {
1003 auto src = ptile.GetStructOfArrays().GetRealData(comp).data();
1004 ParticleReal* dst = tmp_real.data();
1005 AMREX_HOST_DEVICE_FOR_1D( np_total, i,
1006 {
1007 dst[i] = i < np ? src[permutations[i]] : src[i];
1008 });
1009
1010 Gpu::streamSynchronize();
1011
    // swap makes the freshly reordered buffer the live component array and
    // recycles the old one as the scratch buffer for the next component.
1012 ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real);
1013 }
1014 }
1015
1016 typename PTile::IntVector tmp_int;
1017 if (ptile.NumIntComps() > 0) {
1018 if constexpr (PTile::has_polymorphic_allocator) {
1019 tmp_int.setArena(ptile.GetStructOfArrays().GetIntData(0).arena());
1020 }
1021 tmp_int.resize(np_total);
1022 }
1023
1024 for (int comp = 0; comp < ptile.NumIntComps(); ++comp) {
1025 auto src = ptile.GetStructOfArrays().GetIntData(comp).data();
1026 int* dst = tmp_int.data();
1027 AMREX_HOST_DEVICE_FOR_1D( np_total , i,
1028 {
1029 dst[i] = i < np ? src[permutations[i]] : src[i];
1030 });
1031
1032 Gpu::streamSynchronize();
1033
1034 ptile.GetStructOfArrays().GetIntData(comp).swap(tmp_int);
1035 }
1036}
1037
// Apply the permutation to a tile of runtime-SoA particles: the id/cpu array
// and every real and int component array are gathered into a temporary
// buffer according to `permutations` and then copied back in place.
// NOTE(review): several lines are missing from this extract — the
// declarations of the temporary buffers (lines 1054, 1068, 1084) and the
// ParallelFor headers of each gather/copy-back kernel pair — confirm against
// the full header.
1047template <class PTile, class index_type,
1048 std::enable_if_t<PTile::ParticleType::is_rtsoa_particle, int> foo = 0>
1049void
1050ReorderParticles (PTile& ptile, const index_type* permutations)
1051{
1052 const size_t np = ptile.numParticles();
1053 {
1055
1056 auto src = ptile.GetIdCPUData().data();
1057 uint64_t* dst = tmp_idcpu.data();
    // Gather into the temporary, then copy back in permuted order.
1059 [=] AMREX_GPU_DEVICE (size_t i) {
1060 dst[i] = src[permutations[i]];
1061 });
1063 [=] AMREX_GPU_DEVICE (size_t i) {
1064 src[i] = dst[i];
1065 });
1066 }
1067 {
1069
1070 for (int comp = 0; comp < ptile.NumRealComps(); ++comp) {
1071 auto src = ptile.GetRealData(comp).data();
1072 auto dst = tmp_real.data();
1074 [=] AMREX_GPU_DEVICE (size_t i) {
1075 dst[i] = src[permutations[i]];
1076 });
1078 [=] AMREX_GPU_DEVICE (size_t i) {
1079 src[i] = dst[i];
1080 });
1081 }
1082 }
1083 {
1085
1086 for (int comp = 0; comp < ptile.NumIntComps(); ++comp) {
1087 auto src = ptile.GetIntData(comp).data();
1088 auto dst = tmp_int.data();
1090 [=] AMREX_GPU_DEVICE (size_t i) {
1091 dst[i] = src[permutations[i]];
1092 });
1094 [=] AMREX_GPU_DEVICE (size_t i) {
1095 src[i] = dst[i];
1096 });
1097 }
1098 }
1099 // stream sync for deallocation of permutations variable in user code
1100 Gpu::streamSynchronize();
1101}
1102
1103#ifdef AMREX_USE_HDF5_ASYNC
1104void async_vol_es_wait_particle();
1105void async_vol_es_wait_close_particle();
1106#endif
1107}
1108
1109#endif // include guard
#define BL_PROFILE(a)
Definition AMReX_BLProfiler.H:551
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:119
#define AMREX_HOST_DEVICE_FOR_1D(...)
Definition AMReX_GpuLaunchMacrosC.nolint.H:105
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
Array4< int const > mask
Definition AMReX_InterpFaceRegister.cpp:93
#define AMREX_D_TERM(a, b, c)
Definition AMReX_SPACE.H:172
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
__host__ __device__ BoxND & grow(int i) noexcept
Definition AMReX_Box.H:641
__host__ __device__ const IntVectND< dim > & bigEnd() const &noexcept
Return the inclusive upper bound of the box.
Definition AMReX_Box.H:123
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:356
__host__ __device__ BoxND & convert(IndexTypeND< dim > typ) noexcept
Convert the BoxND from the current type into the argument type. This may change the BoxND coordinates...
Definition AMReX_Box.H:974
__host__ __device__ bool contains(const IntVectND< dim > &p) const noexcept
Return true if argument is contained within BoxND.
Definition AMReX_Box.H:212
__host__ __device__ Long index(const IntVectND< dim > &v) const noexcept
Return offset of point from smallend; i.e. index(smallend) -> 0, bigend would return numPts()-1....
Definition AMReX_Box.H:1055
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:111
const Real * CellSize() const noexcept
Returns the cellsize for each coordinate direction.
Definition AMReX_CoordSys.H:71
const Real * InvCellSize() const noexcept
Returns the inverse cellsize for each coordinate direction.
Definition AMReX_CoordSys.H:82
Rectangular problem domain geometry.
Definition AMReX_Geometry.H:74
void refine(IntVect const &rr)
Definition AMReX_Geometry.H:412
const Real * ProbHi() const noexcept
Returns the hi end of the problem domain in each dimension.
Definition AMReX_Geometry.H:181
const Box & Domain() const noexcept
Returns our rectangular domain.
Definition AMReX_Geometry.H:211
const Real * ProbLo() const noexcept
Returns the lo end of the problem domain in each dimension.
Definition AMReX_Geometry.H:179
__host__ __device__ constexpr int min() const noexcept
minimum (no absolute values) value
Definition AMReX_IntVect.H:232
__host__ __device__ constexpr bool allGE(const IntVectND< dim > &rhs) const noexcept
Returns true if this is greater than or equal to argument for all components. NOTE: This is NOT a str...
Definition AMReX_IntVect.H:450
__host__ __device__ constexpr int max() const noexcept
maximum (no absolute values) value
Definition AMReX_IntVect.H:221
__host__ __device__ constexpr bool allLE(const IntVectND< dim > &rhs) const noexcept
Returns true if this is less than or equal to argument for all components. NOTE: This is NOT a strict...
Definition AMReX_IntVect.H:400
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:169
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
void resize(size_type a_new_size, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:728
void free_async() noexcept
Definition AMReX_PODVector.H:853
T * dataPtr() noexcept
Definition AMReX_PODVector.H:670
T * data() noexcept
Definition AMReX_PODVector.H:666
Definition AMReX_ParIter.H:118
Definition AMReX_ParticleBufferMap.H:53
GetPID getPIDFunctor() const noexcept
Definition AMReX_ParticleBufferMap.H:156
Definition AMReX_Reduce.H:453
Type value()
Definition AMReX_Reduce.H:488
Definition AMReX_Reduce.H:612
std::enable_if_t< IsFabArray< MF >::value > eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:746
amrex_real Real
Floating Point Type for Fields.
Definition AMReX_REAL.H:79
amrex_particle_real ParticleReal
Floating Point Type for Particles.
Definition AMReX_REAL.H:90
void Sum(T &v, MPI_Comm comm)
Definition AMReX_ParallelReduce.H:221
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int MyProcSub() noexcept
my sub-rank in current frame
Definition AMReX_ParallelContext.H:76
static constexpr struct amrex::Scan::Type::Exclusive exclusive
static constexpr RetSum noRetSum
Definition AMReX_Scan.H:33
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
__host__ __device__ void swapParticle(const ParticleTileData< T_ParticleType, NAR, NAI > &dst, const ParticleTileData< T_ParticleType, NAR, NAI > &src, int src_i, int dst_i) noexcept
A general single particle swapping routine that can run on the GPU.
Definition AMReX_ParticleTransformation.H:120
__host__ __device__ int getTileIndex(const IntVect &iv, const Box &box, const bool a_do_tiling, const IntVect &a_tile_size, Box &tbx)
Definition AMReX_ParticleUtil.H:185
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
void EnsureThreadSafeTiles(PC &pc)
Definition AMReX_ParticleUtil.H:730
std::string getDefaultCompNameInt(const int i)
Definition AMReX_ParticleUtil.H:928
void ReorderParticles(PTile &ptile, const index_type *permutations)
Reorder particles on the tile ptile using a the permutations array.
Definition AMReX_ParticleUtil.H:946
__host__ __device__ bool enforcePeriodic(P &p, amrex::GpuArray< amrex::Real, 3 > const &plo, amrex::GpuArray< amrex::Real, 3 > const &phi, amrex::GpuArray< amrex::ParticleReal, 3 > const &rlo, amrex::GpuArray< amrex::ParticleReal, 3 > const &rhi, amrex::GpuArray< int, 3 > const &is_per) noexcept
Definition AMReX_ParticleUtil.H:417
void removeInvalidParticles(PTile &ptile)
Definition AMReX_ParticleUtil.H:637
__host__ __device__ int numTilesInBox(const Box &box, const bool a_do_tiling, const IntVect &a_tile_size)
Definition AMReX_ParticleUtil.H:233
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
std::string getDefaultCompNameReal(const int i)
Definition AMReX_ParticleUtil.H:913
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:24
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:44
__host__ __device__ int getParticleGrid(P const &p, amrex::Array4< int > const &mask, amrex::GpuArray< amrex::Real, 3 > const &plo, amrex::GpuArray< amrex::Real, 3 > const &dxi, const Box &domain) noexcept
Definition AMReX_ParticleUtil.H:405
void PermutationForDeposition(Gpu::DeviceVector< index_type > &perm, index_type nitems, index_type nbins, F const &f)
Definition AMReX_ParticleUtil.H:759
bool SameIteratorsOK(const PC1 &pc1, const PC2 &pc2)
Definition AMReX_ParticleUtil.H:718
int partitionParticles(PTile &ptile, ParFunc const &is_left)
Reorders the ParticleTile into two partitions left [0, num_left-1] and right [num_left,...
Definition AMReX_ParticleUtil.H:468
int numParticlesOutOfRange(Iterator const &pti, int nGrow)
Returns the number of particles that are more than nGrow cells from the box correspond to the input i...
Definition AMReX_ParticleUtil.H:35
int partitionParticlesByDest(PTile &ptile, const PLocator &ploc, CellAssignor const &assignor, const ParticleBufferMap &pmap, const GpuArray< Real, 3 > &plo, const GpuArray< Real, 3 > &phi, const GpuArray< ParticleReal, 3 > &rlo, const GpuArray< ParticleReal, 3 > &rhi, const GpuArray< int, 3 > &is_per, int lev, int gid, int, int lev_min, int lev_max, int nGrow, bool remove_negative)
Definition AMReX_ParticleUtil.H:650
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
__host__ __device__ IntVect getParticleCell(P const &p, amrex::GpuArray< amrex::Real, 3 > const &plo, amrex::GpuArray< amrex::Real, 3 > const &dxi) noexcept
Returns the cell index for a given particle using the provided lower bounds and cell sizes.
Definition AMReX_ParticleUtil.H:337
A multidimensional array accessor.
Definition AMReX_Array4.H:283
Definition AMReX_ParticleUtil.H:259
BinMapper(const int *off_bins_p, const GpuArray< Real, 3 > *dxi_p, const GpuArray< Real, 3 > *plo_p, const Dim3 *lo_p, const Dim3 *hi_p, int *bin_type_array=nullptr)
Definition AMReX_ParticleUtil.H:260
__host__ __device__ unsigned int operator()(const T &ptd, int i) const
Definition AMReX_ParticleUtil.H:271
Definition AMReX_ParticleUtil.H:390
__host__ __device__ IntVect operator()(P const &p, amrex::GpuArray< amrex::Real, 3 > const &plo, amrex::GpuArray< amrex::Real, 3 > const &dxi, const Box &domain) const noexcept
Definition AMReX_ParticleUtil.H:394
Definition AMReX_Dim3.H:12
int x
Definition AMReX_Dim3.H:12
int z
Definition AMReX_Dim3.H:12
int y
Definition AMReX_Dim3.H:12
Definition AMReX_ParticleUtil.H:304
GpuArray< Real, 3 > plo
Definition AMReX_ParticleUtil.H:305
IntVect bin_size
Definition AMReX_ParticleUtil.H:308
__host__ __device__ unsigned int operator()(const ParticleType &p) const noexcept
Definition AMReX_ParticleUtil.H:313
GpuArray< Real, 3 > dxi
Definition AMReX_ParticleUtil.H:306
Box domain
Definition AMReX_ParticleUtil.H:307
Box box
Definition AMReX_ParticleUtil.H:309
Fixed-size array that can be used on GPU.
Definition AMReX_Array.H:43
Definition AMReX_GpuMemory.H:56
T * dataPtr()
Definition AMReX_GpuMemory.H:90