amrex/doxygen/AMReX__MFParallelForG_8H_source.html

 #ifndef AMREX_MF_PARALLEL_FOR_G_H_

 #define AMREX_MF_PARALLEL_FOR_G_H_

 #include <AMReX_Config.H>


 #ifdef AMREX_USE_GPU


 #include <algorithm>

 #include <cmath>

 #include <limits>


 namespace amrex {

 namespace detail {


 inline

 void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,

                             Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)

 {

     if (!ncells.empty()) {

         const int nboxes = ncells.size();

         const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));

         const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);

         a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);

         int* hp_blks = (int*)a_hp;

         auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);

         hp_blks[0] = 0;

         bool same_size = true;

         for (int i = 0; i < nboxes; ++i) {

             Long nblocks = (ncells[i] + nthreads-1) / nthreads;

             AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));

             hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);

             same_size = same_size && (ncells[i] == ncells[0]);


             new (hp_boxes+i) BoxIndexer(boxes[i]);

         }


         a_dp = (char*) The_Arena()->alloc(nbytes);

         Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);


         blocks_x.first = hp_blks;

         blocks_x.second = (same_size) ? nullptr : (int*)a_dp;

         pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);

     }

 }


 inline

 void destroy_par_for_nblocks (char* hp, char* dp)

 {

     The_Pinned_Arena()->free(hp);

     The_Arena()->free(dp);

 }

 }


 namespace experimental::detail {


 namespace parfor_mf_detail {

     template <typename F>

     AMREX_GPU_DEVICE

     auto call_f (F const& f, int b, int i, int j, int k, int) noexcept

         -> decltype(f(0,0,0,0))

     {

         f(b,i,j,k);

     }


     template <typename F>

     AMREX_GPU_DEVICE

     auto call_f (F const& f, int b, int i, int j, int k, int n) noexcept

         -> decltype(f(0,0,0,0,0))

     {

         f(b,i,j,k,n);

     }

 }


 template <int MT, typename MF, typename F>

 std::enable_if_t<IsFabArray<MF>::value>

 ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, bool, F const& f)

 {

     const auto& index_array = mf.IndexArray();

     const int nboxes = index_array.size();


     if (nboxes == 0) {

         return;

     } else if (nboxes == 1) {

         Box const& b = amrex::grow(mf.box(index_array[0]), nghost);

         amrex::ParallelFor(b, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept

         {

             parfor_mf_detail::call_f(f, 0, i, j, k, n);

         });

     } else {

         auto const& parforinfo = mf.getParForInfo(nghost,MT);

         auto par_for_blocks = parforinfo.getBlocks();

         const int nblocks = par_for_blocks.first[nboxes];

         const int block_0_size = par_for_blocks.first[1];

         const int* dp_nblocks = par_for_blocks.second;

         const BoxIndexer* dp_boxes = parforinfo.getBoxes();


 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)


         amrex::launch_global<MT>

             <<<nblocks, MT, 0, Gpu::gpuStream()>>>

             ([=] AMREX_GPU_DEVICE () noexcept

              {

                  int ibox;

                  std::uint64_t icell;

                  if (dp_nblocks) {

                      ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));

                      icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;

                  } else {

                      ibox = blockIdx.x / block_0_size;

                      icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;

                  }


 #elif defined(AMREX_USE_SYCL)


         amrex::launch<MT>(nblocks, Gpu::gpuStream(),

              [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept

              {

                  int ibox;

                  std::uint64_t icell;

                  int blockIdxx = item.get_group_linear_id();

                  int threadIdxx = item.get_local_linear_id();

                  if (dp_nblocks) {

                      ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));

                      icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;

                  } else {

                      ibox = blockIdxx / block_0_size;

                      icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;

                  }

 #endif

                  BoxIndexer const& indexer = dp_boxes[ibox];

                  if (icell < indexer.numPts()) {

                      auto [i, j, k] = indexer(icell);

                      for (int n = 0; n < ncomp; ++n) {

                          parfor_mf_detail::call_f(f, ibox, i, j, k, n);

                      }

                  }

              });

     }

     AMREX_GPU_ERROR_CHECK();

 }


 template <typename MF, typename F>

 std::enable_if_t<IsFabArray<MF>::value>

 ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f)

 {

     ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));

 }


 template <typename MF, typename F>

 std::enable_if_t<IsFabArray<MF>::value>

 ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f)

 {

     ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));

 }


 }


 }


 #endif

 #endif

AMREX_ASSERT
#define AMREX_ASSERT(EX)
Definition: AMReX_BLassert.H:38

AMREX_GPU_ERROR_CHECK
#define AMREX_GPU_ERROR_CHECK()
Definition: AMReX_GpuError.H:125

AMREX_GPU_DEVICE
#define AMREX_GPU_DEVICE
Definition: AMReX_GpuQualifiers.H:18

amrex::Arena::free
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.

amrex::Arena::alloc
virtual void * alloc(std::size_t sz)=0

amrex::BoxND< AMREX_SPACEDIM >

amrex::IntVectND< AMREX_SPACEDIM >

amrex::Vector
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition: AMReX_Vector.H:27

amrex::Vector::size
Long size() const noexcept
Definition: AMReX_Vector.H:50

amrex::Gpu::htod_memcpy_async
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition: AMReX_GpuDevice.H:251

amrex::Gpu::gpuStream
gpuStream_t gpuStream() noexcept
Definition: AMReX_GpuDevice.H:218

amrex::SundialsUserFun::f
static int f(amrex::Real t, N_Vector y_data, N_Vector y_rhs, void *user_data)
Definition: AMReX_SundialsIntegrator.H:44

amrex::detail::max
@ max
Definition: AMReX_ParallelReduce.H:17

amrex::detail::build_par_for_nblocks
void build_par_for_nblocks(char *&a_hp, char *&a_dp, std::pair< int *, int * > &blocks_x, BoxIndexer *&pboxes, Vector< Box > const &boxes, Vector< Long > const &ncells, int nthreads)
Definition: AMReX_MFParallelForG.H:15

amrex::detail::destroy_par_for_nblocks
void destroy_par_for_nblocks(char *hp, char *dp)
Definition: AMReX_MFParallelForG.H:46

amrex::experimental::detail::parfor_mf_detail::call_f
AMREX_GPU_DEVICE auto call_f(F const &f, int b, int i, int j, int k, int) noexcept -> decltype(f(0, 0, 0, 0))
Definition: AMReX_MFParallelForG.H:58

amrex::experimental::detail::ParallelFor
std::enable_if_t< IsFabArray< MF >::value > ParallelFor(MF const &mf, IntVect const &nghost, int ncomp, IntVect const &, bool, F const &f)
Definition: AMReX_MFParallelForG.H:75

amrex
Definition: AMReX_Amr.cpp:49

amrex::ParallelFor
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition: AMReX_CTOParallelForImpl.H:200

amrex::CurlCurlStateType::b
@ b

amrex::grow
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition: AMReX_Box.H:1211

amrex::bisect
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T bisect(T lo, T hi, F f, T tol=1e-12, int max_iter=100)
Definition: AMReX_Algorithm.H:105

amrex::BoxIndexer
BoxIndexerND< AMREX_SPACEDIM > BoxIndexer
Definition: AMReX_Box.H:2099

amrex::The_Pinned_Arena
Arena * The_Pinned_Arena()
Definition: AMReX_Arena.cpp:649

amrex::aligned_size
std::size_t aligned_size(std::size_t align_requirement, std::size_t size) noexcept
Given a minimum required size of size bytes, this returns the next largest arena size that will align...
Definition: AMReX_Arena.H:30

amrex::The_Arena
Arena * The_Arena()
Definition: AMReX_Arena.cpp:609

detail
Definition: AMReX_FabArrayCommI.H:896

sdcquadrature_mod::dp
integer, parameter dp
Definition: AMReX_SDCquadrature.F90:8

amrex::BoxIndexerND
Definition: AMReX_Box.H:2027

amrex::BoxIndexerND::numPts
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::uint64_t numPts() const
Definition: AMReX_Box.H:2068