docs_html/doxygen/AMReX__BaseFabUtility_8H_source.html

#ifndef AMREX_BASEFAB_UTILITY_H_

#define AMREX_BASEFAB_UTILITY_H_

#include <AMReX_Config.H>


#include <AMReX_BaseFab.H>

#include <AMReX_TypeTraits.H>


namespace amrex {


/**
 * \brief Cast components from one BaseFab to another over a region.
 *
 * For every cell (i,j,k) in bx and every n in [0, ncomp.n), the value of
 * component scomp.i+n of fromfab is converted with static_cast<Tto> and
 * stored into component dcomp.i+n of tofab.
 *
 * \tparam Tto    element type of the destination fab
 * \tparam Tfrom  element type of the source fab
 *
 * \param tofab   destination fab (written)
 * \param fromfab source fab (read only)
 * \param bx      region over which the conversion is done
 * \param scomp   first source component
 * \param dcomp   first destination component
 * \param ncomp   number of components to convert
 */
template <class Tto, class Tfrom>
AMREX_GPU_HOST_DEVICE
void
cast (BaseFab<Tto>& tofab, BaseFab<Tfrom> const& fromfab,
      Box const& bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
{
    // Array views are captured by value in the loop body below.
    auto const dst = tofab.array();
    auto const src = fromfab.const_array();

    // Pull the component offsets out of their descriptor wrappers once.
    auto const src_first = scomp.i;
    auto const dst_first = dcomp.i;

    amrex::LoopConcurrent(bx, ncomp.n,
        [=] (int i, int j, int k, int n) noexcept
        {
            auto const& from_value = src(i, j, k, src_first + n);
            dst(i, j, k, dst_first + n) = static_cast<Tto>(from_value);
        });
}


/**
 * \brief Fill an array-of-structs BaseFab by invoking a functor per cell.
 *
 * For every cell (i,j,k) of aos_fab.box(), f(s, i, j, k) is called, where s
 * is a reference to the STRUCT that ends up stored for that cell.
 *
 * CPU path: f is handed a reference to the fab element itself.
 *
 * GPU path (AMREX_USE_GPU and Gpu::inLaunchRegion()): one thread per cell
 * default-constructs a STRUCT in the block's dynamic shared memory
 * (nthreads_per_block * sizeof(STRUCT) bytes) and passes it to f. After a
 * block-wide barrier the block copies the staged structs to the fab's memory
 * one T (= STRUCT::value_type) at a time, with adjacent threads writing
 * adjacent T elements. The kernel is launched on Gpu::gpuStream(); no
 * explicit stream synchronization is done here.
 *
 * An empty box is a no-op on both paths. (Without the early return, the GPU
 * path would attempt a launch with zero blocks, which is not a valid kernel
 * configuration.)
 *
 * \tparam STRUCT  element type of the fab. Must be at most 36*8 bytes,
 *                 trivially copyable, trivially destructible, expose a
 *                 value_type, and have a size that is an exact multiple of
 *                 sizeof(value_type).
 * \tparam F       callable as f(STRUCT&, int, int, int). It is captured by
 *                 value into the device lambda on the GPU path.
 *
 * \param aos_fab  fab to fill
 * \param f        per-cell functor
 */
template <typename STRUCT, typename F>
requires ((sizeof(STRUCT)<=36*8) &&
          AMREX_IS_TRIVIALLY_COPYABLE(STRUCT) &&
          std::is_trivially_destructible_v<STRUCT>)
void fill (BaseFab<STRUCT>& aos_fab, F const& f)
{
    Box const& box = aos_fab.box();
    auto const& aos = aos_fab.array();
    using T = typename STRUCT::value_type;
    // Number of T elements that make up one STRUCT.
    constexpr int STRUCTSIZE = sizeof(STRUCT)/sizeof(T);
    static_assert(sizeof(STRUCT) == sizeof(T)*STRUCTSIZE,
                  "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
#ifdef AMREX_USE_GPU
    if (Gpu::inLaunchRegion()) {
        // Nothing to fill: bail out before computing a zero-block launch,
        // which the GPU runtime would reject.
        auto const npts = box.numPts();
        if (npts <= 0) { return; }

        BoxIndexer indexer(box);
        const auto ntotcells = std::uint64_t(npts);
        // Larger structs use smaller blocks, which keeps the per-block
        // shared-memory request (nthreads_per_block * sizeof(STRUCT)) down.
        constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
        std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
        AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
        auto nblocks = int(nblocks_long);
        // One staged STRUCT per thread.
        std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
        // Destination viewed as a flat array of T for the element-wise copy-out.
        T* p = (T*)aos_fab.dataPtr();
#ifdef AMREX_USE_SYCL
        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
        {
            auto const icell = std::uint64_t(handler.globalIdx());
            std::uint64_t const blockDimx = handler.blockDim();
            std::uint64_t const threadIdxx = handler.threadIdx();
            std::uint64_t const blockIdxx = handler.blockIdx();
            auto const shared = (T*)handler.sharedMemory();
            // Threads past the last cell skip the fill but still reach the barrier.
            if (icell < indexer.numPts()) {
                auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
                auto [i, j, k] = indexer(icell);
                f(*ga, i, j, k);
            }
            handler.sharedBarrier();
            // Copy out only the structs this block actually produced: the last
            // block may cover fewer than blockDimx cells.
            for (std::uint64_t m = threadIdxx,
                     mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
                 m < mend; m += blockDimx) {
                p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
            }
        });
#else
        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            // 64-bit arithmetic so the flat cell index cannot wrap.
            std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
            Gpu::SharedMemory<T> gsm;
            T* const shared = gsm.dataPtr();
            // Threads past the last cell skip the fill but still reach the barrier.
            if (icell < indexer.numPts()) {
                auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
                auto [i, j, k] = indexer(icell);
                f(*ga, i, j, k);
            }
            __syncthreads();
            // Copy out only the structs this block actually produced: the last
            // block may cover fewer than blockDim.x cells.
            for (std::uint64_t m = threadIdx.x,
                     mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
                 m < mend; m += blockDim.x) {
                p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
            }
        });
#endif
    } else
#endif
    {
        // Host path: let f write straight into the fab element.
        amrex::LoopOnCpu(box, [&] (int i, int j, int k) noexcept
        {
            f(aos(i,j,k), i, j, k);
        });
    }
}


/**
 * \brief Transpose a 3D array of shape (nx, ny, nz) from C-order to
 *        Fortran-order storage.
 *
 * Element (i,j,k) is read from pi[k + (j + i*ny)*nz] and written to
 * po[i + (j + k*ny)*nx].
 *
 * Depending on the build this runs as
 *  - a CUDA/HIP kernel on Gpu::gpuStream() using 32x16-thread blocks, each
 *    handling a 32x32 (x,z) tile staged through shared memory, with one
 *    grid layer per j;
 *  - the equivalent SYCL kernel (32x8 work-items per group) submitted to
 *    the queue of Gpu::gpuStream();
 *  - a cache-blocked host loop, optionally OpenMP-parallel.
 * No explicit stream/queue synchronization is done here.
 *
 * If any of nx, ny, nz is not positive the function returns without doing
 * anything. (Previously such a call was a no-op on the host path but
 * produced a zero-sized launch configuration on the GPU paths.)
 *
 * NOTE(review): on the CUDA/HIP path ny is used directly as gridDim.z and
 * ceil(nz/32) as gridDim.y; both are subject to the device's grid-size
 * limits -- confirm callers stay within them.
 *
 * \param pi  input array, C order; must not be the same pointer as po
 * \param po  output array, Fortran order
 * \param nx  extent of the first (slowest in C order) dimension
 * \param ny  extent of the middle dimension
 * \param nz  extent of the last (fastest in C order) dimension
 */
template <typename T>
void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
{
    AMREX_ALWAYS_ASSERT(pi != po);

    // Empty array: nothing to move, and the GPU paths must not be handed a
    // zero-sized grid.
    if (nx <= 0 || ny <= 0 || nz <= 0) { return; }

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

    constexpr int tile_dim = 32;
    constexpr int block_rows = 16;
    constexpr int nthreads = tile_dim*block_rows;

    // Each block has tile_dim x block_rows threads. They work on a tile_dim
    // x tile_dim tile.

    dim3 block{unsigned(tile_dim), unsigned(block_rows), 1};
    dim3 grid{unsigned((nx+tile_dim-1)/tile_dim),
              unsigned((nz+tile_dim-1)/tile_dim), unsigned(ny)};

    AMREX_LAUNCH_KERNEL(nthreads, grid, block, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE ()
    {
        __shared__ T tile[tile_dim][tile_dim+1]; // +1 to avoid bank conflicts

        // Load phase: threadIdx.x runs along z so that adjacent threads read
        // adjacent input elements.
        int k = blockIdx.y * tile_dim + threadIdx.x; // for loading z-direction
        int i = blockIdx.x * tile_dim + threadIdx.y; // for loading x-direction

        int j = blockIdx.z; // for y-direction

        if (k < nz) {
            for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
                if (i < nx) {
                    //      x               z
                    tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
                }
            }
        }

        // All loads into the tile must land before any thread reads it back
        // transposed. The barrier sits outside the bounds checks so every
        // thread of the block reaches it.
        __syncthreads();

        // Store phase: threadIdx.x now runs along x so that adjacent threads
        // write adjacent output elements.
        i = blockIdx.x * tile_dim + threadIdx.x; // for storing x-direction
        k = blockIdx.y * tile_dim + threadIdx.y; // for storing z-direction

        if (i < nx) {
            for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
                if (k < nz) {
                    po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
                }
            }
        }
    });

#elif defined(AMREX_USE_SYCL)

    constexpr int tile_dim = 32;
    constexpr int block_rows = 8;

    // Each block has tile_dim x block_rows threads. They work on a tile_dim
    // x tile_dim tile.

    // SYCL ranges list the slowest dimension first, so the order here is
    // reversed relative to the (x, y, z) dim3 layout used below.
    sycl::range<3> block{std::size_t(1), std::size_t(block_rows), std::size_t(tile_dim)};
    sycl::range<3> grid{std::size_t(ny), std::size_t((nz+tile_dim-1)/tile_dim),
                                         std::size_t((nx+tile_dim-1)/tile_dim)};
    sycl::range<3> global_size{grid[0]*block[0],
                               grid[1]*block[1],
                               grid[2]*block[2]};

    auto& q = *(Gpu::gpuStream().queue);
    try {
      q.submit([&] (sycl::handler& h)
      {
        // Work-group local tile; the extra column mirrors the padding used
        // by the CUDA/HIP kernel.
        auto tile = sycl::local_accessor<T,2>(sycl::range<2>(tile_dim,tile_dim+1),h);

        h.parallel_for(sycl::nd_range<3>(global_size, block),
        [=] (sycl::nd_item<3> item)
        {
            // Rebuild CUDA-style indices so the body matches the CUDA/HIP kernel.
            auto group = item.get_group();
            dim3 blockIdx{unsigned(group.get_group_id(2)),
                          unsigned(group.get_group_id(1)),
                          unsigned(group.get_group_id(0))};
            dim3 threadIdx{unsigned(item.get_local_id(2)),
                           unsigned(item.get_local_id(1)),
                           unsigned(item.get_local_id(0))};

            int k = blockIdx.y * tile_dim + threadIdx.x; // for loading z-direction
            int i = blockIdx.x * tile_dim + threadIdx.y; // for loading x-direction

            int j = blockIdx.z; // for y-direction

            if (k < nz) {
                for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
                    if (i < nx) {
                        //      x               z
                        tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
                    }
                }
            }

            // Tile must be fully written before it is read back transposed.
            item.barrier(sycl::access::fence_space::local_space);

            i = blockIdx.x * tile_dim + threadIdx.x; // for storing x-direction
            k = blockIdx.y * tile_dim + threadIdx.y; // for storing z-direction

            if (i < nx) {
                for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
                    if (k < nz) {
                        po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
                    }
                }
            }
        });
      });
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("transposeCtoF: ")+ex.what()+"!!!!!");
    }

#else

    // Host path: walk the (x,z) plane in bx x bz blocks for each j.
    constexpr int bx = 32;
    constexpr int bz = 32;

    // Strides, computed in size_t so large arrays do not overflow int.
    std::size_t nxy = std::size_t(nx) * ny; // output stride between k planes
    std::size_t nyz = std::size_t(ny) * nz; // input stride between i planes

#ifdef AMREX_USE_OMP
#pragma omp parallel for collapse(3)
#endif
    for (int j = 0; j < ny; ++j) {
        for (int k0 = 0; k0 < nz; k0 += bz) {
            for (int i0 = 0; i0 < nx; i0 += bx) {
                // Clip the block at the array edge.
                int imax = std::min(i0+bx, nx);
                int kmax = std::min(k0+bz, nz);
                // Fold the j offset into the base pointers once per block.
                auto      * AMREX_RESTRICT pdst = po + j*std::size_t(nx);
                auto const* AMREX_RESTRICT psrc = pi + j*std::size_t(nz);
                for (int i = i0; i < imax; ++i) {
                    AMREX_PRAGMA_SIMD
                    for (int k = k0; k < kmax; ++k) {
                        pdst[i + k*nxy] = psrc[k + i*nyz];
                    }
                }
            }
        }
    }

#endif
}


/**
 * \brief Transpose a 2D array of shape (nx, ny) from C-order to
 *        Fortran-order storage.
 *
 * Forwards to the five-argument transposeCtoF, passing 1 as the middle
 * extent and ny as the last extent.
 *
 * \param pi  input array
 * \param po  output array
 * \param nx  extent of the first dimension
 * \param ny  extent of the second dimension
 */
template <typename T>
void transposeCtoF (T const* pi, T* po, int nx, int ny)
{
    // The 2D case is the 3D case with a unit-length middle dimension.
    constexpr int unit_middle_extent = 1;
    transposeCtoF(pi, po, nx, unit_middle_extent, ny);
}


}


#endif

AMREX_ASSERT
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38

AMREX_ALWAYS_ASSERT
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50

AMReX_BaseFab.H
BaseFab container template providing box-based field storage.

AMREX_PRAGMA_SIMD
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:85

AMREX_RESTRICT
#define AMREX_RESTRICT
Definition AMReX_Extension.H:37

AMREX_LAUNCH_KERNEL
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37

AMREX_GPU_DEVICE
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18

AMREX_GPU_HOST_DEVICE
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20

pdst
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1130

AMReX_TypeTraits.H

AMREX_IS_TRIVIALLY_COPYABLE
#define AMREX_IS_TRIVIALLY_COPYABLE(T)
Definition AMReX_TypeTraits.H:10

amrex::BaseFab
A FortranArrayBox(FAB)-like object.
Definition AMReX_BaseFab.H:222

amrex::BaseFab::box
const Box & box() const noexcept
Returns the domain (box) where the array is defined.
Definition AMReX_BaseFab.H:357

amrex::BaseFab::array
Array4< T const > array() const noexcept
Create an Array4 view over all components.
Definition AMReX_BaseFab.H:475

amrex::BaseFab::dataPtr
T * dataPtr(int n=0) noexcept
Returns a pointer to an object of type T that is the value of the Nth component associated with the c...
Definition AMReX_BaseFab.H:418

amrex::BoxND< 3 >

amrex::BoxND::numPts
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:385

amrex::Gpu::inLaunchRegion
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:88

amrex::Gpu::gpuStream
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291

amrex
Definition AMReX_Amr.cpp:50

amrex::fill
void fill(BaseFab< STRUCT > &aos_fab, F const &f)
Fill an array-of-structs BaseFab by invoking a functor per cell.
Definition AMReX_BaseFabUtility.H:58

amrex::transposeCtoF
void transposeCtoF(T const *pi, T *po, int nx, int ny, int nz)
Transpose a 3D array of shape (nx, ny, nz) from C-order to Fortran-order storage.
Definition AMReX_BaseFabUtility.H:142

amrex::Order::F
@ F

amrex::cast
__host__ __device__ void cast(BaseFab< Tto > &tofab, BaseFab< Tfrom > const &fromfab, Box const &bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
Cast components from one BaseFab to another over a region.
Definition AMReX_BaseFabUtility.H:30

amrex::Abort
void Abort(const std::string &msg)
Print a fatal-error message to stderr and abort execution.
Definition AMReX.cpp:241

amrex::int
const int[]
Definition AMReX_BLProfiler.cpp:1664

amrex::LoopOnCpu
void LoopOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:365

amrex::LoopConcurrent
__host__ __device__ void LoopConcurrent(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:152

amrex::BoxIndexerND
Utility that maps flattened point indices back to IntVectND coordinates.
Definition AMReX_Box.H:2494

amrex::BoxIndexerND::numPts
__host__ __device__ std::uint64_t numPts() const
Return the number of points covered by the indexed box.
Definition AMReX_Box.H:2552

amrex::DestComp
Destination-component descriptor.
Definition AMReX_BaseFab.H:107

amrex::Gpu::Handler
Definition AMReX_GpuTypes.H:86

amrex::Gpu::SharedMemory
Definition AMReX_GpuMemory.H:126

amrex::Gpu::SharedMemory::dataPtr
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:127

amrex::NumComps
Number-of-components descriptor.
Definition AMReX_BaseFab.H:114

amrex::SrcComp
Source-component descriptor.
Definition AMReX_BaseFab.H:100