docs_html/doxygen/AMReX__GpuLaunch_8H_source.html

#ifndef AMREX_GPU_LAUNCH_H_

#define AMREX_GPU_LAUNCH_H_

#include <AMReX_Config.H>


#include <AMReX_GpuQualifiers.H>

#include <AMReX_GpuKernelInfo.H>

#include <AMReX_GpuControl.H>

#include <AMReX_GpuTypes.H>

#include <AMReX_GpuError.H>

#include <AMReX_GpuRange.H>

#include <AMReX_GpuDevice.H>

#include <AMReX_GpuMemory.H>

#include <AMReX_GpuReduce.H>

#include <AMReX_Arena.H>

#include <AMReX_Tuple.H>

#include <AMReX_Box.H>

#include <AMReX_Loop.H>

#include <AMReX_Extension.H>

#include <AMReX_BLassert.H>

#include <AMReX_TypeTraits.H>

#include <AMReX_GpuLaunchGlobal.H>

#include <AMReX_RandomEngine.H>

#include <AMReX_Algorithm.H>

#include <AMReX_Math.H>

#include <AMReX_Vector.H>

#include <concepts>

#include <cstddef>

#include <limits>

#include <algorithm>

#include <utility>


#define AMREX_GPU_NCELLS_PER_THREAD 3

#define AMREX_GPU_Y_STRIDE 1

#define AMREX_GPU_Z_STRIDE 1


#ifdef AMREX_USE_CUDA

// Kernel-launch helpers for CUDA. Both macros forward the variadic arguments
// to amrex::launch_global using the <<<blocks, threads, sharedMem, stream>>>
// execution configuration. AMREX_LAUNCH_KERNEL instantiates launch_global<MT>
// (MT is presumably the max-threads launch bound -- confirm against
// AMReX_GpuLaunchGlobal.H); AMREX_LAUNCH_KERNEL_NOBOUND leaves the template
// arguments to be deduced.
// NOTE: the macro body must directly follow the line-continuation backslash.
// A blank line in between would leave the macro empty.
#  define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
        amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)

#  define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
        amrex::launch_global    <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)

#elif defined(AMREX_USE_HIP)

// Same two helpers for HIP, expressed through hipLaunchKernelGGL.
#  define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
        hipLaunchKernelGGL(amrex::launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)

#  define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
        hipLaunchKernelGGL(amrex::launch_global    , blocks, threads, sharedMem, stream, __VA_ARGS__)

#endif


namespace amrex {


// We cannot take rvalue lambdas.

// ************************************************

//  Variadic lambda function wrappers for C++ CUDA/HIP Kernel calls.


#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)


    // Device-side helper: invokes every callable in `fs` with no arguments.
    // The callables are perfectly forwarded; their return values are discarded.
    template <typename... Lambdas>

    AMREX_GPU_DEVICE void call_device (Lambdas&&... fs) noexcept

    {

        // Fold over the comma operator: one call per pack element, left to right.
        (std::forward<Lambdas>(fs)(), ...);

    }


    // Kernel entry point (AMREX_GPU_GLOBAL): receives the callables by value,
    // runs `f0` first and then the remaining ones through call_device.
    template<class L, class... Lambdas>

    AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }


#endif


// CPU variation


    // Host launch of a single callable: it is forwarded and invoked right away.
    template<class L>
    void launch_host (L&& f0) noexcept
    {
        std::forward<L>(f0)();
    }

    // Host launch of several callables: they are invoked one after another in
    // argument order, each one perfectly forwarded.
    template<class L, class... Lambdas>
    void launch_host (L&& f0, Lambdas&&... fs) noexcept
    {
        std::forward<L>(f0)();
        // Each remaining callable goes through the single-callable overload.
        (launch_host(std::forward<Lambdas>(fs)), ...);
    }


    template <class T> class LayoutData;

    class FabArrayBase;


namespace Gpu {


// Threads per block used by ParallelFor: AMREX_GPU_MAX_THREADS when
// AMREX_USE_GPU is defined, 0 otherwise.
    constexpr std::size_t numThreadsPerBlockParallelFor ()
    {
#ifdef AMREX_USE_GPU
        return AMREX_GPU_MAX_THREADS;
#else
        return 0;
#endif
    }


    // Box handled by the caller for linear index `offset` within `bx`.
    //
    // AMREX_IF_ON_DEVICE branch: `offset` is decoded into (i,j,k) using the
    // extents from bx.length3d(), with i varying fastest, then j, then k. The
    // result is `bx` intersected with the one-cell box at smallEnd()+(i,j,k).
    // AMREX_IF_ON_HOST branch: `offset` is ignored and `bx` is returned as is.
    AMREX_GPU_HOST_DEVICE
    inline
    Box getThreadBox (const Box& bx, Long offset) noexcept
    {
        AMREX_IF_ON_DEVICE((
            const auto extent = bx.length3d();
            const auto plane = extent[0]*extent[1];
            const Long k = offset / plane;
            const Long in_plane = offset - k*plane;
            const Long j = in_plane / extent[0];
            const Long i = in_plane - j*extent[0];
            IntVect cell{AMREX_D_DECL(static_cast<int>(i),
                                      static_cast<int>(j),
                                      static_cast<int>(k))};
            cell += bx.smallEnd();
            return (bx & Box(cell,cell,bx.type()));
        ))
        AMREX_IF_ON_HOST((
            amrex::ignore_unused(offset);
            return bx;
        ))
    }


// ************************************************


#ifdef AMREX_USE_GPU


    struct ExecutionConfig {


        ExecutionConfig () noexcept {

            Gpu::Device::grid_stride_threads_and_blocks(numBlocks,numThreads);

        }


        ExecutionConfig (const Box& box) noexcept {

            // If we change this, we must make sure it doesn't break FabArrayUtility Reduce*,

            // which assumes the decomposition is 1D.

            Gpu::Device::n_threads_and_blocks( ((box.numPts()+AMREX_GPU_NCELLS_PER_THREAD-1)/AMREX_GPU_NCELLS_PER_THREAD), numBlocks, numThreads );

#if 0

            Box b = amrex::surroundingNodes(box);

            b -= box.smallEnd();

            b.coarsen(IntVect(AMREX_D_DECL(1,AMREX_GPU_Y_STRIDE,AMREX_GPU_Z_STRIDE)));

            Gpu::Device::c_threads_and_blocks(b.loVect(), b.hiVect(), numBlocks, numThreads);

#endif

        }


        ExecutionConfig (const Box& box, int comps) noexcept {

            const Box& b = amrex::surroundingNodes(box);

            Gpu::Device::c_comps_threads_and_blocks(b.loVect(), b.hiVect(), comps, numBlocks, numThreads);

        }


        ExecutionConfig (Long N) noexcept {

            Gpu::Device::n_threads_and_blocks(N, numBlocks, numThreads);

        }


        ExecutionConfig (dim3 nb, dim3 nt, std::size_t sm=0) noexcept

            : numBlocks(nb), numThreads(nt), sharedMem(sm) {}


        dim3 numBlocks;

        dim3 numThreads;

        std::size_t sharedMem = 0;

    };


    // Build a 1D launch configuration with MT threads per block for N items.
    // The block count is ceil(max(N,1)/MT), clamped by the two limits below.
    template <int MT>
    ExecutionConfig
    makeExecutionConfig (Long N) noexcept
    {
        AMREX_ASSERT(MT % Gpu::Device::warp_size == 0);

        // blocks needed to cover max(N,1) items
        const Long wanted = (std::max(N,Long(1)) + MT - 1) / MT;
        // ensure that blockDim.x*gridDim.x does not overflow
        const Long cap_threads = Long(std::numeric_limits<unsigned int>::max()/MT);
        // ensure that the maximum grid size of 2^31-1 won't be exceeded
        const Long cap_grid = Long(std::numeric_limits<int>::max());

        ExecutionConfig ec(dim3{}, dim3{});
        ec.numBlocks.x = std::min(std::min(wanted, cap_threads), cap_grid);
        ec.numThreads.x = MT;
        return ec;
    }


    // Box overload: same as makeExecutionConfig<MT>(Long) with N = box.numPts().
    template <int MT>
    ExecutionConfig
    makeExecutionConfig (const Box& box) noexcept
    {
        const auto npts = box.numPts();
        return makeExecutionConfig<MT>(npts);
    }


    // Description of one kernel launch in a sequence of launches.
    struct ExecConfig {
        Long start_idx; // element index at which this launch starts
        int nblocks;    // number of blocks in this launch
    };


    // Split N elements into as many kernel launches as needed so that no
    // launch exceeds the maximum block count and kernels do not need
    // grid-stride loops. Each entry records the starting element index and
    // the number of blocks (MT threads each) of one launch.
    template <int MT>
    Vector<ExecConfig> makeNExecutionConfigs (Long N) noexcept
    {
        // Max # of blocks in a kernel launch
        constexpr int max_blocks = std::numeric_limits<int>::max();
        // Max # of threads in a kernel launch
        constexpr Long max_elems = Long(MT) * max_blocks;
        // # of launches needed for N elements
        const auto nlaunch = int((N+max_elems-1)/max_elems);

        Vector<ExecConfig> configs(nlaunch);
        Long remaining = N;
        Long next_start = 0;
        for (auto& cfg : configs) {
            // A full launch unless what is left fits into fewer blocks.
            int nb = max_blocks;
            if (remaining > max_elems) {
                remaining -= max_elems;
            } else {
                nb = int((remaining+MT-1)/MT);
            }
            cfg.start_idx = next_start;
            cfg.nblocks = nb;
            next_start += Long(nb) * MT;
        }
        return configs;
    }


    // Box overload: same as makeNExecutionConfigs<MT>(Long) with N = box.numPts().
    template <int MT, int dim>
    Vector<ExecConfig> makeNExecutionConfigs (BoxND<dim> const& box) noexcept
    {
        const auto npts = box.numPts();
        return makeNExecutionConfigs<MT>(npts);
    }


#endif


}

}


#ifdef AMREX_USE_SYCL

namespace amrex::detail {


    // True when the combined sizeof of all types in L exceeds 1792 bytes.
    // An empty pack sums to 0 and is therefore never "big".
    template <typename... L> constexpr bool is_big_kernel ()
    {
        constexpr auto total_bytes = (sizeof(L) + ... + 0);
        constexpr auto threshold = 1792;
        return total_bytes > threshold;
    }


    template <typename... L>

    struct SyclKernelDevPtr

    {

        using Ls = GpuTuple<L...>;

        Ls* m_dp = nullptr;

        Ls* m_hp = nullptr;

        gpuStream_t m_stream;


        SyclKernelDevPtr (L const&... f, gpuStream_t const& stream)

            : m_stream(stream)

        {

            if constexpr (is_big_kernel<L...>()) {

                std::size_t sz = sizeof(Ls);

                m_dp = (Ls*)The_Arena()->alloc(sz);

                m_hp = (Ls*)The_Pinned_Arena()->alloc(sz);

                new (m_hp) Ls(f...);

                auto* l_hp = (void const*)m_hp;

                auto* l_dp = (void*)m_dp;

                stream.queue->submit([&] (sycl::handler& h) {

                    h.memcpy(l_dp, l_hp, sz);

                });

            } else {

                amrex::ignore_unused(f...);

            }

        }


        ~SyclKernelDevPtr ()

        {

            if constexpr (is_big_kernel<Ls>()) {

                try {

                    m_stream.queue->wait_and_throw();

                } catch (sycl::exception const& ex) {

                    if (m_dp) {

                        The_Arena()->free((void*)m_dp);

                        m_dp = nullptr;

                    }

                    if (m_hp) {

                        m_hp->~Ls();

                        The_Pinned_Arena()->free((void*)m_hp);

                        m_hp = nullptr;

                    }

                    amrex::Abort(std::string("~SyclKernelDevPtr: ")+ex.what());

                }

                if (m_dp) {

                    The_Arena()->free((void*)m_dp);

                    m_dp = nullptr;

                }

                if (m_hp) {

                    m_hp->~Ls();

                    The_Pinned_Arena()->free((void*)m_hp);

                    m_hp = nullptr;

                }

            }

        }


        template <int N>

        auto get () const

        {

            using pointer_t = std::add_pointer_t<

                std::add_const_t<typename std::tuple_element<N, Ls>::type>>;

            if (m_hp && m_dp) {

                std::ptrdiff_t offset = (char*)(&(amrex::get<N>(*m_hp))) - (char*)m_hp;

                return pointer_t((char const*)m_dp + offset);

            } else {

                return pointer_t(nullptr);

            }

        }

    };


}

#endif


#ifdef AMREX_USE_GPU

#include <AMReX_GpuLaunchMacrosG.H>

#include <AMReX_GpuLaunchFunctsG.H>

#else

#include <AMReX_GpuLaunchMacrosC.H>

#include <AMReX_GpuLaunchFunctsC.H>

#include <AMReX_SIMD.H>

#endif

#include <AMReX_GpuLaunchFunctsSIMD.H>


#include <AMReX_GpuLaunch.nolint.H>


#include <AMReX_CTOParallelForImpl.H>


namespace amrex {


#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)


// ParallelForOMP over the 1D range [0,n). In this configuration (GPU build, or
// no OpenMP) it simply forwards to ParallelFor.
template <std::integral T, typename L >
void ParallelForOMP (T n, L const& f) noexcept
{
    ParallelFor(n, f);
}


// ParallelForOMP over the cells of `box`. In this configuration (GPU build, or
// no OpenMP) it simply forwards to ParallelFor.
template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    ParallelFor(box, f);
}


// ParallelForOMP over the cells of `box` and `ncomp` components. In this
// configuration (GPU build, or no OpenMP) it simply forwards to ParallelFor.
template <std::integral T, typename L >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    ParallelFor(box, ncomp, f);
}


#else /* !defined(AMREX_USE_GPU) && defined(AMREX_USE_OMP) */


// CPU + OpenMP version: calls f(idx) for every idx in [0,n), with the loop
// distributed by `omp parallel for`.
template <std::integral T, typename L >
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (T n, L const& f) noexcept
{
#pragma omp parallel for
    for (T idx = 0; idx < n; ++idx) {
        f(idx);
    }
}


// CPU + OpenMP version: calls f(i,j,k) for every cell of `box`, passing 0 for
// the dimensions beyond AMREX_SPACEDIM. The outer loop(s) are distributed by
// OpenMP; in 2D and 3D the innermost i-loop carries AMREX_PRAGMA_SIMD.
template <typename L>
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    const auto first = amrex::lbound(box);
    const auto last = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for
    for (int ix = first.x; ix <= last.x; ++ix) {
        f(ix,0,0);
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for
    for (int iy = first.y; iy <= last.y; ++iy) {
        AMREX_PRAGMA_SIMD
        for (int ix = first.x; ix <= last.x; ++ix) {
            f(ix,iy,0);
        }
    }
#else
#pragma omp parallel for collapse(2)
    for (int iz = first.z; iz <= last.z; ++iz) {
        for (int iy = first.y; iy <= last.y; ++iy) {
            AMREX_PRAGMA_SIMD
            for (int ix = first.x; ix <= last.x; ++ix) {
                f(ix,iy,iz);
            }
        }
    }
#endif
}


// CPU + OpenMP version: calls f(i,j,k,comp) for every cell of `box` and every
// component in [0,ncomp), passing 0 for the dimensions beyond AMREX_SPACEDIM.
// The component loop is outermost and is collapsed with the outer spatial
// loop(s); in 2D and 3D the innermost i-loop carries AMREX_PRAGMA_SIMD.
template <std::integral T, typename L >
AMREX_ATTRIBUTE_FLATTEN_FOR
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    const auto first = amrex::lbound(box);
    const auto last = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for collapse(2)
    for (T comp = 0; comp < ncomp; ++comp) {
        for (int ix = first.x; ix <= last.x; ++ix) {
            f(ix,0,0,comp);
        }
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for collapse(2)
    for (T comp = 0; comp < ncomp; ++comp) {
        for (int iy = first.y; iy <= last.y; ++iy) {
            AMREX_PRAGMA_SIMD
            for (int ix = first.x; ix <= last.x; ++ix) {
                f(ix,iy,0,comp);
            }
        }
    }
#else
#pragma omp parallel for collapse(3)
    for (T comp = 0; comp < ncomp; ++comp) {
        for (int iz = first.z; iz <= last.z; ++iz) {
            for (int iy = first.y; iy <= last.y; ++iy) {
                AMREX_PRAGMA_SIMD
                for (int ix = first.x; ix <= last.x; ++ix) {
                    f(ix,iy,iz,comp);
                }
            }
        }
    }
#endif
}


#endif


}


#endif

AMReX_Algorithm.H
General-purpose algorithm utilities available on both host and device.

AMReX_Arena.H
Memory arena base class and global arena accessors.

AMReX_BLassert.H
Assertion macros used across AMReX for runtime consistency checks.

AMREX_ASSERT
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38

AMReX_Box.H
Integer-lattice boxes and helpers for defining index-space regions.

AMReX_CTOParallelForImpl.H

AMReX_Extension.H
Compiler- and backend-specific extension macros (e.g., restrict, SIMD, inline).

AMREX_PRAGMA_SIMD
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:85

AMREX_ATTRIBUTE_FLATTEN_FOR
#define AMREX_ATTRIBUTE_FLATTEN_FOR
Definition AMReX_Extension.H:156

AMReX_GpuControl.H

AMReX_GpuDevice.H

AMReX_GpuError.H

AMReX_GpuKernelInfo.H

AMReX_GpuLaunchFunctsC.H

AMReX_GpuLaunchFunctsG.H

AMReX_GpuLaunchFunctsSIMD.H

AMReX_GpuLaunchGlobal.H

AMReX_GpuLaunchMacrosC.H

AMReX_GpuLaunchMacrosG.H

AMREX_GPU_Z_STRIDE
#define AMREX_GPU_Z_STRIDE
Definition AMReX_GpuLaunch.H:34

AMREX_GPU_NCELLS_PER_THREAD
#define AMREX_GPU_NCELLS_PER_THREAD
Definition AMReX_GpuLaunch.H:32

AMREX_GPU_Y_STRIDE
#define AMREX_GPU_Y_STRIDE
Definition AMReX_GpuLaunch.H:33

AMReX_GpuLaunch.nolint.H

AMReX_GpuMemory.H

AMReX_GpuQualifiers.H

AMREX_IF_ON_DEVICE
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56

AMREX_GPU_GLOBAL
#define AMREX_GPU_GLOBAL
Definition AMReX_GpuQualifiers.H:19

AMREX_IF_ON_HOST
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58

AMREX_GPU_DEVICE
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18

AMREX_GPU_HOST_DEVICE
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20

AMReX_GpuRange.H

AMReX_GpuReduce.H

AMReX_GpuTypes.H

offset
Array4< int const  > offset
Definition AMReX_HypreMLABecLap.cpp:1129

AMReX_Loop.H

AMReX_Math.H

AMReX_RandomEngine.H

AMReX_SIMD.H

AMREX_D_DECL
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171

AMReX_Tuple.H

AMReX_TypeTraits.H

AMReX_Vector.H

amrex::Arena::free
virtual void free(void *pt)=0
Free a previously allocated block pointed to by pt.

amrex::Arena::alloc
virtual void * alloc(std::size_t sz)=0
Allocate sz bytes from this arena.

amrex::BoxND< 3 >

amrex::BoxND::hiVect
__host__ __device__ const int * hiVect() const &noexcept
Return a pointer to the high-end coordinates (useful for Fortran calls).
Definition AMReX_Box.H:210

amrex::BoxND::loVect
__host__ __device__ const int * loVect() const &noexcept
Return a pointer to the low-end coordinates (useful for Fortran calls).
Definition AMReX_Box.H:205

amrex::BoxND::coarsen
__host__ __device__ BoxND & coarsen(int ref_ratio) noexcept
Coarsen BoxND by given (positive) refinement ratio. NOTE: if type(dir) = CELL centered: lo <- lo/rati...
Definition AMReX_Box.H:754

amrex::BoxND::smallEnd
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:124

amrex::Gpu::Device::c_threads_and_blocks
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1201

amrex::Gpu::Device::n_threads_and_blocks
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1186

amrex::Gpu::Device::warp_size
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236

amrex::Gpu::Device::c_comps_threads_and_blocks
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1193

amrex::Gpu::Device::grid_stride_threads_and_blocks
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1259

amrex::IntVectND< 3 >

amrex::Vector
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:29

amrex::Long
amrex_long Long
Definition AMReX_INT.H:30

amrex::ParallelForOMP
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:328

amrex::ubound
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1364

amrex::lbound
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1350

amrex::surroundingNodes
__host__ __device__ BoxND< dim > surroundingNodes(const BoxND< dim > &b, int dir) noexcept
Return a BoxND with NODE based coordinates in direction dir that encloses BoxND b.
Definition AMReX_Box.H:1582

amrex::The_Pinned_Arena
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860

amrex::The_Arena
Arena * The_Arena()
Definition AMReX_Arena.cpp:820

amrex::Gpu::getThreadBox
__host__ __device__ Box getThreadBox(const Box &bx, Long offset) noexcept
Definition AMReX_GpuLaunch.H:99

amrex::Gpu::makeNExecutionConfigs
Vector< ExecConfig > makeNExecutionConfigs(Long N) noexcept
Definition AMReX_GpuLaunch.H:181

amrex::Gpu::numThreadsPerBlockParallelFor
constexpr std::size_t numThreadsPerBlockParallelFor()
Definition AMReX_GpuLaunch.H:90

amrex::Gpu::makeExecutionConfig
ExecutionConfig makeExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:153

amrex
Definition AMReX_Amr.cpp:50

amrex::launch_host
void launch_host(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:75

amrex::ignore_unused
__host__ __device__ void ignore_unused(const Ts &...)
No-op helper that marks variables as intentionally unused.
Definition AMReX.H:259

amrex::gpuStream_t
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79

amrex::ParallelFor
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202

amrex::Box
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:35

amrex::IntVect
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:38

amrex::Abort
void Abort(const std::string &msg)
Print a fatal-error message to stderr and abort execution.
Definition AMReX.cpp:241

amrex::int
const int[]
Definition AMReX_BLProfiler.cpp:1664

amrex::launch_global
__global__ void launch_global(L f0, Lambdas... fs)
Definition AMReX_GpuLaunch.H:68

amrex::get
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1334

amrex::Gpu::ExecConfig
Definition AMReX_GpuLaunch.H:175

amrex::Gpu::ExecConfig::nblocks
int nblocks
Definition AMReX_GpuLaunch.H:177

amrex::Gpu::ExecConfig::start_idx
Long start_idx
Definition AMReX_GpuLaunch.H:176

amrex::Gpu::ExecutionConfig
Definition AMReX_GpuLaunch.H:121

amrex::Gpu::ExecutionConfig::ExecutionConfig
ExecutionConfig(dim3 nb, dim3 nt, std::size_t sm=0) noexcept
Definition AMReX_GpuLaunch.H:143

amrex::Gpu::ExecutionConfig::ExecutionConfig
ExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:140

amrex::Gpu::ExecutionConfig::numBlocks
dim3 numBlocks
Definition AMReX_GpuLaunch.H:146

amrex::Gpu::ExecutionConfig::numThreads
dim3 numThreads
Definition AMReX_GpuLaunch.H:147

amrex::Gpu::ExecutionConfig::ExecutionConfig
ExecutionConfig(const Box &box, int comps) noexcept
Definition AMReX_GpuLaunch.H:136

amrex::Gpu::ExecutionConfig::ExecutionConfig
ExecutionConfig(const Box &box) noexcept
Definition AMReX_GpuLaunch.H:125

amrex::Gpu::ExecutionConfig::ExecutionConfig
ExecutionConfig() noexcept
Definition AMReX_GpuLaunch.H:122

amrex::Gpu::ExecutionConfig::sharedMem
std::size_t sharedMem
Definition AMReX_GpuLaunch.H:148