Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuLaunch.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_LAUNCH_H_
2#define AMREX_GPU_LAUNCH_H_
3#include <AMReX_Config.H>
4
7#include <AMReX_GpuControl.H>
8#include <AMReX_GpuTypes.H>
9#include <AMReX_GpuError.H>
10#include <AMReX_GpuRange.H>
11#include <AMReX_GpuDevice.H>
12#include <AMReX_GpuMemory.H>
13#include <AMReX_GpuReduce.H>
14#include <AMReX_Arena.H>
15#include <AMReX_Tuple.H>
16#include <AMReX_Box.H>
17#include <AMReX_Loop.H>
18#include <AMReX_Extension.H>
19#include <AMReX_BLassert.H>
20#include <AMReX_TypeTraits.H>
22#include <AMReX_RandomEngine.H>
23#include <AMReX_Algorithm.H>
24#include <AMReX_Math.H>
25#include <AMReX_Vector.H>
26#include <cstddef>
27#include <limits>
28#include <algorithm>
29#include <utility>
30
31#define AMREX_GPU_NCELLS_PER_THREAD 3
32#define AMREX_GPU_Y_STRIDE 1
33#define AMREX_GPU_Z_STRIDE 1
34
#ifdef AMREX_USE_CUDA
// CUDA: launch amrex::launch_global with the triple-chevron syntax.
// MT is forwarded as launch_global's first template argument (a
// max-threads-per-block bound).  NOTE(review): the launch-bounds overload of
// launch_global taking an int template parameter is not visible in this
// listing -- confirm against the repository source.
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
 amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
// Same, but without the explicit max-threads template argument.
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
 amrex::launch_global <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
#elif defined(AMREX_USE_HIP)
// HIP: equivalent launches via hipLaunchKernelGGL.
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
 hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
 hipLaunchKernelGGL(launch_global , blocks, threads, sharedMem, stream, __VA_ARGS__)
#endif
46
47
48namespace amrex {
49
50// We cannot take rvalue lambdas.
51// ************************************************
52// Variadic lambda function wrappers for C++ CUDA/HIP Kernel calls.
53
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
    // Kernel entry point: run the first functor, then the rest via
    // call_device.  Functors are taken by value because kernel arguments
    // must be copied from host to device.
    template<class L, class... Lambdas>
    AMREX_GPU_GLOBAL void launch_global (L f0, Lambdas... fs) { f0(); call_device(fs...); }

    // NOTE(review): a zero-argument call_device overload appears to have been
    // elided from this listing (Doxygen lines 57-58); it is needed when
    // launch_global is instantiated with a single functor -- confirm against
    // the repository source.

    // Base case: invoke a single functor on the device.
    template<class L>
    AMREX_GPU_DEVICE void call_device (L&& f0) noexcept { f0(); }

    // Recursive case: invoke each functor in order, left to right.
    template<class L, class... Lambdas>
    AMREX_GPU_DEVICE void call_device (L&& f0, Lambdas&&... fs) noexcept {
        f0();
        call_device(std::forward<Lambdas>(fs)...);
    }
#endif
69
70// CPU variation
71
72 template<class L>
73 void launch_host (L&& f0) noexcept { std::forward<L>(f0)(); }
74
75 template<class L, class... Lambdas>
76 void launch_host (L&& f0, Lambdas&&... fs) noexcept {
77 std::forward<L>(f0)();
78 launch_host(std::forward<Lambdas>(fs)...);
79 }
80
81
82 template <class T> class LayoutData;
83 class FabArrayBase;
84
85namespace Gpu {
86
#ifdef AMREX_USE_GPU
    // Threads per block used by the ParallelFor family of GPU launches.
    // AMREX_GPU_MAX_THREADS is a build-time configuration macro.
    constexpr std::size_t numThreadsPerBlockParallelFor () {
        return AMREX_GPU_MAX_THREADS;
    }
#else
    // CPU-only build: no GPU blocks exist, so report 0.
    constexpr std::size_t numThreadsPerBlockParallelFor () { return 0; }
#endif
94
96 inline
97 Box getThreadBox (const Box& bx, Long offset) noexcept
98 {
100 const auto len = bx.length3d();
101 Long k = offset / (len[0]*len[1]);
102 Long j = (offset - k*(len[0]*len[1])) / len[0];
103 Long i = (offset - k*(len[0]*len[1])) - j*len[0];
104 IntVect iv{AMREX_D_DECL(static_cast<int>(i),
105 static_cast<int>(j),
106 static_cast<int>(k))};
107 iv += bx.smallEnd();
108 return (bx & Box(iv,iv,bx.type()));
109 ))
112 return bx;
113 ))
114 }
115
116// ************************************************
117
118#ifdef AMREX_USE_GPU
    // ExecutionConfig: kernel launch geometry (grid size, block size, dynamic
    // shared memory).  NOTE(review): the Doxygen extraction elided several
    // lines of this struct -- its opening `struct ExecutionConfig {`, the
    // default constructor, the bodies of the Box constructors (which per the
    // member index call Gpu::Device::*_threads_and_blocks helpers), and the
    // numBlocks / numThreads data members.  Confirm against the repository
    // source before editing.
    ExecutionConfig (const Box& box) noexcept {
        // If we change this, we must make sure it doesn't break FabArrayUtility Reduce*,
        // which assumes the decomposition is 1D.
#if 0
        // Dead code, disabled via #if 0 (surrounding lines were elided).
        b -= box.smallEnd();
#endif
    }
    ExecutionConfig (const Box& box, int comps) noexcept {
        const Box& b = amrex::surroundingNodes(box);
    }
    // Directly specify blocks, threads and shared-memory bytes.
    ExecutionConfig (dim3 nb, dim3 nt, std::size_t sm=0) noexcept
        : numBlocks(nb), numThreads(nt), sharedMem(sm) {}

    std::size_t sharedMem = 0;   // dynamic shared memory per block, in bytes
    };
148
149 template <int MT>
152 {
153 ExecutionConfig ec(dim3{}, dim3{});
154 Long numBlocks = (std::max(N,Long(1)) + MT - 1) / MT;
155 // ensure that blockDim.x*gridDim.x does not overflow
156 numBlocks = std::min(numBlocks, Long(std::numeric_limits<unsigned int>::max()/MT));
157 // ensure that the maximum grid size of 2^31-1 won't be exceeded
158 numBlocks = std::min(numBlocks, Long(std::numeric_limits<int>::max()));
159 ec.numBlocks.x = numBlocks;
160 ec.numThreads.x = MT;
162 return ec;
163 }
164
165 template <int MT>
166 ExecutionConfig
167 makeExecutionConfig (const Box& box) noexcept
168 {
169 return makeExecutionConfig<MT>(box.numPts());
170 }
171
173 {
176 };
177
178 template <int MT>
180 {
181 // Max # of blocks in a kernel launch
182 int numblocks_max = std::numeric_limits<int>::max();
183 // Max # of threads in a kernel launch
184 Long nmax = Long(MT) * numblocks_max;
185 // # of launches needed for N elements without using grid-stride
186 // loops inside GPU kernels.
187 auto nlaunches = int((N+nmax-1)/nmax);
188 Vector<ExecConfig> r(nlaunches);
189 Long ndone = 0;
190 for (int i = 0; i < nlaunches; ++i) {
191 int nblocks;
192 if (N > nmax) {
193 nblocks = numblocks_max;
194 N -= nmax;
195 } else {
196 nblocks = int((N+MT-1)/MT);
197 }
198 // At which element ID the kernel should start
199 r[i].start_idx = ndone;
200 ndone += Long(nblocks) * MT;
201 // # of blocks in this launch
202 r[i].nblocks = nblocks;
203 }
204 return r;
205 }
206
    // Box overload: decompose a launch over all points of the box.
    // NOTE(review): the signature line (Doxygen line 208) was elided by the
    // extraction; judging by the <int MT, int dim> parameters it takes a
    // dim-dimensional box argument named `box` -- confirm against the
    // repository source.
    template <int MT, int dim>
    {
        return makeNExecutionConfigs<MT>(box.numPts());
    }
212#endif
213
214}
215}
216
217#ifdef AMREX_USE_SYCL
219namespace amrex::detail {
220
221 template <typename... L> constexpr bool is_big_kernel () {
222 return (sizeof(L) + ... + 0) > 1792;
223 }
224
225 template <typename... L>
226 struct SyclKernelDevPtr
227 {
228 using Ls = GpuTuple<L...>;
229 Ls* m_dp = nullptr;
230 Ls* m_hp = nullptr;
231 gpuStream_t m_stream;
232
233 SyclKernelDevPtr (L const&... f, gpuStream_t const& stream)
234 : m_stream(stream)
235 {
236 if constexpr (is_big_kernel<L...>()) {
237 std::size_t sz = sizeof(Ls);
238 m_dp = (Ls*)The_Arena()->alloc(sz);
239 m_hp = (Ls*)The_Pinned_Arena()->alloc(sz);
240 new (m_hp) Ls(f...);
241 auto* l_hp = (void const*)m_hp;
242 auto* l_dp = (void*)m_dp;
243 stream.queue->submit([&] (sycl::handler& h) {
244 h.memcpy(l_dp, l_hp, sz);
245 });
246 } else {
248 }
249 }
250
251 ~SyclKernelDevPtr ()
252 {
253 if constexpr (is_big_kernel<Ls>()) {
254 try {
255 m_stream.queue->wait_and_throw();
256 } catch (sycl::exception const& ex) {
257 if (m_dp) {
258 The_Arena()->free((void*)m_dp);
259 m_dp = nullptr;
260 }
261 if (m_hp) {
262 m_hp->~Ls();
263 The_Pinned_Arena()->free((void*)m_hp);
264 m_hp = nullptr;
265 }
266 amrex::Abort(std::string("~SyclKernelDevPtr: ")+ex.what());
267 }
268 if (m_dp) {
269 The_Arena()->free((void*)m_dp);
270 m_dp = nullptr;
271 }
272 if (m_hp) {
273 m_hp->~Ls();
274 The_Pinned_Arena()->free((void*)m_hp);
275 m_hp = nullptr;
276 }
277 }
278 }
279
280 template <int N>
281 auto get () const
282 {
283 using pointer_t = std::add_pointer_t<
284 std::add_const_t<typename std::tuple_element<N, Ls>::type>>;
285 if (m_hp && m_dp) {
286 std::ptrdiff_t offset = (char*)(&(amrex::get<N>(*m_hp))) - (char*)m_hp;
287 return pointer_t((char const*)m_dp + offset);
288 } else {
289 return pointer_t(nullptr);
290 }
291 }
292 };
293
294}
296#endif
297
298
299#ifdef AMREX_USE_GPU
302#else
305#include <AMReX_SIMD.H>
306#endif
308
310
312
313namespace amrex {
314
315#if defined(AMREX_USE_GPU) || !defined(AMREX_USE_OMP)
316
// Performance-portable launch: in GPU builds (or CPU builds without OpenMP)
// this simply forwards to the regular ParallelFor.
// M is a SFINAE guard restricting T to integral types.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
{
    ParallelFor(n, f);
}
330
// Box overload: forward to ParallelFor, which performs the (i,j,k) loop
// over the cells of box.
template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    ParallelFor(box, f);
}
344
// Box-plus-components overload: forward to ParallelFor, which loops over
// all cells of box for each of the ncomp components.
// M is a SFINAE guard restricting T to integral types.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    ParallelFor(box, ncomp, f);
}
358
359#else /* !defined(AMREX_USE_GPU) && defined(AMREX_USE_OMP) */
360
// OpenMP-threaded 1D loop: calls f(idx) for every idx in [0, n).
// M is a SFINAE guard restricting T to integral types.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (T n, L const& f) noexcept
{
#pragma omp parallel for
    for (T idx = 0; idx < n; ++idx) {
        f(idx);
    }
}
370
// OpenMP-threaded loop over all cells of box, calling f(i,j,k).  The
// outer dimension(s) are distributed across OpenMP threads.
// NOTE(review): the Doxygen extraction elided one line before each
// innermost i-loop (likely AMREX_PRAGMA_SIMD, which this file's member
// index references) -- confirm against the repository source.
template <typename L>
void ParallelForOMP (Box const& box, L const& f) noexcept
{
    auto lo = amrex::lbound(box);
    auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for
    for (int i = lo.x; i <= hi.x; ++i) {
        f(i,0,0);
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for
    for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            f(i,j,0);
        }
    }
#else
// collapse(2): spread the k*j iteration space across threads for better
// load balance on thin boxes.
#pragma omp parallel for collapse(2)
    for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                f(i,j,k);
            }
        }
    }
#endif
}
402
// OpenMP-threaded loop over all cells of box for each component n in
// [0, ncomp), calling f(i,j,k,n).  The component loop is collapsed with
// the outer spatial loop(s) for parallelism.
// M is a SFINAE guard restricting T to integral types.
// NOTE(review): the Doxygen extraction elided one line before each
// innermost i-loop (likely AMREX_PRAGMA_SIMD, which this file's member
// index references) -- confirm against the repository source.
template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
void ParallelForOMP (Box const& box, T ncomp, L const& f) noexcept
{
    auto lo = amrex::lbound(box);
    auto hi = amrex::ubound(box);
#if (AMREX_SPACEDIM == 1)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int i = lo.x; i <= hi.x; ++i) {
            f(i,0,0,n);
        }
    }
#elif (AMREX_SPACEDIM == 2)
#pragma omp parallel for collapse(2)
    for (T n = 0; n < ncomp; ++n) {
        for (int j = lo.y; j <= hi.y; ++j) {
            for (int i = lo.x; i <= hi.x; ++i) {
                f(i,j,0,n);
            }
        }
    }
#else
#pragma omp parallel for collapse(3)
    for (T n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
            for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    f(i,j,k,n);
                }
            }
        }
    }
#endif
}
440
441#endif
442
443}
444
445#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_ATTRIBUTE_FLATTEN_FOR
Definition AMReX_Extension.H:151
#define AMREX_GPU_Z_STRIDE
Definition AMReX_GpuLaunch.H:33
#define AMREX_GPU_NCELLS_PER_THREAD
Definition AMReX_GpuLaunch.H:31
#define AMREX_GPU_Y_STRIDE
Definition AMReX_GpuLaunch.H:32
#define AMREX_IF_ON_DEVICE(CODE)
Definition AMReX_GpuQualifiers.H:56
#define AMREX_GPU_GLOBAL
Definition AMReX_GpuQualifiers.H:19
#define AMREX_IF_ON_HOST(CODE)
Definition AMReX_GpuQualifiers.H:58
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
#define AMREX_D_DECL(a, b, c)
Definition AMReX_SPACE.H:171
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ const int * hiVect() const &noexcept
Return a constant pointer to the array of high end coordinates. Useful for calls to FORTRAN.
Definition AMReX_Box.H:191
__host__ __device__ const int * loVect() const &noexcept
Return a constant pointer to the array of low end coordinates. Useful for calls to FORTRAN.
Definition AMReX_Box.H:186
__host__ __device__ BoxND & coarsen(int ref_ratio) noexcept
Coarsen BoxND by given (positive) refinement ratio. NOTE: if type(dir) = CELL centered: lo <- lo/rati...
Definition AMReX_Box.H:722
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:111
static void c_threads_and_blocks(const int *lo, const int *hi, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1205
static void n_threads_and_blocks(const Long N, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1190
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
static void c_comps_threads_and_blocks(const int *lo, const int *hi, const int comps, dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1197
static void grid_stride_threads_and_blocks(dim3 &numBlocks, dim3 &numThreads) noexcept
Definition AMReX_GpuDevice.cpp:1263
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:326
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1331
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1317
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
__host__ __device__ Box getThreadBox(const Box &bx, Long offset) noexcept
Definition AMReX_GpuLaunch.H:97
Vector< ExecConfig > makeNExecutionConfigs(Long N) noexcept
Definition AMReX_GpuLaunch.H:179
constexpr std::size_t numThreadsPerBlockParallelFor()
Definition AMReX_GpuLaunch.H:88
ExecutionConfig makeExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:151
Definition AMReX_Amr.cpp:49
void launch_host(L &&f0) noexcept
Definition AMReX_GpuLaunch.H:73
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
__host__ __device__ BoxND< dim > surroundingNodes(const BoxND< dim > &b, int dir) noexcept
Return a BoxND with NODE based coordinates in direction dir that encloses BoxND b....
Definition AMReX_Box.H:1522
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:83
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
__global__ void launch_global(L f0, Lambdas... fs)
Definition AMReX_GpuLaunch.H:56
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1335
Definition AMReX_GpuLaunch.H:173
int nblocks
Definition AMReX_GpuLaunch.H:175
Long start_idx
Definition AMReX_GpuLaunch.H:174
Definition AMReX_GpuLaunch.H:119
ExecutionConfig(dim3 nb, dim3 nt, std::size_t sm=0) noexcept
Definition AMReX_GpuLaunch.H:141
ExecutionConfig(Long N) noexcept
Definition AMReX_GpuLaunch.H:138
dim3 numBlocks
Definition AMReX_GpuLaunch.H:144
dim3 numThreads
Definition AMReX_GpuLaunch.H:145
ExecutionConfig(const Box &box, int comps) noexcept
Definition AMReX_GpuLaunch.H:134
ExecutionConfig(const Box &box) noexcept
Definition AMReX_GpuLaunch.H:123
ExecutionConfig() noexcept
Definition AMReX_GpuLaunch.H:120
std::size_t sharedMem
Definition AMReX_GpuLaunch.H:146