#include <AMReX_Config.H>
namespace Reduce::detail {

    // Combine component I of tuple s into tuple d using reduce operation P,
    // cooperatively across the thread block (the overloads taking a
    // Gpu::Handler are used by the SYCL backend).
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s, Gpu::Handler const& h)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
        for_each_parallel<I+1, T, P1, Ps...>(d, s, h);
    }

    template <std::size_t I, typename T, typename P>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& d, T const& s)
    {
        P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_parallel<I+1, T, P1, Ps...>(d, s);
    }

    // Combine component I of tuple s into tuple d within a single thread.
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& d, T const& s)
    {
        P().local_update(amrex::get<I>(d), amrex::get<I>(s));
        for_each_local<I+1, T, P1, Ps...>(d, s);
    }

    // Set component I of tuple t to the identity element of operation P.
    template <std::size_t I, typename T, typename P>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
    }

    template <std::size_t I, typename T, typename P, typename P1, typename... Ps>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& t)
    {
        P().init(amrex::get<I>(t));
        for_each_init<I+1, T, P1, Ps...>(t);
    }

} // namespace Reduce::detail
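// Illustrative note (added; not part of the original header): for a tuple
// GpuTuple<Real,Real> reduced with <ReduceOpSum,ReduceOpMin>, the recursion
// above unrolls to
//
//     ReduceOpSum().local_update(amrex::get<0>(d), amrex::get<0>(s));
//     ReduceOpMin().local_update(amrex::get<1>(d), amrex::get<1>(s));
//
// i.e. tuple component I is always combined with the I-th reduce operation.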
struct ReduceOpSum
{
    // Block-wide update: reduce s across the block, then fold the block
    // result into d from thread 0 (the Gpu::Handler overload is the SYCL path).
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceSum(s, h);
        if (h.threadIdx() == 0) { d += r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceSum<MT>(s);
        if (threadIdx.x == 0) { d += r; }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d += s; }

    template <typename T>
    constexpr void init (T& t) const noexcept { t = 0; }
};

struct ReduceOpMin
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMin(s, h);
        if (h.threadIdx() == 0) { d = amrex::min(d, r); }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMin<MT>(s);
        if (threadIdx.x == 0) { d = amrex::min(d, r); }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::min(d, s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::max(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::max(); }
};

struct ReduceOpMax
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceMax(s, h);
        if (h.threadIdx() == 0) { d = amrex::max(d, r); }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& d, T const& s) const noexcept {
        T r = Gpu::blockReduceMax<MT>(s);
        if (threadIdx.x == 0) { d = amrex::max(d, r); }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& d, T const& s) const noexcept { d = amrex::max(d, s); }

    template <typename T>
    constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); }

    template <typename T>
    constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized>
    init (T& t) const noexcept { t = T::lowest(); }
};

struct ReduceOpLogicalAnd
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalAnd(s,h);
        if (h.threadIdx() == 0) { d = d && r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalAnd<MT>(s);
        if (threadIdx.x == 0) { d = d && r; }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d && s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = true; }
};

struct ReduceOpLogicalOr
{
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s, Gpu::Handler const& h) const noexcept {
        T r = Gpu::blockReduceLogicalOr(s,h);
        if (h.threadIdx() == 0) { d = d || r; }
    }

    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    parallel_update (T& d, T s) const noexcept {
        T r = Gpu::blockReduceLogicalOr<MT>(s);
        if (threadIdx.x == 0) { d = d || r; }
    }

    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    std::enable_if_t<std::is_integral_v<T>>
    local_update (T& d, T s) const noexcept { d = d || s; }

    template <typename T>
    constexpr std::enable_if_t<std::is_integral_v<T>>
    init (T& t) const noexcept { t = false; }
};
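// Illustrative note (added; not part of the original header): init() writes
// the identity element of each operation, e.g.
//
//     Real x; ReduceOpMin().init(x);          // x == std::numeric_limits<Real>::max()
//     Real y; ReduceOpMax().init(y);          // y == std::numeric_limits<Real>::lowest()
//     int  b; ReduceOpLogicalAnd().init(b);   // b == 1 (true)
//
// so a freshly initialized tuple can absorb partial results via
// local_update()/parallel_update() without biasing the reduction.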
template <typename... Ps> class ReduceOps;
template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
    // (member-initializer list allocating the pinned host tuple and the
    //  per-stream device tuples is not shown here)
    {
        reduce_op.resetResultReadiness();
        static_assert(std::is_trivially_copyable<Type>(),
                      "ReduceData::Type must be trivially copyable");
        static_assert(std::is_trivially_destructible<Type>(),
                      "ReduceData::Type must be trivially destructible");
        new (m_host_tuple) Type();
    }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    // (devicePtr()/hostPtr() accessors, maxBlocks(), nBlocks() and the
    //  per-stream bookkeeping are not shown here)

private:
    int m_max_stream_index = 0;
    Type* m_host_tuple = nullptr;
    Type* m_device_tuple = nullptr;
};
namespace Reduce::detail {

    // Invoke the user functor either cell-wise, f(i,j,k), or box-wise,
    // f(Box), depending on which call is well formed.
    template <typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f (F const& f, int i, int j, int k, IndexType)
        noexcept -> decltype(f(0,0,0))
    {
        return f(i,j,k);
    }

    template <typename F>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f (F const& f, int i, int j, int k, IndexType t)
        noexcept -> decltype(f(Box()))
    {
        return f(Box(IntVect(AMREX_D_DECL(i,j,k)),
                     IntVect(AMREX_D_DECL(i,j,k)),
                     t));
    }

    // Tag types selecting whether eval_mf iterates over cells only or over
    // cells and components.
    struct iterate_box {};
    struct iterate_box_comp {};

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box,I>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept
    {
        auto const& pr = f(ibox,i,j,k);
        Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
    }

    template <typename I, typename F, typename T, typename... Ps,
              std::enable_if_t<std::is_same<iterate_box_comp,I>::value, int> = 0>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept
    {
        for (int n = 0; n < ncomp; ++n) {
            auto const& pr = f(ibox,i,j,k,n);
            Reduce::detail::for_each_local<0, T, Ps...>(r, pr);
        }
    }

} // namespace Reduce::detail
template <typename... Ps>
class ReduceOps
{
public:

    // Fused reduction over all local boxes of a FabArray: each GPU block
    // strides over (box, cell-block) pairs and accumulates into its slot of
    // the per-stream ReduceData buffer.
    template <typename I, typename MF, typename D, typename F>
    void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();

        auto const& parforinfo = mf.getParForInfo(nghost);
        auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
        AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
        const int nblocks = nblocks_per_box * nboxes;
        const BoxIndexer* dp_boxes = parforinfo.getBoxes();

        auto const& stream = Gpu::gpuStream();
        auto pdst = reduce_data.devicePtr(stream);
        int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());
        AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));
        reduce_data.nBlocks(stream) = nblocks_ec;
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
#else
        amrex::launch_global<AMREX_GPU_MAX_THREADS>
            <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
            ([=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = pdst[blockIdx.x];
            if (threadIdx.x == 0) {
                dst = r;
            }
            for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                int ibox = iblock / nblocks_per_box;
                auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
                BoxIndexer const& indexer = dp_boxes[ibox];
                if (icell < indexer.numPts()) {
                    auto [i, j, k] = indexer(icell);
                    Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>
                        (f, ibox, i, j, k, ncomp, r);
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, D & reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
                {
                    return f(li, i, j, k);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box{},
                    mf, nghost, 0, reduce_data, std::forward<F>(f));
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value
#ifndef AMREX_USE_CUDA
                     && IsCallable<F, int, int, int, int, int>::value
#endif
                     >
    eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        const int nboxes = mf.local_size();
        if (nboxes == 0) {
            return;
        } else if (!mf.isFusingCandidate()) {
            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
                Box const& b = amrex::grow(mfi.validbox(), nghost);
                const int li = mfi.LocalIndex();
                this->eval(b, ncomp, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) -> ReduceTuple
                {
                    return f(li, i, j, k, n);
                });
            }
        } else {
            eval_mf(Reduce::detail::iterate_box_comp{},
                    mf, nghost, ncomp, reduce_data, std::forward<F>(f));
        }
    }
    template <typename D, typename F>
    void eval (Box const& box, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        int ncells = box.numPts();
        const auto lo  = amrex::lbound(box);
        const auto len = amrex::length(box);
        const auto lenxy = len.x*len.y;
        const auto lenx  = len.x;
        auto ixtype = box.ixType();
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            // Only initialize this block's slot if no previous launch on this
            // stream has already written to it.
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 icell < ncells; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                auto pr = Reduce::detail::call_f(f,i,j,k,ixtype);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        int ncells = box.numPts();
        const auto lo  = amrex::lbound(box);
        const auto len = amrex::length(box);
        const auto lenxy = len.x*len.y;
        const auto lenx  = len.x;
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 icell < ncells; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                for (N n = 0; n < ncomp; ++n) {
                    auto pr = f(i,j,k,n);
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                }
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }
    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D & reduce_data, F const& f)
    {
        if (n <= 0) { return; }
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        int& nblocks = reduce_data.nBlocks(stream);
        constexpr int nitems_per_thread = 4;
        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
        nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 i < n; i += stride) {
                auto pr = f(i);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
            }
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        nblocks = std::max(nblocks, nblocks_ec);
    }
    // Combine the per-block, per-stream partial results and return the final
    // tuple.  The result is cached so value() can be called repeatedly.
    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto hp = reduce_data.hostPtr();

        if (m_result_is_ready) {
            return *hp;
        }

        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr();
        auto const& nblocks = reduce_data.nBlocks();
#if defined(AMREX_USE_SYCL)
        if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {
            const int N = nblocks[0];
            if (N == 0) {
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);
            } else {
                // (declaration of the host staging buffer `tmp` with N
                //  elements is not shown here)
                Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);
                Gpu::streamSynchronize();
                for (int i = 1; i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);
                }
                *hp = tmp[0];
            }
        } else
#endif
        {
            int maxblocks = reduce_data.maxBlocks();
#ifdef AMREX_USE_SYCL
            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            // (declaration of the device staging buffer `dtmp` holding a
            //  single ReduceTuple is not shown here)
            auto presult = dtmp.data();
#else
            auto presult = hp;
#endif
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
                if (gh.threadIdx() == 0) { *presult = dst; }
            });
#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
            Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
#endif
#else
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
            [=] AMREX_GPU_DEVICE () noexcept
            {
                ReduceTuple r;
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                ReduceTuple dst = r;
                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                    auto dp_stream = dp+istream*maxblocks;
                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                         i < nblocks[istream]; i += stride) {
                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                    }
                }
                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
                if (threadIdx.x == 0) { *hp = dst; }
            });
#endif
            Gpu::streamSynchronize();
        }

        m_result_is_ready = true;
        return *hp;
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
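// Illustrative usage sketch (added; not part of the original header). The
// function name `sum_and_min_of_fab` and its arguments are hypothetical; the
// ReduceOps/ReduceData/eval/value calls follow the interface defined above.
inline std::pair<Real,Real>
sum_and_min_of_fab (Box const& bx, Array4<Real const> const& a)
{
    ReduceOps<ReduceOpSum, ReduceOpMin> reduce_op;
    ReduceData<Real, Real> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;

    reduce_op.eval(bx, reduce_data,
    [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
    {
        return {a(i,j,k), a(i,j,k)};   // component 0 summed, component 1 minimized
    });

    ReduceTuple hv = reduce_data.value(reduce_op);   // synchronizes the GPU stream
    return {amrex::get<0>(hv), amrex::get<1>(hv)};
}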
// Free functions in namespace Reduce (GPU implementations).
namespace Reduce {

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    ReduceOps<ReduceOpSum> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return amrex::get<0>(hv) + init_val;
}
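// Illustrative usage sketch (added; not part of the original header): `dv`
// is a hypothetical Gpu::DeviceVector<Real> already filled on the device;
// the pointer overload reads v[i] inside a GPU kernel, so v must be
// device-accessible.
//
//     Real const* p = dv.data();
//     Real total    = Reduce::Sum(dv.size(), p);
//     Real total_sq = Reduce::Sum<Real>(dv.size(),
//         [=] AMREX_GPU_DEVICE (Long i) { return p[i]*p[i]; });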
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    ReduceOps<ReduceOpMin> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::min(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return v[i]; });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    ReduceOps<ReduceOpMax> reduce_op;
    ReduceData<T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple { return f(i); });
    ReduceTuple hv = reduce_data.value(reduce_op);
    return std::max(amrex::get<0>(hv),init_val);
}
template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        return {v[i],v[i]};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> reduce_op;
    ReduceData<T,T> reduce_data(reduce_op);
    using ReduceTuple = typename decltype(reduce_data)::Type;
    reduce_op.eval(n, reduce_data, [=] AMREX_GPU_DEVICE (N i) -> ReduceTuple {
        T tmp = f(i);
        return {tmp,tmp};
    });
    auto hv = reduce_data.value(reduce_op);
    return std::make_pair(amrex::get<0>(hv), amrex::get<1>(hv));
}
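// Illustrative usage sketch (added; not part of the original header): `p` is
// a hypothetical device pointer holding n valid elements.
//
//     std::pair<Real,Real> mm = Reduce::MinMax(n, p);
//     // mm.first == minimum, mm.second == maximum over p[0..n)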
template <typename T, typename N, typename P,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    Gpu::DeviceScalar<int> ds(0);
    int* dp = ds.dataPtr();
    auto ec = Gpu::ExecutionConfig(n);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
    {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        // (work-group barrier not shown here)

        if (!(*has_any))
        {
            int r = false;
            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 i < n && !r; i += stride)
            {
                r = pred(v[i]) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) *dp = 1;
        }
    });
#endif
    return ds.dataValue();
}
template <typename P>
bool AnyOf (Box const& box, P const& pred)
{
    Gpu::LaunchSafeGuard lsg(true);
    Gpu::DeviceScalar<int> ds(0);
    int* dp = ds.dataPtr();
    int ncells = box.numPts();
    const auto lo  = amrex::lbound(box);
    const auto len = amrex::length(box);
    const auto lenxy = len.x*len.y;
    const auto lenx  = len.x;
    auto ec = Gpu::ExecutionConfig(ncells);
    ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
#ifdef AMREX_USE_SYCL
    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
    {
        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
        if (gh.threadIdx() == 0) { *has_any = *dp; }
        // (work-group barrier not shown here)

        if (!(*has_any))
        {
            int r = false;
            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                 icell < ncells && !r; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                r = pred(i,j,k) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && r) { *dp = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept
    {
        __shared__ int has_any;
        if (threadIdx.x == 0) { has_any = *dp; }
        __syncthreads();

        if (!has_any)
        {
            int r = false;
            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                 icell < ncells && !r; icell += stride) {
                int k =  icell /   lenxy;
                int j = (icell - k*lenxy) / lenx;
                int i = (icell - k*lenxy) - j*lenx;
                i += lo.x;
                j += lo.y;
                k += lo.z;
                r = pred(i,j,k) ? 1 : 0;
            }
            r = Gpu::blockReduce<Gpu::Device::warp_size>
                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && r) *dp = 1;
        }
    });
#endif
    return ds.dataValue();
}

} // namespace Reduce
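// Illustrative usage sketch (added; not part of the original header): `a` is
// a hypothetical Array4<Real const> and `bx` a Box; the predicate is
// evaluated on the device.
//
//     bool has_negative = Reduce::AnyOf(bx,
//         [=] AMREX_GPU_DEVICE (int i, int j, int k) {
//             return a(i,j,k) < Real(0.0);
//         });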
// Host (non-GPU) implementations of ReduceData, ReduceOps and the Reduce::
// free functions follow.

template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_tuple(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        for (auto& t : m_tuple) {
            Reduce::detail::for_each_init<0, Type, Ps...>(t);
        }
    }

    ~ReduceData () = default;
    ReduceData (ReduceData<Ts...> const&) = delete;
    ReduceData (ReduceData<Ts...> &&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    Type value () { return m_fn_value(); }

    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    Vector<Type>& reference () { return m_tuple; }

    Type& reference (int tid)
    {
        if (m_tuple.size() == 1) {
            // No OpenMP, or already inside a parallel region when this
            // ReduceData was constructed.
            return m_tuple[0];
        } else {
            return m_tuple[tid];
        }
    }

private:
    Vector<Type> m_tuple;
    std::function<Type()> m_fn_value;
};
template <typename... Ps>
class ReduceOps
{
private:

    // Apply f cell-wise, f(i,j,k), over the box ...
    template <typename D, typename F>
    AMREX_FORCE_INLINE
    static auto call_f (Box const& box, typename D::Type & r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(0,0,0))>,
                                                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        const auto lo = amrex::lbound(box);
        const auto hi = amrex::ubound(box);
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(i,j,k));
        }}}
    }

    // ... or box-wise, f(box), depending on which call is well formed.
    template <typename D, typename F>
    AMREX_FORCE_INLINE
    static auto call_f (Box const& box, typename D::Type & r, F const& f)
        noexcept -> std::enable_if_t<std::is_same_v<std::decay_t<decltype(f(Box()))>,
                                                    typename D::Type>>
    {
        using ReduceTuple = typename D::Type;
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));
    }

public:

    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        {
            ReduceTuple rr;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                const auto lo = amrex::lbound(b);
                const auto hi = amrex::ubound(b);
                for (int k = lo.z; k <= hi.z; ++k) {
                for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));
                }}}
            }
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
                reduce_data.reference(OpenMP::get_thread_num()), rr);
        }
    }
    template <typename MF, typename D, typename F>
    std::enable_if_t<IsFabArray<MF>::value && IsCallable<F, int, int, int, int, int>::value>
    eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
#ifdef AMREX_USE_OMP
#pragma omp parallel
#endif
        {
            ReduceTuple rr;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {
                Box const& b = mfi.growntilebox(nghost);
                const int li = mfi.LocalIndex();
                const auto lo = amrex::lbound(b);
                const auto hi = amrex::ubound(b);
                for (int n = 0; n < ncomp; ++n) {
                for (int k = lo.z; k <= hi.z; ++k) {
                for (int j = lo.y; j <= hi.y; ++j) {
                for (int i = lo.x; i <= hi.x; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));
                }}}}
            }
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
                reduce_data.reference(OpenMP::get_thread_num()), rr);
        }
    }

    template <typename D, typename F>
    void eval (Box const& box, D & reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        call_f<D>(box, rr, std::forward<F>(f));
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (Box const& box, N ncomp, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        const auto lo = amrex::lbound(box);
        const auto hi = amrex::ubound(box);
        for (N n = 0; n < ncomp; ++n) {
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i,j,k,n));
        }}}}
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }

    template <typename N, typename D, typename F,
              typename M=std::enable_if_t<std::is_integral_v<N>> >
    void eval (N n, D & reduce_data, F const& f)
    {
        using ReduceTuple = typename D::Type;
        ReduceTuple rr;
        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);
        for (N i = 0; i < n; ++i) {
            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));
        }
        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(
            reduce_data.reference(OpenMP::get_thread_num()), rr);
    }

    // Fold the per-thread tuples into slot 0 and return it; the result is
    // cached so value() can be called repeatedly.
    template <typename D>
    typename D::Type value (D & reduce_data)
    {
        auto& rrv = reduce_data.reference();
        if (! m_result_is_ready) {
            using ReduceTuple = typename D::Type;
            if (rrv.size() > 1) {
                for (int i = 1, N = rrv.size(); i < N; ++i) {
                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);
                }
            }
            m_result_is_ready = true;
        }
        return rrv[0];
    }

private:
    bool m_result_is_ready = false;

public:
    void resetResultReadiness () { m_result_is_ready = false; }
};
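// Illustrative usage sketch (added; not part of the original header): the
// host classes above expose the same interface as the GPU versions, so code
// written against ReduceOps/ReduceData compiles unchanged. `n` and `f` are
// hypothetical; f(i) returns a Real.
//
//     ReduceOps<ReduceOpSum> reduce_op;
//     ReduceData<Real> reduce_data(reduce_op);
//     using ReduceTuple = typename decltype(reduce_data)::Type;
//     reduce_op.eval(n, reduce_data, [=] (Long i) -> ReduceTuple { return f(i); });
//     Real total = amrex::get<0>(reduce_data.value(reduce_op));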
// Host (non-GPU) Reduce:: free functions.
namespace Reduce {

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, F const& f, T init_val = 0)
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(+:r)
#endif
    for (N i = 0; i < n; ++i) {
        r += f(i);
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Sum (N n, T const* v, T init_val = 0)
{
    return Sum(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::min(r,f(i));
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max())
{
    return Reduce::Min(n, [=] (N i) -> T { return v[i]; }, init_val);
}

template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest())
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(max:r)
#endif
    for (N i = 0; i < n; ++i) {
        r = std::max(r,f(i));
    }
    return r;
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest())
{
    return Reduce::Max(n, [=] (N i) -> T { return v[i]; }, init_val);
}
template <typename T, typename N, typename F,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, F const& f)
{
    T r_min = std::numeric_limits<T>::max();
    T r_max = std::numeric_limits<T>::lowest();
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r_min) reduction(max:r_max)
#endif
    for (N i = 0; i < n; ++i) {
        T tmp = f(i);
        r_min = std::min(r_min,tmp);
        r_max = std::max(r_max,tmp);
    }
    return std::make_pair(r_min,r_max);
}

template <typename T, typename N, typename M=std::enable_if_t<std::is_integral_v<N>> >
std::pair<T,T> MinMax (N n, T const* v)
{
    return Reduce::MinMax<T>(n, [=] (N i) -> T { return v[i]; });
}
template <typename T, typename N, typename P,
          typename M=std::enable_if_t<std::is_integral_v<N>> >
bool AnyOf (N n, T const* v, P&& pred)
{
    return std::any_of(v, v+n, std::forward<P>(pred));
}

template <typename P>
bool AnyOf (Box const& box, P const& pred)
{
    const auto lo = amrex::lbound(box);
    const auto hi = amrex::ubound(box);
    for (int k = lo.z; k <= hi.z; ++k) {
    for (int j = lo.y; j <= hi.y; ++j) {
    for (int i = lo.x; i <= hi.x; ++i) {
        if (pred(i,j,k)) { return true; }
    }}}
    return false;
}

} // namespace Reduce
// Return a GpuTuple containing the identity element for each operation in
// ReduceOps, e.g. 0 for ReduceOpSum and the largest representable value for
// ReduceOpMin.
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}

// Overload taking the reduce operations as a TypeList.
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
{
    GpuTuple<Ts...> r{};
    Reduce::detail::for_each_init<0, decltype(r), Ps...>(r);
    return r;
}
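// Illustrative usage sketch (added; not part of the original header):
//
//     auto id = IdentityTuple(GpuTuple<Real,Real>{},
//                             ReduceOps<ReduceOpSum,ReduceOpMax>{});
//     // amrex::get<0>(id) == 0
//     // amrex::get<1>(id) == std::numeric_limits<Real>::lowest()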