docs_html/doxygen/AMReX__Reduce_8H_source.html

#ifndef AMREX_REDUCE_H_

#define AMREX_REDUCE_H_

#include <AMReX_Config.H>


#include <AMReX_Concepts.H>

#include <AMReX_Gpu.H>

#include <AMReX_Arena.H>

#include <AMReX_OpenMP.H>

#include <AMReX_MFIter.H>

#include <AMReX_TypeList.H>

#include <AMReX_ValLocPair.H>


#include <algorithm>

#include <concepts>

#include <functional>

#include <limits>


namespace amrex {


namespace Reduce {


// The declarations of these functions are here to work around doxygen issues.
// NOTE(review): the definitions visible later in this file sit inside
// `#ifdef AMREX_USE_GPU`; definitions for other builds are not visible here.
// In those GPU definitions `v[i]` / `f(i)` are evaluated inside an
// AMREX_GPU_DEVICE lambda, so `v` presumably has to be device-accessible.


/** Reduce v[i] for i in [0,n) with ReduceOpSum and return the result plus init_val. */
template <typename T, std::integral N>

T Sum (N n, T const* v, T init_val = 0);


/**
 * Reduce f(i) for i in [0,n) with ReduceOpSum and return the result plus init_val.
 * The constraint keeps this overload from matching when F decays to T*.
 */
template <typename T, std::integral N, typename F>

requires (!std::same_as<T*,std::decay_t<F>>)

T Sum (N n, F const& f, T init_val = 0);


/** Reduce v[i] for i in [0,n) with ReduceOpMin and return std::min(result, init_val). */
template <typename T, std::integral N>

T Min (N n, T const* v, T init_val = std::numeric_limits<T>::max());


/** Reduce f(i) for i in [0,n) with ReduceOpMin and return std::min(result, init_val). */
template <typename T, std::integral N, typename F>

requires (!std::same_as<T*,std::decay_t<F>>)

T Min (N n, F const& f, T init_val = std::numeric_limits<T>::max());


/** Reduce v[i] for i in [0,n) with ReduceOpMax and return std::max(result, init_val). */
template <typename T, std::integral N>

T Max (N n, T const* v, T init_val = std::numeric_limits<T>::lowest());


/** Reduce f(i) for i in [0,n) with ReduceOpMax and return std::max(result, init_val). */
template <typename T, std::integral N, typename F>

requires (!std::same_as<T*,std::decay_t<F>>)

T Max (N n, F const& f, T init_val = std::numeric_limits<T>::lowest());


/** Return {min, max} of v[i] for i in [0,n), computed in a single reduction pass. */
template <typename T, std::integral N>

std::pair<T,T> MinMax (N n, T const* v);


/** Return {min, max} of f(i) for i in [0,n); f(i) is evaluated once per index. */
template <typename T, std::integral N, typename F>

requires (!std::same_as<T*,std::decay_t<F>>)

std::pair<T,T> MinMax (N n, F const& f);


/**
 * Predicate test over v[0..n). The visible part of the GPU definition
 * evaluates pred(v[i]); presumably returns true if any element satisfies
 * pred -- confirm against the full definition.
 */
template <typename T, std::integral N, typename P>

bool AnyOf (N n, T const* v, P const& pred);


/**
 * Predicate test over the cells of a box.
 * NOTE(review): the definition is outside this view; the exact arguments
 * passed to pred are not documented here.
 */
template <typename P, int dim>

bool AnyOf (BoxND<dim> const& box, P const& pred);


}


namespace Reduce::detail {

// Helpers that walk a tuple and a pack of reduce operators in lock step:
// element I of the tuple is handled by the I-th operator type.  Each helper
// comes as a terminating overload (one operator left) and a recursive
// overload (two or more operators left).

#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Last operator: combine element I of `partial` into `result` through
    // Op::parallel_update, using the SYCL handler.
    template <std::size_t I, typename T, typename Op>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& result, T const& partial, Gpu::Handler const& handler)
    {
        Op().parallel_update(amrex::get<I>(result), amrex::get<I>(partial), handler);
    }

    // Handle element I, then recurse on element I+1 with the remaining operators.
    template <std::size_t I, typename T, typename Op, typename Next, typename... Rest>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& result, T const& partial, Gpu::Handler const& handler)
    {
        Op().parallel_update(amrex::get<I>(result), amrex::get<I>(partial), handler);
        for_each_parallel<I+1,T,Next,Rest...>(result, partial, handler);
    }
#else
    // Last operator: combine element I of `partial` into `result` through
    // Op::parallel_update.
    template <std::size_t I, typename T, typename Op>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& result, T const& partial)
    {
        Op().parallel_update(amrex::get<I>(result), amrex::get<I>(partial));
    }

    // Handle element I, then recurse on element I+1 with the remaining operators.
    template <std::size_t I, typename T, typename Op, typename Next, typename... Rest>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void for_each_parallel (T& result, T const& partial)
    {
        Op().parallel_update(amrex::get<I>(result), amrex::get<I>(partial));
        for_each_parallel<I+1,T,Next,Rest...>(result, partial);
    }
#endif
#endif

    // Last operator: fold element I of `value` into `accum` via Op::local_update.
    template <std::size_t I, typename T, typename Op>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& accum, T const& value)
    {
        Op().local_update(amrex::get<I>(accum), amrex::get<I>(value));
    }

    // Fold element I, then recurse on element I+1 with the remaining operators.
    template <std::size_t I, typename T, typename Op, typename Next, typename... Rest>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void for_each_local (T& accum, T const& value)
    {
        Op().local_update(amrex::get<I>(accum), amrex::get<I>(value));
        for_each_local<I+1,T,Next,Rest...>(accum, value);
    }

    // Last operator: set element I of `tuple` to Op's initial value.
    template <std::size_t I, typename T, typename Op>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& tuple)
    {
        Op().init(amrex::get<I>(tuple));
    }

    // Initialize element I, then recurse on element I+1 with the remaining operators.
    template <std::size_t I, typename T, typename Op, typename Next, typename... Rest>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    constexpr void for_each_init (T& tuple)
    {
        Op().init(amrex::get<I>(tuple));
        for_each_init<I+1,T,Next,Rest...>(tuple);
    }
}


// Reduce operator: addition.  init() yields 0, local_update() accumulates
// with +=, and parallel_update() adds a block-wide sum into the destination
// from thread 0 only.
struct ReduceOpSum
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-reduce `src` over the block and let thread 0 add it to `dst`.
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T const& src, Gpu::Handler const& handler) const noexcept
    {
        T block_total = Gpu::blockReduceSum(src,handler);
        if (handler.threadIdx() == 0) {
            dst += block_total;
        }
    }
#else
    // Block-reduce `src` over the block and let thread 0 add it to `dst`.
    // MT is forwarded to Gpu::blockReduceSum as its template argument.
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T const& src) const noexcept
    {
        T block_total = Gpu::blockReduceSum<MT>(src);
        if (threadIdx.x == 0) {
            dst += block_total;
        }
    }
#endif
#endif

    // Per-thread accumulation.
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& dst, T const& src) const noexcept
    {
        dst += src;
    }

    // Identity element of the sum.
    template <typename T>
    constexpr void init (T& value) const noexcept
    {
        value = 0;
    }
};


// Reduce operator: minimum.  init() yields the largest value of T,
// local_update() keeps the smaller of the two values, and parallel_update()
// folds a block-wide minimum into the destination from thread 0 only.
struct ReduceOpMin
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-wide minimum of `src`; thread 0 merges it into `dst`.
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T const& src, Gpu::Handler const& handler) const noexcept
    {
        T block_min = Gpu::blockReduceMin(src,handler);
        if (handler.threadIdx() == 0) {
            dst = amrex::min(dst,block_min);
        }
    }
#else
    // Block-wide minimum of `src`; thread 0 merges it into `dst`.
    // MT is forwarded to Gpu::blockReduceMin as its template argument.
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T const& src) const noexcept
    {
        T block_min = Gpu::blockReduceMin<MT>(src);
        if (threadIdx.x == 0) {
            dst = amrex::min(dst,block_min);
        }
    }
#endif
#endif

    // Per-thread accumulation.
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& dst, T const& src) const noexcept
    {
        dst = amrex::min(dst,src);
    }

    // Types known to std::numeric_limits start from numeric_limits<T>::max().
    template <typename T>
    requires (std::numeric_limits<T>::is_specialized)
    constexpr void init (T& value) const noexcept
    {
        value = std::numeric_limits<T>::max();
    }

    // Other types must supply a static T::max().
    template <typename T>
    requires (!std::numeric_limits<T>::is_specialized)
    constexpr void init (T& value) const noexcept
    {
        value = T::max();
    }
};


// Reduce operator: maximum.  init() yields the lowest value of T,
// local_update() keeps the larger of the two values, and parallel_update()
// folds a block-wide maximum into the destination from thread 0 only.
struct ReduceOpMax
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-wide maximum of `src`; thread 0 merges it into `dst`.
    template <typename T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T const& src, Gpu::Handler const& handler) const noexcept
    {
        T block_max = Gpu::blockReduceMax(src,handler);
        if (handler.threadIdx() == 0) {
            dst = amrex::max(dst,block_max);
        }
    }
#else
    // Block-wide maximum of `src`; thread 0 merges it into `dst`.
    // MT is forwarded to Gpu::blockReduceMax as its template argument.
    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T const& src) const noexcept
    {
        T block_max = Gpu::blockReduceMax<MT>(src);
        if (threadIdx.x == 0) {
            dst = amrex::max(dst,block_max);
        }
    }
#endif
#endif

    // Per-thread accumulation.
    template <typename T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& dst, T const& src) const noexcept
    {
        dst = amrex::max(dst,src);
    }

    // Types known to std::numeric_limits start from numeric_limits<T>::lowest().
    template <typename T>
    requires (std::numeric_limits<T>::is_specialized)
    constexpr void init (T& value) const noexcept
    {
        value = std::numeric_limits<T>::lowest();
    }

    // Other types must supply a static T::lowest().
    template <typename T>
    requires (!std::numeric_limits<T>::is_specialized)
    constexpr void init (T& value) const noexcept
    {
        value = T::lowest();
    }
};


// Reduce operator: logical AND over integral values.  init() yields true,
// local_update() ANDs values together, and parallel_update() folds a
// block-wide AND into the destination from thread 0 only.
struct ReduceOpLogicalAnd
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-wide AND of `src`; thread 0 merges it into `dst`.
    template <std::integral T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T src, Gpu::Handler const& handler) const noexcept
    {
        T block_all = Gpu::blockReduceLogicalAnd(src,handler);
        if (handler.threadIdx() == 0) {
            dst = dst && block_all;
        }
    }
#else
    // Block-wide AND of `src`; thread 0 merges it into `dst`.
    // MT is forwarded to Gpu::blockReduceLogicalAnd as its template argument.
    template <std::integral T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T src) const noexcept
    {
        T block_all = Gpu::blockReduceLogicalAnd<MT>(src);
        if (threadIdx.x == 0) {
            dst = dst && block_all;
        }
    }
#endif
#endif

    // Per-thread accumulation.
    template <std::integral T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& dst, T src) const noexcept
    {
        dst = dst && src;
    }

    // Identity element of AND.
    template <std::integral T>
    constexpr void init (T& value) const noexcept
    {
        value = true;
    }
};


// Reduce operator: logical OR over integral values.  init() yields false,
// local_update() ORs values together, and parallel_update() folds a
// block-wide OR into the destination from thread 0 only.
struct ReduceOpLogicalOr
{
#ifdef AMREX_USE_GPU
#ifdef AMREX_USE_SYCL
    // Block-wide OR of `src`; thread 0 merges it into `dst`.
    template <std::integral T>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T src, Gpu::Handler const& handler) const noexcept
    {
        T block_any = Gpu::blockReduceLogicalOr(src,handler);
        if (handler.threadIdx() == 0) {
            dst = dst || block_any;
        }
    }
#else
    // Block-wide OR of `src`; thread 0 merges it into `dst`.
    // MT is forwarded to Gpu::blockReduceLogicalOr as its template argument.
    template <std::integral T, int MT=AMREX_GPU_MAX_THREADS>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    void parallel_update (T& dst, T src) const noexcept
    {
        T block_any = Gpu::blockReduceLogicalOr<MT>(src);
        if (threadIdx.x == 0) {
            dst = dst || block_any;
        }
    }
#endif
#endif

    // Per-thread accumulation.
    template <std::integral T>
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void local_update (T& dst, T src) const noexcept
    {
        dst = dst || src;
    }

    // Identity element of OR.
    template <std::integral T>
    constexpr void init (T& value) const noexcept
    {
        value = false;
    }
};


template <typename... Ps> class ReduceOps;


#ifdef AMREX_USE_GPU


/**
 * Storage used by ReduceOps for a reduction over a tuple of values (GPU build).
 *
 * Holds one result tuple allocated from The_Pinned_Arena() and an array of
 * AMREX_GPU_MAX_STREAMS * maxBlocks() partial-result tuples allocated from
 * The_Arena(); each stream index owns a contiguous run of maxBlocks() tuples.
 * The object is neither copyable nor movable.
 *
 * The constructor stores a callback that captures the ReduceOps object by
 * reference, so that object must still be alive whenever the no-argument
 * value() is called.
 */
template <typename... Ts>


class ReduceData

{

public:

    // Tuple type holding one value per reduced quantity.
    using Type = GpuTuple<Ts...>;


    /**
     * Allocate the host and device buffers and bind this object to reduce_op.
     * Also clears reduce_op's "result is ready" flag and zeroes the
     * per-stream block counters.
     */
    template <typename... Ps>


    explicit ReduceData (ReduceOps<Ps...>& reduce_op)

        : m_max_blocks(Gpu::Device::maxBlocksPerLaunch()),

          m_host_tuple((Type*)(The_Pinned_Arena()->alloc(sizeof(Type)))),

          // m_max_blocks is declared before m_device_tuple, so it is already
          // initialized when used here.
          m_device_tuple((Type*)(The_Arena()->alloc((AMREX_GPU_MAX_STREAMS)

                                                    * m_max_blocks * sizeof(Type)))),

          // Captures reduce_op by reference: it must outlive calls to value().
          m_fn_value([&reduce_op,this] () -> Type { return this->value(reduce_op); })

    {

        reduce_op.resetResultReadiness();

        static_assert(std::is_trivially_copyable<Type>(),

                      "ReduceData::Type must be trivially copyable");

        static_assert(std::is_trivially_destructible<Type>(),

                      "ReduceData::Type must be trivially destructible");


        // The arena returns raw memory; construct the host tuple in place.
        new (m_host_tuple) Type();

        m_nblocks.fill(0);

    }


    /**
     * Free both buffers. Asserts that value() was called if the object was
     * ever used while an external GPU stream was active.
     */
    ~ReduceData () {

        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(

            !m_used_external_stream || m_value_called,

            "ReduceData used on an external GPU stream must call value() before destruction.");

        The_Pinned_Arena()->free(m_host_tuple);

        The_Arena()->free(m_device_tuple);

    }


    ReduceData (ReduceData<Ts...> const&) = delete;

    ReduceData (ReduceData<Ts...> &&) = delete;

    void operator= (ReduceData<Ts...> const&) = delete;

    void operator= (ReduceData<Ts...> &&) = delete;


    /// Final result, obtained through the ReduceOps bound at construction.
    Type value ()

    {

        Type r = m_fn_value();

        m_value_called = true;

        return r;

    }


    /// Final result, obtained through the given reduce_op.
    template <typename... Ps>


    Type value (ReduceOps<Ps...> & reduce_op)

    {

        Type r = reduce_op.value(*this);

        m_value_called = true;

        return r;

    }


    /// Start of the whole partial-result buffer (the run for stream index 0).
    Type* devicePtr () { return m_device_tuple; }


    /// Start of the partial-result run belonging to stream s.
    Type* devicePtr (gpuStream_t const& s) {

        return m_device_tuple+streamIndexChecked(s)*m_max_blocks;

    }


    /// Pointer to the host result tuple.
    Type* hostPtr () { return m_host_tuple; }


    /// Per-stream-index count of partial-result slots in use.
    GpuArray<int,AMREX_GPU_MAX_STREAMS>& nBlocks () { return m_nblocks; }

    /// Slot count for stream s.
    int& nBlocks (gpuStream_t const& s) { return m_nblocks[streamIndexChecked(s)]; }


    /// Number of tuples reserved per stream index.
    int maxBlocks () const { return m_max_blocks; }


    /// Largest stream index recorded via updateMaxStreamIndex().
    int maxStreamIndex () const { return m_max_stream_index; }


    /// Record that stream s was used.
    void updateMaxStreamIndex (gpuStream_t const& s) {

        m_max_stream_index = std::max(m_max_stream_index,streamIndexChecked(s));

    }


    /// Mark that the result has been retrieved (checked by the destructor).
    void markValueCalled () noexcept { m_value_called = true; }


private:

    /**
     * Return Gpu::Device::streamIndex(s) and update the stream bookkeeping.
     * Remembers whether an external stream was ever in use, and asserts that
     * every stream mapping to index 0 is the same stream as the first one
     * seen (see the assertion message for the rationale).
     */
    int streamIndexChecked (gpuStream_t const& s)

    {

        int const idx = Gpu::Device::streamIndex(s);

        m_used_external_stream = m_used_external_stream || Gpu::Device::usingExternalStream();

        if (idx == 0) {

            if (m_stream_index_zero_set) {

                AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stream_index_zero == s,

                    "ReduceData cannot be reused across different external GPU streams "

                    "or between an external GPU stream and AMReX stream 0.");

            } else {

                // First stream seen at index 0: remember it for later checks.
                m_stream_index_zero = s;

                m_stream_index_zero_set = true;

            }

        }

        return idx;

    }


    int m_max_blocks;                       // tuples reserved per stream index

    int m_max_stream_index = 0;             // largest stream index used so far

    Type* m_host_tuple = nullptr;           // result tuple (pinned arena)

    Type* m_device_tuple = nullptr;         // partial results (The_Arena)

    GpuArray<int,AMREX_GPU_MAX_STREAMS> m_nblocks;   // slots in use per stream index

    gpuStream_t m_stream_index_zero{};      // first stream seen with index 0

    bool m_stream_index_zero_set = false;   // whether m_stream_index_zero is valid

    bool m_used_external_stream = false;    // an external stream was active at some use

    bool m_value_called = false;            // result has been retrieved

    std::function<Type()> m_fn_value;       // calls value(reduce_op) for the bound ReduceOps

};


namespace Reduce::detail {


    // call_f_intvect_box
    //
    // Two overloads selected by expression SFINAE on the trailing return type:
    // the first is viable when f can be invoked through
    // amrex::detail::call_f_intvect_inner with iv, the second when f can be
    // called with a BoxND<dim>.


    // Invoke f through call_f_intvect_inner with iv; the index type is ignored.
    template <typename F, int dim>

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE

    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->

        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))

    {

        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);

    }


    // Invoke f with the single-cell box [iv,iv] of index type t.
    template <typename F, int dim>

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE

    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->

        decltype(f(BoxND<dim>(iv, iv, t)))

    {

        return f(BoxND<dim>(iv, iv, t));

    }


    // call_f_intvect_n
    //
    // Invoke f through call_f_intvect_inner with iv and the extra argument n
    // (used by callers as the component index).

    template <typename F, typename T, int dim>

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE

    auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->

        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))

    {

        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);

    }


    // mf_call_f
    //
    // Tag types choosing between one call per cell (iterate_box) and one call
    // per cell and component (iterate_box_comp).


    struct iterate_box {};

    struct iterate_box_comp {};


    // iterate_box: call f(ibox,i,j,k) once and fold the result into r.
    // The unnamed int parameter (ncomp) is unused.
    template <typename I, typename F, typename T, typename... Ps>

    requires (std::same_as<iterate_box,I>)

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE

    void mf_call_f (F const& f, int ibox, int i, int j, int k, int, T& r) noexcept

    {

        auto const& pr = f(ibox,i,j,k);

        Reduce::detail::for_each_local<0, T, Ps...>(r, pr);

    }


    // iterate_box_comp: call f(ibox,i,j,k,n) for n in [0,ncomp) and fold
    // each result into r.
    template <typename I, typename F, typename T, typename... Ps>

    requires (std::same_as<iterate_box_comp,I>)

    AMREX_GPU_DEVICE AMREX_FORCE_INLINE

    void mf_call_f (F const& f, int ibox, int i, int j, int k, int ncomp, T& r) noexcept

    {

        for (int n = 0; n < ncomp; ++n) {

            auto const& pr = f(ibox,i,j,k,n);

            Reduce::detail::for_each_local<0, T, Ps...>(r, pr);

        }

    }

}


template <typename... Ps>


class ReduceOps

{

public:


    // This is public for CUDA
    //
    // Fused reduction over all local boxes of mf (grown by nghost) in a
    // single kernel launch on the current GPU stream.
    //
    //  I            tag type (iterate_box or iterate_box_comp) forwarded to mf_call_f
    //  ncomp        number of components, forwarded to mf_call_f
    //  reduce_data  receives one partial tuple per launched block
    //  f            callable invoked as f(ibox,i,j,k) or f(ibox,i,j,k,n)
    //
    // Does nothing when mf has no local boxes.

    template <typename I, typename MF, typename D, typename F>

    void eval_mf (I, MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F const& f)

    {

        using ReduceTuple = typename D::Type;

        const int nboxes = mf.local_size();

        if (nboxes > 0) {

            auto const& parforinfo = mf.getParForInfo(nghost);

            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);

            AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));

            // Total logical blocks over all boxes.
            const int nblocks = nblocks_per_box * nboxes;

            const BoxIndexer* dp_boxes = parforinfo.getBoxes();


            auto const& stream = Gpu::gpuStream();

            auto pdst = reduce_data.devicePtr(stream);

            // Blocks actually launched: capped by the slots available per stream.
            int nblocks_ec = std::min(nblocks, reduce_data.maxBlocks());

            AMREX_ASSERT(Long(nblocks_ec)*2 <= Long(std::numeric_limits<int>::max()));

            int& nblocks_ref = reduce_data.nBlocks(stream);

            // Slots at or beyond this index have not been initialized yet.
            auto old_nblocks = static_cast<unsigned int>(nblocks_ref);

            nblocks_ref = amrex::max(nblocks_ref, nblocks_ec);

            reduce_data.updateMaxStreamIndex(stream);


#ifdef AMREX_USE_SYCL

            // device reduce needs local(i.e., shared) memory

            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;

            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,

                          [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept

            {

                // Local stand-ins so the shared kernel body below can use
                // the blockIdx/threadIdx names.
                Dim1 blockIdx {gh.blockIdx()};

                Dim1 threadIdx{gh.threadIdx()};

#else

            amrex::launch_global<AMREX_GPU_MAX_THREADS>

                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>

                ([=] AMREX_GPU_DEVICE () noexcept

            {

#endif

                // Per-thread accumulator, starting from each operator's initial value.
                ReduceTuple r;

                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);

                ReduceTuple& dst = pdst[blockIdx.x];

                // First use of this slot: thread 0 initializes it. Thread 0 is
                // also the only thread that later writes dst in parallel_update.
                if (threadIdx.x == 0 && blockIdx.x >= old_nblocks) {

                    dst = r;

                }

                // Each launched block processes logical blocks
                // blockIdx.x, blockIdx.x + nblocks_ec, ...
                for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {

                    int ibox = iblock / nblocks_per_box;

                    // Cell index of this thread within box ibox.
                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;


                    BoxIndexer const& indexer = dp_boxes[ibox];

                    if (icell < indexer.numPts()) {

                        auto [i, j, k] = indexer(icell);

                        Reduce::detail::mf_call_f<I, F, ReduceTuple, Ps...>

                            (f, ibox, i, j, k, ncomp, r);

                    }

                }

                // Combine the per-thread values across the block into dst.
#ifdef AMREX_USE_SYCL

                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);

#else

                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);

#endif

            });

        }

    }


    // This is public for CUDA
    //
    // Reduction over the cells of one box in a single kernel launch on the
    // current GPU stream.
    //
    //  I            tag type: iterate_box calls f once per cell,
    //               anything else calls f once per cell and component
    //  ncomp        number of components (only used in the per-component case)
    //  reduce_data  receives one partial tuple per launched block
    //  f            callable invoked through call_f_intvect_box / call_f_intvect_n
    //
    // The block count targets nitems_per_thread cells per thread and is
    // capped at reduce_data.maxBlocks().

    template <typename I, int dim, typename D, typename F>

    void eval_box (I, BoxND<dim> const& box, int ncomp, D& reduce_data, F const& f)

    {

        using ReduceTuple = typename D::Type;

        auto const& stream = Gpu::gpuStream();

        auto dp = reduce_data.devicePtr(stream);

        // The [=] lambda below captures the value this holds at launch time,
        // i.e. the number of slots already initialized by earlier calls.
        int& nblocks = reduce_data.nBlocks(stream);

        const BoxIndexerND<dim> indexer(box);

        IndexTypeND<dim> ixtype = box.ixType();

        constexpr int nitems_per_thread = 4;

        // Ceiling division, done in Long so large boxes do not overflow int.
        Long nblocks_ec = (box.numPts() + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)

            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);

        nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());

        reduce_data.updateMaxStreamIndex(stream);

#ifdef AMREX_USE_SYCL

        // device reduce needs local(i.e., shared) memory

        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;

        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,

        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept

        {

            // Local stand-ins so the shared kernel body below can use the
            // blockIdx/threadIdx/gridDim names.
            Dim1 blockIdx {gh.blockIdx()};

            Dim1 threadIdx{gh.threadIdx()};

            Dim1 gridDim  {gh.gridDim()};

#else

        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,

        [=] AMREX_GPU_DEVICE () noexcept

        {

#endif

            // Per-thread accumulator, starting from each operator's initial value.
            ReduceTuple r;

            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);

            ReduceTuple& dst = *(dp+blockIdx.x);

            // First use of this slot: thread 0 initializes it.
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {

                dst = r;

            }

            // Each thread visits cells icell, icell + stride, ... where stride
            // is the total number of launched threads.
            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,

                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;

                 icell < indexer.numPts();

                 icell += stride)

            {

                auto iv = indexer.intVect(icell);

                amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture

                if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {

                    auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);

                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);

                } else {

                    for (int n = 0; n < ncomp; ++n) {

                        auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);

                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);

                    }

                }

            }

            // Combine the per-thread values across the block into dst.
#ifdef AMREX_USE_SYCL

            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);

#else

            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);

#endif

        });

        // Record how many slots of this stream now hold valid partial results.
        nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));

    }


    // Reduce f(local_box_index, i, j, k) over every cell of every local box
    // of mf, each box grown by nghost.  When mf is a fusing candidate all
    // boxes are handled by one eval_mf launch; otherwise each box gets its
    // own box-level eval.  Nothing happens if mf has no local boxes.
    template <FabArrayType MF, typename D, typename F>
#ifndef AMREX_USE_CUDA
    requires (IsCallable<F, int, int, int, int>::value)
#endif
    void eval (MF const& mf, IntVect const& nghost, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;

        if (mf.local_size() == 0) { return; }

        if (mf.isFusingCandidate()) {
            eval_mf(Reduce::detail::iterate_box{},
                    mf, nghost, 0, reduce_data, std::forward<F>(f));
            return;
        }

        for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
            Box const grown_box = amrex::grow(mfi.validbox(), nghost);
            int const local_index = mfi.LocalIndex();
            // Bind the local box index so f sees the same arguments as in
            // the fused path.
            this->eval(grown_box, reduce_data,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept -> ReduceTuple
            {
                return f(local_index, i, j, k);
            });
        }
    }


    // Reduce f(local_box_index, i, j, k, n) over every cell of every local
    // box of mf (grown by nghost) and every component n in [0,ncomp).  When
    // mf is a fusing candidate all boxes are handled by one eval_mf launch;
    // otherwise each box gets its own box-level eval.  Nothing happens if mf
    // has no local boxes.
    template <FabArrayType MF, typename D, typename F>
#ifndef AMREX_USE_CUDA
    requires (IsCallable<F, int, int, int, int, int>::value)
#endif
    void eval (MF const& mf, IntVect const& nghost, int ncomp, D& reduce_data, F&& f)
    {
        using ReduceTuple = typename D::Type;

        if (mf.local_size() == 0) { return; }

        if (mf.isFusingCandidate()) {
            eval_mf(Reduce::detail::iterate_box_comp{},
                    mf, nghost, ncomp, reduce_data, std::forward<F>(f));
            return;
        }

        for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
            Box const grown_box = amrex::grow(mfi.validbox(), nghost);
            int const local_index = mfi.LocalIndex();
            // Bind the local box index so f sees the same arguments as in
            // the fused path.
            this->eval(grown_box, ncomp, reduce_data,
            [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept -> ReduceTuple
            {
                return f(local_index, i, j, k, n);
            });
        }
    }


    // Reduce f over the cells of `box`, one call per cell.  Thin wrapper
    // around eval_box with the iterate_box tag; the component count is
    // unused in that mode and passed as 0.
    template <typename D, typename F, int dim>
    void eval (BoxND<dim> const& box, D & reduce_data, F const& f)
    {
        using PerCell = Reduce::detail::iterate_box;
        this->eval_box(PerCell{}, box, 0, reduce_data, f);
    }


    // Reduce f over the cells of `box` and the components [0,ncomp), one
    // call per (cell, component).  Thin wrapper around eval_box with the
    // iterate_box_comp tag; ncomp is converted to eval_box's int parameter.
    template <std::integral N, typename D, typename F, int dim>
    void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)
    {
        using PerCellAndComp = Reduce::detail::iterate_box_comp;
        this->eval_box(PerCellAndComp{}, box, ncomp, reduce_data, f);
    }


    /**
     * Reduce f(i) over the 1D index range [0,n) in a single kernel launch on
     * the current GPU stream.
     *
     * \param n            number of indices; nothing is done when n <= 0
     * \param reduce_data  receives one partial tuple per launched block
     * \param f            device callable returning a ReduceTuple for index i
     *
     * The block count targets nitems_per_thread indices per thread and is
     * capped at reduce_data.maxBlocks().  It is computed in Long and clamped
     * *before* narrowing to int (as eval_box does), so a 64-bit n cannot
     * truncate to a zero or negative block count, and no addition is done
     * on n that could overflow a narrow N.
     *
     * Assumes n fits in Long.
     * NOTE(review): the loop index below still has type N, so `i += stride`
     * can overflow if n is within one stride of N's maximum.
     */
    template <std::integral N, typename D, typename F>
    void eval (N n, D & reduce_data, F const& f)
    {
        if (n <= 0) { return; }
        using ReduceTuple = typename D::Type;
        auto const& stream = Gpu::gpuStream();
        auto dp = reduce_data.devicePtr(stream);
        // The [=] lambda below captures the value this holds at launch time,
        // i.e. the number of slots already initialized by earlier calls.
        int& nblocks = reduce_data.nBlocks(stream);
        constexpr int nitems_per_thread = 4;
        constexpr Long nitems_per_block = Long(nitems_per_thread)*AMREX_GPU_MAX_THREADS;
        // ceil(n / nitems_per_block) for n >= 1, written without "n + ...".
        Long const nblocks_needed = (Long(n) - 1) / nitems_per_block + 1;
        int const nblocks_ec = static_cast<int>
            (std::min<Long>(nblocks_needed, reduce_data.maxBlocks()));
        reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
        // device reduce needs local(i.e., shared) memory
        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
        {
            // Local stand-ins so the shared kernel body below can use the
            // blockIdx/threadIdx/gridDim names.
            Dim1 blockIdx {gh.blockIdx()};
            Dim1 threadIdx{gh.threadIdx()};
            Dim1 gridDim  {gh.gridDim()};
#else
        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
        [=] AMREX_GPU_DEVICE () noexcept
        {
#endif
            // Per-thread accumulator, starting from each operator's initial value.
            ReduceTuple r;
            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
            ReduceTuple& dst = *(dp+blockIdx.x);
            // First use of this slot: thread 0 initializes it.
            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                dst = r;
            }
            // Each thread visits i, i + stride, ... where stride is the
            // total number of launched threads.
            for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
                 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
                 i < n;
                 i += stride)
            {
                auto pr = f(i);
                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
            }
            // Combine the per-thread values across the block into dst.
#ifdef AMREX_USE_SYCL
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
#else
            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
#endif
        });
        // Record how many slots of this stream now hold valid partial results.
        nblocks = amrex::max(nblocks, nblocks_ec);
    }


    /**
     * Combine all partial results stored in reduce_data into the final tuple
     * and return it.
     *
     * The result is cached: once computed, later calls return the host tuple
     * directly until a new ReduceData resets the readiness flag. Otherwise
     * the partial tuples of every stream index are merged (on the host for
     * small SYCL cases, else by a one-block kernel), the current stream is
     * synchronized, and the host tuple is returned.
     */
    template <typename D>


    typename D::Type value (D & reduce_data)

    {

        auto hp = reduce_data.hostPtr();


        // Cached result from an earlier call.
        if (m_result_is_ready) {

            reduce_data.markValueCalled();

            return *hp;

        }


        using ReduceTuple = typename D::Type;

        auto const& stream = Gpu::gpuStream();

        auto dp = reduce_data.devicePtr();

        // Per-stream-index slot counts; the [=] kernels below capture a copy.
        auto const& nblocks = reduce_data.nBlocks();

#if defined(AMREX_USE_SYCL)

        // Small single-stream case: copy the partials to the host and merge there.
        if (reduce_data.maxStreamIndex() == 0 && nblocks[0] <= 4096) {

            const int N = nblocks[0];

            if (N == 0) {

                // Nothing was evaluated: the result is each operator's initial value.
                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(*hp);

            } else {

                Gpu::PinnedVector<ReduceTuple> tmp(N);

                Gpu::dtoh_memcpy_async(tmp.data(), dp, sizeof(ReduceTuple)*N);

                Gpu::streamSynchronize();

                // Fold tmp[1..N) into tmp[0].
                for (int i = 1; i < N; ++i) {

                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(tmp[0], tmp[i]);

                }

                *hp = tmp[0];

            }

        } else

#endif

        {

            int maxblocks = reduce_data.maxBlocks();

#ifdef AMREX_USE_SYCL

            // device reduce needs local(i.e., shared) memory

            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;

#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND

            // xxxxx SYCL todo: reduce bug workaround
            // The kernel writes to a device temporary that is copied to hp afterwards.

            Gpu::DeviceVector<ReduceTuple> dtmp(1);

            auto presult = dtmp.data();

#else

            auto presult = hp;

#endif

            // One block merges the partials of all stream indices.
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,

            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept

            {

                ReduceTuple r;

                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);

                ReduceTuple dst = r;

                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {

                    // Start of the run of partials belonging to this stream index.
                    auto dp_stream = dp+istream*maxblocks;

                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);

                         i < nblocks[istream]; i += stride) {

                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);

                    }

                }

                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);

                if (gh.threadIdx() == 0) { *presult = dst; }

            });

#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND

            Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));

#endif

#else

            // One block merges the partials of all stream indices.
            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,

            [=] AMREX_GPU_DEVICE () noexcept

            {

                ReduceTuple r;

                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);

                ReduceTuple dst = r;

                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {

                    // Start of the run of partials belonging to this stream index.
                    auto dp_stream = dp+istream*maxblocks;

                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;

                         i < nblocks[istream]; i += stride) {

                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);

                    }

                }

                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);

                // The kernel writes the final tuple straight through the host pointer.
                if (threadIdx.x == 0) { *hp = dst; }

            });

#endif

            // Wait for the kernel (and any copy) before reading *hp on the host.
            Gpu::streamSynchronize();

        }


        m_result_is_ready = true;

        reduce_data.markValueCalled();

        return *hp;

    }


private:

    // ReduceData's constructor calls resetResultReadiness().
    template <typename... T> friend class ReduceData;

    // True once value() has computed the result; value() then returns the
    // host tuple without recomputing.
    bool m_result_is_ready = false;

    // Clear the cached-result flag so the next value() call recomputes.
    void resetResultReadiness () { m_result_is_ready = false; }

};


namespace Reduce {


// Sum of v[0..n) computed with ReduceOps<ReduceOpSum>; init_val is added to
// the reduced value on the host.  v is read inside a device lambda.
template <typename T, std::integral N>
T Sum (N n, T const* v, T init_val)
{
    using ReduceTuple = typename ReduceData<T>::Type;

    ReduceOps<ReduceOpSum> ops;
    ReduceData<T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple { return {v[idx]}; });

    ReduceTuple total = partials.value(ops);
    return amrex::get<0>(total) + init_val;
}


// Sum of f(i) for i in [0,n) computed with ReduceOps<ReduceOpSum>; init_val
// is added to the reduced value on the host.  f is copied into a device
// lambda.
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
T Sum (N n, F const& f, T init_val)
{
    using ReduceTuple = typename ReduceData<T>::Type;

    ReduceOps<ReduceOpSum> ops;
    ReduceData<T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple { return {f(idx)}; });

    ReduceTuple total = partials.value(ops);
    return amrex::get<0>(total) + init_val;
}


// Minimum of v[0..n) computed with ReduceOps<ReduceOpMin>; the result is
// further limited by init_val on the host.  v is read inside a device lambda.
template <typename T, std::integral N>
T Min (N n, T const* v, T init_val)
{
    using ReduceTuple = typename ReduceData<T>::Type;

    ReduceOps<ReduceOpMin> ops;
    ReduceData<T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple { return {v[idx]}; });

    ReduceTuple smallest = partials.value(ops);
    return std::min(amrex::get<0>(smallest),init_val);
}


// Minimum of f(i) for i in [0,n) computed with ReduceOps<ReduceOpMin>; the
// result is further limited by init_val on the host.  f is copied into a
// device lambda.
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
T Min (N n, F const& f, T init_val)
{
    using ReduceTuple = typename ReduceData<T>::Type;

    ReduceOps<ReduceOpMin> ops;
    ReduceData<T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple { return {f(idx)}; });

    ReduceTuple smallest = partials.value(ops);
    return std::min(amrex::get<0>(smallest),init_val);
}


// Maximum of v[0..n) computed with ReduceOps<ReduceOpMax>; the result is
// further raised to at least init_val on the host.  v is read inside a
// device lambda.
template <typename T, std::integral N>
T Max (N n, T const* v, T init_val)
{
    using ReduceTuple = typename ReduceData<T>::Type;

    ReduceOps<ReduceOpMax> ops;
    ReduceData<T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple { return {v[idx]}; });

    ReduceTuple largest = partials.value(ops);
    return std::max(amrex::get<0>(largest),init_val);
}


// Maximum of f(i) for i in [0,n) computed with ReduceOps<ReduceOpMax>; the
// result is further raised to at least init_val on the host.  f is copied
// into a device lambda.
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
T Max (N n, F const& f, T init_val)
{
    using ReduceTuple = typename ReduceData<T>::Type;

    ReduceOps<ReduceOpMax> ops;
    ReduceData<T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple { return {f(idx)}; });

    ReduceTuple largest = partials.value(ops);
    return std::max(amrex::get<0>(largest),init_val);
}


// Minimum and maximum of v[0..n) in one pass: each element feeds both the
// ReduceOpMin slot and the ReduceOpMax slot.  Returns {min, max}.
template <typename T, std::integral N>
std::pair<T,T> MinMax (N n, T const* v)
{
    using ReduceTuple = typename ReduceData<T,T>::Type;

    ReduceOps<ReduceOpMin,ReduceOpMax> ops;
    ReduceData<T,T> partials(ops);

    ops.eval(n, partials,
             [=] AMREX_GPU_DEVICE (N idx) -> ReduceTuple
             {
                 return {v[idx],v[idx]};
             });

    auto extrema = partials.value(ops);
    return std::make_pair(amrex::get<0>(extrema), amrex::get<1>(extrema));
}


// Minimum and maximum of the values f(i) in a single reduction. f is called
// once per index and its result is used for both tuple components.
//   n : index count forwarded to ReduceOps::eval
//   f : callable evaluated on the device
// Returns {minimum, maximum} as a std::pair.
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
std::pair<T,T> MinMax (N n, F const& f)
{
    ReduceOps<ReduceOpMin,ReduceOpMax> minmax_op;
    ReduceData<T,T> minmax_data(minmax_op);
    using Tuple = typename ReduceData<T,T>::Type;

    minmax_op.eval(n, minmax_data,
                   [=] AMREX_GPU_DEVICE (N i) -> Tuple {
                       T sample = f(i);   // evaluate f only once per index
                       return {sample,sample};
                   });

    auto result = minmax_data.value(minmax_op);
    return std::make_pair(amrex::get<0>(result), amrex::get<1>(result));
}


// Tests on the device whether pred(v[i]) is true for some i < n.
//
// A device-resident int flag (initialized to 0) records the answer. Every
// block first copies the flag into block-shared storage and, after a barrier,
// skips its scan when the flag is already set. Otherwise each thread walks its
// strided share of the indices until it finds a hit, the per-thread hits are
// summed over the block, and thread 0 sets the flag when the sum is non-zero.
//   n    : number of elements; also sizes the launch configuration
//   v    : array read inside the kernel
//   pred : unary predicate applied to v[i] inside the kernel
// Returns the flag value read back through DeviceScalar::dataValue().
template <typename T, std::integral N, typename P>
bool AnyOf (N n, T const* v, P const& pred)
{
    Gpu::LaunchSafeGuard launch_guard(true);
    Gpu::DeviceScalar<int> found(0);
    int* found_ptr = found.dataPtr();
    auto cfg = Gpu::ExecutionConfig(n);
    cfg.numBlocks.x = std::min(cfg.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    // One extra int at the end of the shared buffer holds the block's copy of
    // the flag (slot num_ints-1).
    const int num_ints = std::max(Gpu::Device::warp_size, int(cfg.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t smem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(cfg.numBlocks.x, smem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* already_found = static_cast<int*>(gh.sharedMemory()) + (num_ints-1);
        if (gh.threadIdx() == 0) { *already_found = *found_ptr; }
        gh.sharedBarrier();

        if (!(*already_found))
        {
            int hit = 0;
            N const step = AMREX_GPU_MAX_THREADS*gh.gridDim();
            for (N idx = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx();
                 idx < n && !hit; idx += step)
            {
                hit = pred(v[idx]) ? 1 : 0;
            }

            hit = Gpu::blockReduce<Gpu::Device::warp_size>
                (hit, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && hit) { *found_ptr = 1; }
        }
    });
#else
    amrex::launch<AMREX_GPU_MAX_THREADS>(cfg.numBlocks.x, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        __shared__ int already_found;
        if (threadIdx.x == 0) { already_found = *found_ptr; }
        __syncthreads();

        // already_found is block-shared, so the whole block takes the same branch.
        if (!already_found)
        {
            int hit = 0;
            N const step = AMREX_GPU_MAX_THREADS*gridDim.x;
            for (N idx = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x;
                 idx < n && !hit; idx += step)
            {
                hit = pred(v[idx]) ? 1 : 0;
            }
            hit = Gpu::blockReduce<Gpu::Device::warp_size>
                (hit, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && hit) { *found_ptr = 1; }
        }
    });
#endif
    return found.dataValue();
}


// Tests on the device whether pred is true for some cell of box.
//
// Same scheme as the array overload: a device int flag starts at 0, each block
// copies it into block-shared storage and skips its scan if it is already set;
// otherwise each thread visits its strided share of the flattened cell indices
// (64-bit), converts each to an IntVectND through BoxIndexerND, and evaluates
// pred via amrex::detail::call_f_intvect. Per-thread hits are summed over the
// block and thread 0 sets the flag when the sum is non-zero.
//   box  : cells to test; box.numPts() sizes the launch configuration
//   pred : predicate invoked through call_f_intvect for each visited cell
// Returns the flag value read back through DeviceScalar::dataValue().
template <typename P, int dim>
bool AnyOf (BoxND<dim> const& box, P const& pred)
{
    Gpu::LaunchSafeGuard launch_guard(true);
    Gpu::DeviceScalar<int> found(0);
    int* found_ptr = found.dataPtr();
    const BoxIndexerND<dim> cells(box);
    auto cfg = Gpu::ExecutionConfig(box.numPts());
    cfg.numBlocks.x = std::min(cfg.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());

#ifdef AMREX_USE_SYCL
    // One extra int at the end of the shared buffer holds the block's copy of
    // the flag (slot num_ints-1).
    const int num_ints = std::max(Gpu::Device::warp_size, int(cfg.numThreads.x)/Gpu::Device::warp_size) + 1;
    const std::size_t smem_bytes = num_ints*sizeof(int);
    amrex::launch<AMREX_GPU_MAX_THREADS>(cfg.numBlocks.x, smem_bytes, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
        int* already_found = static_cast<int*>(gh.sharedMemory()) + (num_ints-1);
        if (gh.threadIdx() == 0) { *already_found = *found_ptr; }
        gh.sharedBarrier();

        if (!(*already_found))
        {
            int hit = 0;
            std::uint64_t const step = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
            for (std::uint64_t flat = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx();
                 flat < cells.numPts() && !hit;
                 flat += step)
            {
                auto cell = cells.intVect(flat);
                hit = amrex::detail::call_f_intvect(pred, cell) ? 1 : 0;
            }
            hit = Gpu::blockReduce<Gpu::Device::warp_size>
                (hit, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
            if (gh.threadIdx() == 0 && hit) { *found_ptr = 1; }
        }
    });
#else
    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, cfg.numBlocks, cfg.numThreads, 0,
                        Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        __shared__ int already_found;
        if (threadIdx.x == 0) { already_found = *found_ptr; }
        __syncthreads();

        // already_found is block-shared, so the whole block takes the same branch.
        if (!already_found)
        {
            int hit = 0;
            std::uint64_t const step = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
            for (std::uint64_t flat = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
                 flat < cells.numPts() && !hit;
                 flat += step)
            {
                auto cell = cells.intVect(flat);
                hit = amrex::detail::call_f_intvect(pred, cell) ? 1 : 0;
            }
            hit = Gpu::blockReduce<Gpu::Device::warp_size>
                (hit, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
            if (threadIdx.x == 0 && hit) { *found_ptr = 1; }
        }
    });
#endif
    return found.dataValue();
}


}


#else


// Host-side storage for partial reduction results.
//
// Holds a vector of reduce tuples: a single slot when the object is built
// inside an OpenMP parallel region, otherwise one slot per
// OpenMP::get_max_threads(). Every slot is initialized with for_each_init for
// the operations Ps... of the ReduceOps passed to the constructor. The object
// keeps a reference to that ReduceOps (captured in a std::function) so that
// value() can be called without arguments; the ReduceOps must therefore
// outlive any such call. Neither copyable nor movable.
template <typename... Ts>
class ReduceData
{
public:
    using Type = GpuTuple<Ts...>;

    // Sizes and initializes the slots, and clears the "result ready" state of
    // reduce_op.
    template <typename... Ps>
    explicit ReduceData (ReduceOps<Ps...>& reduce_op)
        : m_slots(OpenMP::in_parallel() ? 1 : OpenMP::get_max_threads()),
          m_value_fn([&reduce_op,this] () -> Type { return this->value(reduce_op); })
    {
        reduce_op.resetResultReadiness();
        for (auto& slot : m_slots) {
            Reduce::detail::for_each_init<0, Type, Ps...>(slot);
        }
    }

    ~ReduceData () = default;
    ReduceData (ReduceData<Ts...> const&) = delete;
    ReduceData (ReduceData<Ts...> &&) = delete;
    void operator= (ReduceData<Ts...> const&) = delete;
    void operator= (ReduceData<Ts...> &&) = delete;

    // Final result, obtained through the ReduceOps given at construction.
    Type value () { return m_value_fn(); }

    // Final result, obtained by delegating to reduce_op.value(*this).
    template <typename... Ps>
    Type value (ReduceOps<Ps...>& reduce_op)
    {
        return reduce_op.value(*this);
    }

    // All slots.
    Vector<Type>& reference () { return m_slots; }

    // Slot for thread tid. With a single slot (no OpenMP, or construction
    // inside an OpenMP parallel region) tid is ignored and slot 0 is returned.
    Type& reference (int tid)
    {
        return (m_slots.size() == 1) ? m_slots[0] : m_slots[tid];
    }

private:
    Vector<Type> m_slots;
    std::function<Type()> m_value_fn;
};


namespace Reduce::detail {


    // call_f_intvect
    //
    // Host-side helper: forwards f and the index iv to
    // amrex::detail::call_f_intvect_inner together with an index sequence of
    // length dim, and returns whatever that call returns. The return type is
    // spelled as a trailing decltype of the very same expression, so this
    // overload is only viable when that expression is well-formed
    // (ReduceOps::call_f_box relies on this in its requires-clause).


    template <typename F, int dim>

    AMREX_FORCE_INLINE

    auto call_f_intvect (F const& f, IntVectND<dim> iv) noexcept ->

        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))

    {

        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);

    }


    // call_f_intvect_n
    //
    // Same as call_f_intvect, but additionally forwards n (used by callers in
    // this file as a component index) as the last argument of
    // amrex::detail::call_f_intvect_inner.


    template <typename F, typename T, int dim>

    AMREX_FORCE_INLINE

    auto call_f_intvect_n (F const& f, IntVectND<dim> iv, T n) noexcept ->

        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n))

    {

        return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, n);

    }

}


template <typename... Ps>

class ReduceOps

{

private:


    // call_f_box


    template <typename D, typename F, int dim>

    requires (std::same_as<std::decay_t<decltype(

                  Reduce::detail::call_f_intvect(std::declval<F const&>(), IntVectND<dim>()))>,

              typename D::Type>)

    AMREX_FORCE_INLINE

    static void call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f) noexcept

    {

        using ReduceTuple = typename D::Type;

        For(box,

            [&] (IntVectND<dim> iv) {

                auto pr = Reduce::detail::call_f_intvect(f, iv);

                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);

            });

    }


    template <typename D, typename F, int dim>

    requires (std::same_as<std::decay_t<decltype(

                  std::declval<F const&>()(std::declval<BoxND<dim> const&>()))>,

              typename D::Type>)

    AMREX_FORCE_INLINE

    static void call_f_box (BoxND<dim> const& box, typename D::Type & r, F const& f) noexcept

    {

        using ReduceTuple = typename D::Type;

        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, f(box));

    }


public:


    template <FabArrayType MF, typename D, typename F>

    requires (IsCallable<F, int, int, int, int>::value)

    void eval (MF const& mf, IntVect const& nghost, D & reduce_data, F const& f)

    {

        using ReduceTuple = typename D::Type;

#ifdef AMREX_USE_OMP

#pragma omp parallel

#endif

        {

            ReduceTuple rr;

            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);

            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {

                Box const& b = mfi.growntilebox(nghost);

                const int li = mfi.LocalIndex();

                const auto lo = amrex::lbound(b);

                const auto hi = amrex::ubound(b);

                for (int k = lo.z; k <= hi.z; ++k) {

                for (int j = lo.y; j <= hi.y; ++j) {

                for (int i = lo.x; i <= hi.x; ++i) {

                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k));

                }}}

            }

            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(

                reduce_data.reference(OpenMP::get_thread_num()), rr);

        }

    }


    template <FabArrayType MF, typename D, typename F>

    requires (IsCallable<F, int, int, int, int, int>::value)

    void eval (MF const& mf, IntVect const& nghost, int ncomp, D & reduce_data, F const& f)

    {

        using ReduceTuple = typename D::Type;

#ifdef AMREX_USE_OMP

#pragma omp parallel

#endif

        {

            ReduceTuple rr;

            Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);

            for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) {

                Box const& b = mfi.growntilebox(nghost);

                const int li = mfi.LocalIndex();

                const auto lo = amrex::lbound(b);

                const auto hi = amrex::ubound(b);

                for (int n = 0; n < ncomp; ++n) {

                for (int k = lo.z; k <= hi.z; ++k) {

                for (int j = lo.y; j <= hi.y; ++j) {

                for (int i = lo.x; i <= hi.x; ++i) {

                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(li,i,j,k,n));

                }}}}

            }

            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(

                reduce_data.reference(OpenMP::get_thread_num()), rr);

        }

    }


    template <typename D, typename F, int dim>

    void eval (BoxND<dim> const& box, D & reduce_data, F&& f)

    {

        using ReduceTuple = typename D::Type;

        ReduceTuple rr;

        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);

        call_f_box<D>(box, rr, std::forward<F>(f));

        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(

             reduce_data.reference(OpenMP::get_thread_num()), rr);

    }


    template <std::integral N, typename D, typename F, int dim>

    void eval (BoxND<dim> const& box, N ncomp, D & reduce_data, F const& f)

    {

        using ReduceTuple = typename D::Type;

        ReduceTuple rr;

        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);

        For(box, ncomp,

            [&] (IntVectND<dim> iv, int n) {

                auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);

                Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, pr);

            });

        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(

             reduce_data.reference(OpenMP::get_thread_num()), rr);

    }


    template <std::integral N, typename D, typename F>

    void eval (N n, D & reduce_data, F const& f)

    {

        using ReduceTuple = typename D::Type;

        ReduceTuple rr;

        Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(rr);

        for (N i = 0; i < n; ++i) {

            Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rr, f(i));

        }

        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(

             reduce_data.reference(OpenMP::get_thread_num()), rr);

    }


    template <typename D>

    typename D::Type value (D & reduce_data)

    {

        auto& rrv = reduce_data.reference();

        if (! m_result_is_ready) {

            using ReduceTuple = typename D::Type;

            if (rrv.size() > 1) {

                for (int i = 1, N = rrv.size(); i < N; ++i) {

                    Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(rrv[0], rrv[i]);

                }

            }

            m_result_is_ready = true;

        }

        return rrv[0];

    }


private:

    template <typename... T> friend class ReduceData;

    bool m_result_is_ready = false;

    void resetResultReadiness () { m_result_is_ready = false; }

};


namespace Reduce {


// Host implementation: returns init_val plus the sum of f(i) for i in [0, n).
// With AMREX_USE_OMP the loop is an OpenMP "parallel for" with a + reduction.
//   n        : number of indices
//   f        : callable invoked as f(i)
//   init_val : starting value of the accumulator
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
T Sum (N n, F const& f, T init_val)
{
    T total = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(+:total)
#endif
    for (N idx = 0; idx < n; ++idx) {
        total += f(idx);
    }
    return total;
}


// Host implementation: returns init_val plus the sum of v[0..n-1] by
// delegating to the functor overload of Sum with an element accessor.
template <typename T, std::integral N>
T Sum (N n, T const* v, T init_val)
{
    auto const element_at = [=] (N i) -> T { return v[i]; };
    return Sum(n, element_at, init_val);
}


// Host implementation: returns the minimum of init_val and f(i) for i in
// [0, n). With AMREX_USE_OMP the loop is an OpenMP "parallel for" with a min
// reduction.
//   n        : number of indices; for n <= 0 the result is init_val
//   f        : callable invoked as f(i); its result is converted to T
//   init_val : starting value of the running minimum
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
T Min (N n, F const& f, T init_val)
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:r)
#endif
    for (N i = 0; i < n; ++i) {
        // Convert to T before comparing: std::min needs both arguments to have
        // the same type, and the sibling MinMax converts the same way.
        T const val = f(i);
        r = std::min(r, val);
    }
    return r;
}


// Host implementation: returns the minimum of init_val and v[0..n-1] by
// delegating to the functor overload of Reduce::Min with an element accessor.
template <typename T, std::integral N>
T Min (N n, T const* v, T init_val)
{
    auto const element_at = [=] (N i) -> T { return v[i]; };
    return Reduce::Min(n, element_at, init_val);
}


// Host implementation: returns the maximum of init_val and f(i) for i in
// [0, n). With AMREX_USE_OMP the loop is an OpenMP "parallel for" with a max
// reduction.
//   n        : number of indices; for n <= 0 the result is init_val
//   f        : callable invoked as f(i); its result is converted to T
//   init_val : starting value of the running maximum
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
T Max (N n, F const& f, T init_val)
{
    T r = init_val;
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(max:r)
#endif
    for (N i = 0; i < n; ++i) {
        // Convert to T before comparing: std::max needs both arguments to have
        // the same type, and the sibling MinMax converts the same way.
        T const val = f(i);
        r = std::max(r, val);
    }
    return r;
}


// Host implementation: returns the maximum of init_val and v[0..n-1] by
// delegating to the functor overload of Reduce::Max with an element accessor.
template <typename T, std::integral N>
T Max (N n, T const* v, T init_val)
{
    auto const element_at = [=] (N i) -> T { return v[i]; };
    return Reduce::Max(n, element_at, init_val);
}


// Host implementation: returns {min, max} of f(i) for i in [0, n).
// The running minimum starts at numeric_limits<T>::max() and the running
// maximum at numeric_limits<T>::lowest(), so those are the values returned
// when the loop body never runs. f is called once per index. With
// AMREX_USE_OMP the loop is an OpenMP "parallel for" with min and max
// reductions.
template <typename T, std::integral N, typename F>
requires (!std::same_as<T*,std::decay_t<F>>)
std::pair<T,T> MinMax (N n, F const& f)
{
    T lowest_seen = std::numeric_limits<T>::max();
    T highest_seen = std::numeric_limits<T>::lowest();
#ifdef AMREX_USE_OMP
#pragma omp parallel for reduction(min:lowest_seen) reduction(max:highest_seen)
#endif
    for (N idx = 0; idx < n; ++idx) {
        T sample = f(idx);
        lowest_seen = std::min(lowest_seen, sample);
        highest_seen = std::max(highest_seen, sample);
    }
    return std::pair<T,T>(lowest_seen, highest_seen);
}


// Host implementation: returns {min, max} of v[0..n-1] by delegating to the
// functor overload of Reduce::MinMax (T is passed explicitly because it cannot
// be deduced from the accessor).
template <typename T, std::integral N>
std::pair<T,T> MinMax (N n, T const* v)
{
    auto const element_at = [=] (N i) -> T { return v[i]; };
    return Reduce::MinMax<T>(n, element_at);
}


// Host implementation: true when pred is satisfied by at least one element of
// the range [v, v+n); false for an empty range.
template <typename T, std::integral N, typename P>
bool AnyOf (N n, T const* v, P const& pred)
{
    T const* const last = v + n;
    return std::find_if(v, last, pred) != last;
}


// Host implementation: visits the cells produced by box.iterator() in order
// and returns true at the first cell for which pred (invoked through
// Reduce::detail::call_f_intvect) is true; returns false if no cell matches.
template <typename P, int dim>
bool AnyOf (BoxND<dim> const& box, P const& pred)
{
    bool found = false;
    for (auto cell : box.iterator()) { // NOLINT(readability-use-anyofallof)
        if (Reduce::detail::call_f_intvect(pred, cell)) {
            found = true;
            break;
        }
    }
    return found;
}


}


#endif


// Returns a GpuTuple<Ts...> that is value-initialized and then passed through
// Reduce::detail::for_each_init for the operations Ps.... Both parameters are
// used only as type tags (here: a ReduceOps<Ps...>).
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, ReduceOps<Ps...>) noexcept
{
    using Tuple = GpuTuple<Ts...>;
    Tuple identity{};
    Reduce::detail::for_each_init<0, Tuple, Ps...>(identity);
    return identity;
}


// Returns a GpuTuple<Ts...> that is value-initialized and then passed through
// Reduce::detail::for_each_init for the operations Ps.... Both parameters are
// used only as type tags (here: a TypeList<Ps...>).
template <typename... Ts, typename... Ps>
AMREX_GPU_HOST_DEVICE
constexpr GpuTuple<Ts...>
IdentityTuple (GpuTuple<Ts...>, TypeList<Ps...>) noexcept
{
    using Tuple = GpuTuple<Ts...>;
    Tuple identity{};
    Reduce::detail::for_each_init<0, Tuple, Ps...>(identity);
    return identity;
}


template <typename Ops, typename Ts>

class ReducerImpl;


// Implementation base of Reducer: owns a ReduceOps<Ops...> and the
// ReduceData<Ts...> bound to it. Requires non-empty operation and type lists
// of equal length.
template <typename... Ops, typename... Ts>
class ReducerImpl<TypeList<Ops...>, TypeList<Ts...>>
{
public:
    static_assert(sizeof...(Ops) > 0);
    static_assert(sizeof...(Ts) > 0);
    static_assert(sizeof...(Ops) == sizeof...(Ts));

    // m_reduce_op is declared (and therefore constructed) before
    // m_reduce_data, which takes a reference to it.
    ReducerImpl () : m_reduce_data(m_reduce_op) {}

protected:
    using Result_t = GpuTuple<Ts...>;

    ReduceOps<Ops...> m_reduce_op;
    ReduceData<Ts...> m_reduce_data;
};


// Convenience wrapper bundling a ReduceOps and its ReduceData (both held by
// the ReducerImpl base). Ops and Ts are converted with ToTypeList_t. Every
// eval() overload forwards to the matching ReduceOps::eval with the stored
// ReduceData inserted; getResult() returns the reduced tuple. Neither copyable
// nor movable.
template <typename Ops, typename Ts>
class Reducer
    : public ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>
{
    using Base = ReducerImpl<ToTypeList_t<Ops>, ToTypeList_t<Ts>>;

public:

    // Type of the reduction result (the base's Result_t).
    using Result_t = typename Base::Result_t;
    // Number of components in Result_t.
    static constexpr int size = GpuTupleSize<Result_t>::value;

    Reducer () = default;
    ~Reducer () = default;

    Reducer (Reducer const&) = delete;
    Reducer (Reducer &&) = delete;
    void operator= (Reducer const&) = delete;
    void operator= (Reducer &&) = delete;

    // Reduction over a box; f is callable with (int,int,int) or an IntVectND.
    template <typename F, int dim>
    requires (IsCallable<F, int, int, int>::value ||
              IsCallable<F, IntVectND<dim>>::value)
    void eval (BoxND<dim> const& box, F&& f)
    {
        Base::m_reduce_op.eval(box, Base::m_reduce_data, std::forward<F>(f));
    }

    // Reduction over a box and ncomp components; f is callable with
    // (int,int,int,int) or (IntVectND, int).
    template <typename F, int dim>
    requires (IsCallable<F, int, int, int, int>::value ||
              IsCallable<F, IntVectND<dim>, int>::value)
    void eval (BoxND<dim> const& box, int ncomp, F&& f)
    {
        Base::m_reduce_op.eval(box, ncomp, Base::m_reduce_data, std::forward<F>(f));
    }

    // Reduction over a FabArray-like container with nghost ghost cells; f is
    // callable with four ints.
    template <FabArrayType MF, typename F>
    requires (IsCallable<F, int, int, int, int>::value)
    void eval (MF const& mf, IntVect const& nghost, F && f)
    {
        Base::m_reduce_op.eval(mf, nghost, Base::m_reduce_data, std::forward<F>(f));
    }

    // Reduction over a FabArray-like container with nghost ghost cells and
    // ncomp components; f is callable with five ints.
    template <FabArrayType MF, typename F>
    requires (IsCallable<F, int, int, int, int, int>::value)
    void eval (MF const& mf, IntVect const& nghost, int ncomp, F && f)
    {
        Base::m_reduce_op.eval(mf, nghost, ncomp, Base::m_reduce_data, std::forward<F>(f));
    }

    // Reduction driven by a single argument n; f is callable with an N.
    template <typename N, typename F>
    requires (IsCallable<F, N>::value)
    void eval (N n, F && f)
    {
        Base::m_reduce_op.eval(n, Base::m_reduce_data, std::forward<F>(f));
    }

    // Final reduction result, obtained from the stored ReduceData.
    [[nodiscard]] Result_t getResult ()
    {
        return Base::m_reduce_data.value(Base::m_reduce_op);
    }

};


}


#endif

AMReX_Arena.H
Memory arena base class and global arena accessors.

AMREX_ALWAYS_ASSERT_WITH_MESSAGE
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49

AMREX_ASSERT
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38

AMReX_Concepts.H

AMREX_FORCE_INLINE
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:124

AMREX_GPU_MAX_STREAMS
#define AMREX_GPU_MAX_STREAMS
Definition AMReX_GpuDevice.H:21

AMREX_LAUNCH_KERNEL
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37

AMREX_GPU_DEVICE
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18

AMREX_GPU_HOST_DEVICE
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20

AMReX_Gpu.H

pdst
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1130

AMReX_MFIter.H

AMReX_OpenMP.H

AMReX_TypeList.H

AMReX_ValLocPair.H

amrex::Arena::free
virtual void free(void *pt)=0
Free a previously allocated block pointed to by pt.

amrex::BoxND
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:54

amrex::BoxND::numPts
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:385

amrex::BoxND::ixType
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:148

amrex::GpuTuple
GPU-compatible tuple.
Definition AMReX_Tuple.H:98

amrex::Gpu::Device::streamIndex
static int streamIndex(gpuStream_t s=gpuStream()) noexcept
Definition AMReX_GpuDevice.cpp:710

amrex::Gpu::Device::usingExternalStream
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:837

amrex::IndexTypeND
Cell-Based or Node-Based Indices.
Definition AMReX_IndexType.H:36

amrex::IntVectND< 3 >

amrex::MFIter
Iterator for looping over tiles and boxes of amrex::FabArray based containers.
Definition AMReX_MFIter.H:88

amrex::MFIter::isValid
bool isValid() const noexcept
Is the iterator valid i.e. is it associated with a FAB?
Definition AMReX_MFIter.H:172

amrex::PODVector
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308

amrex::PODVector::data
T * data() noexcept
Definition AMReX_PODVector.H:666

amrex::ReduceData
Definition AMReX_Reduce.H:438

amrex::ReduceData::~ReduceData
~ReduceData()
Definition AMReX_Reduce.H:460

amrex::ReduceData::maxStreamIndex
int maxStreamIndex() const
Definition AMReX_Reduce.H:500

amrex::ReduceData::value
Type value()
Definition AMReX_Reduce.H:473

amrex::ReduceData::updateMaxStreamIndex
void updateMaxStreamIndex(gpuStream_t const &s)
Definition AMReX_Reduce.H:501

amrex::ReduceData::nBlocks
int & nBlocks(gpuStream_t const &s)
Definition AMReX_Reduce.H:496

amrex::ReduceData::ReduceData
ReduceData(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:443

amrex::ReduceData::markValueCalled
void markValueCalled() noexcept
Definition AMReX_Reduce.H:505

amrex::ReduceData::devicePtr
Type * devicePtr(gpuStream_t const &s)
Definition AMReX_Reduce.H:489

amrex::ReduceData::value
Type value(ReduceOps< Ps... > &reduce_op)
Definition AMReX_Reduce.H:481

amrex::ReduceData::devicePtr
Type * devicePtr()
Definition AMReX_Reduce.H:488

amrex::ReduceData::nBlocks
GpuArray< int, 8 > & nBlocks()
Definition AMReX_Reduce.H:495

amrex::ReduceData::ReduceData
ReduceData(ReduceData< Ts... > const &)=delete

amrex::ReduceData::hostPtr
Type * hostPtr()
Definition AMReX_Reduce.H:493

amrex::ReduceData::maxBlocks
int maxBlocks() const
Definition AMReX_Reduce.H:498

amrex::ReduceData::ReduceData
ReduceData(ReduceData< Ts... > &&)=delete

amrex::ReduceOps
Definition AMReX_Reduce.H:597

amrex::ReduceOps::eval
void eval(BoxND< dim > const &box, N ncomp, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:788

amrex::ReduceOps::value
D::Type value(D &reduce_data)
Definition AMReX_Reduce.H:844

amrex::ReduceOps::eval
void eval(BoxND< dim > const &box, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:782

amrex::ReduceOps::eval
void eval(MF const &mf, IntVect const &nghost, D &reduce_data, F &&f)
Definition AMReX_Reduce.H:731

amrex::ReduceOps::eval
void eval(N n, D &reduce_data, F const &f)
Definition AMReX_Reduce.H:794

amrex::Reducer
Class for local reductions (e.g., sum, min and max).
Definition AMReX_Reduce.H:1602

amrex::Reducer::Reducer
Reducer()=default

amrex::Reducer::getResult
Result_t getResult()
Get the final reduction result.
Definition AMReX_Reduce.H:1747

amrex::Reducer::Result_t
typename Base::Result_t Result_t
Reduction result type, GpuTuple<U...>, where U... are the types in Ts.
Definition AMReX_Reduce.H:1606

amrex::Reducer::~Reducer
~Reducer()=default

amrex::Reducer::eval
void eval(BoxND< dim > const &box, F &&f)
Reduction over a Box.
Definition AMReX_Reduce.H:1635

amrex::Long
amrex_long Long
Definition AMReX_INT.H:30

amrex::Reduce::Min
T Min(N n, T const *v, T init_val=std::numeric_limits< T >::max())
Compute the minimum of an array of values.
Definition AMReX_Reduce.H:962

amrex::Reduce::AnyOf
bool AnyOf(N n, T const *v, P const &pred)
Test whether any element in an array satisfies a unary predicate.
Definition AMReX_Reduce.H:1036

amrex::Reduce::MinMax
std::pair< T, T > MinMax(N n, T const *v)
Compute the minimum and maximum of an array of values.
Definition AMReX_Reduce.H:1008

amrex::Reduce::Max
T Max(N n, T const *v, T init_val=std::numeric_limits< T >::lowest())
Compute the maximum of an array of values.
Definition AMReX_Reduce.H:985

amrex::Reduce::Sum
T Sum(N n, T const *v, T init_val=0)
Compute the sum of an array of values.
Definition AMReX_Reduce.H:939

amrex::ubound
__host__ __device__ Dim3 ubound(Array4< T > const &a) noexcept
Return the inclusive upper bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1364

amrex::lbound
__host__ __device__ Dim3 lbound(Array4< T > const &a) noexcept
Return the inclusive lower bounds of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1350

amrex::grow
__host__ __device__ BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Return a copy of b grown uniformly by i cells in every direction.
Definition AMReX_Box.H:1326

amrex::The_Pinned_Arena
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:860

amrex::The_Arena
Arena * The_Arena()
Definition AMReX_Arena.cpp:820

amrex::ParallelAllReduce::Sum
void Sum(Gpu::DeviceVector< T > &v, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:34

amrex::min
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:31

amrex::max
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:53

amrex::Gpu::Atomic::Max
__host__ __device__ AMREX_FORCE_INLINE T Max(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:419

amrex::Gpu::Atomic::Min
__host__ __device__ AMREX_FORCE_INLINE T Min(T *const m, T const value) noexcept
Definition AMReX_GpuAtomic.H:356

amrex::Gpu::blockReduceLogicalOr
__device__ int blockReduceLogicalOr(int source) noexcept
Definition AMReX_GpuReduce.H:553

amrex::Gpu::blockReduceMax
__device__ T blockReduceMax(T source) noexcept
Definition AMReX_GpuReduce.H:452

amrex::Gpu::blockReduceMin
__device__ T blockReduceMin(T source) noexcept
Definition AMReX_GpuReduce.H:397

amrex::Gpu::blockReduceLogicalAnd
__device__ int blockReduceLogicalAnd(int source) noexcept
Definition AMReX_GpuReduce.H:505

amrex::Gpu::blockReduceSum
__device__ T blockReduceSum(T source) noexcept
Definition AMReX_GpuReduce.H:347

amrex
Definition AMReX_Amr.cpp:50

amrex::ignore_unused
__host__ __device__ void ignore_unused(const Ts &...)
No-op helper that marks variables as intentionally unused.
Definition AMReX.H:259

amrex::Order::F
@ F

amrex::RunOn::Gpu
@ Gpu

amrex::RunOn::Device
@ Device

amrex::gpuStream_t
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79

amrex::IdentityTuple
__host__ __device__ constexpr GpuTuple< Ts... > IdentityTuple(GpuTuple< Ts... >, ReduceOps< Ps... >) noexcept
Return a GpuTuple containing the identity element for each operation in ReduceOps....
Definition AMReX_Reduce.H:1491

amrex::Box
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:35

amrex::ToTypeList_t
typename ToTypeList< T >::type ToTypeList_t
Definition AMReX_TypeList.H:233

amrex::int
const int[]
Definition AMReX_BLProfiler.cpp:1664

amrex::For
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136

amrex::BoxIndexerND
Utility that maps flattened point indices back to IntVectND coordinates.
Definition AMReX_Box.H:2494

amrex::BoxIndexerND::intVect
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Convert flattened point index icell to its IntVectND coordinate.
Definition AMReX_Box.H:2517

amrex::BoxIndexerND::numPts
__host__ __device__ std::uint64_t numPts() const
Return the number of points covered by the indexed box.
Definition AMReX_Box.H:2552

amrex::GpuArray
Fixed-size array that can be used on GPU.
Definition AMReX_Array.H:52

amrex::GpuTupleSize
Definition AMReX_Tuple.H:127

amrex::Gpu::DeviceScalar
Definition AMReX_GpuMemory.H:57

amrex::Gpu::DeviceScalar::dataPtr
T * dataPtr()
Definition AMReX_GpuMemory.H:91

amrex::Gpu::DeviceScalar::dataValue
T dataValue() const
Definition AMReX_GpuMemory.H:93

amrex::Gpu::ExecutionConfig
Definition AMReX_GpuLaunch.H:121

amrex::Gpu::Handler
Definition AMReX_GpuTypes.H:86

amrex::Gpu::LaunchSafeGuard
Definition AMReX_GpuControl.H:127

amrex::Gpu::warpReduce
Definition AMReX_GpuReduce.H:284

amrex::IsCallable
Test if a given type T is callable with arguments of type Args...
Definition AMReX_TypeTraits.H:208

amrex::Plus
Definition AMReX_Functional.H:14

amrex::ReduceOpLogicalAnd
Definition AMReX_Reduce.H:375

amrex::ReduceOpLogicalAnd::local_update
__host__ __device__ void local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:396

amrex::ReduceOpLogicalAnd::init
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:399

amrex::ReduceOpLogicalAnd::parallel_update
__device__ void parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:387

amrex::ReduceOpLogicalOr
Definition AMReX_Reduce.H:404

amrex::ReduceOpLogicalOr::parallel_update
__device__ void parallel_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:416

amrex::ReduceOpLogicalOr::local_update
__host__ __device__ void local_update(T &d, T s) const noexcept
Definition AMReX_Reduce.H:425

amrex::ReduceOpLogicalOr::init
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:428

amrex::ReduceOpMax
Definition AMReX_Reduce.H:341

amrex::ReduceOpMax::init
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:366

amrex::ReduceOpMax::local_update
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:362

amrex::ReduceOpMax::parallel_update
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:353

amrex::ReduceOpMin
Definition AMReX_Reduce.H:307

amrex::ReduceOpMin::init
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:332

amrex::ReduceOpMin::parallel_update
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:319

amrex::ReduceOpMin::local_update
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:328

amrex::ReduceOpSum
Definition AMReX_Reduce.H:277

amrex::ReduceOpSum::parallel_update
__device__ void parallel_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:290

amrex::ReduceOpSum::local_update
__host__ __device__ void local_update(T &d, T const &s) const noexcept
Definition AMReX_Reduce.H:299

amrex::ReduceOpSum::init
constexpr void init(T &t) const noexcept
Definition AMReX_Reduce.H:302

amrex::TypeList
Struct for holding types.
Definition AMReX_TypeList.H:13