// Extracted from: docs_html/doxygen/AMReX__GpuLaunchFunctsG_8H_source.html

#ifndef AMREX_GPU_LAUNCH_FUNCTS_G_H_

#define AMREX_GPU_LAUNCH_FUNCTS_G_H_

#include <AMReX_Config.H>


namespace amrex {


namespace detail {


    // call_f_scalar_handler


    // Scalar-index dispatch, handler-free flavour: the callable is invoked as
    // f(i) and the Gpu::Handler argument is discarded. The trailing return
    // type removes this overload from consideration unless f(0) is a valid
    // expression.
    template <typename Func, typename Index>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (Func const& f, Index i, Gpu::Handler const& /*unused*/)
        noexcept -> decltype(f(0))
    {
        return f(i);
    }


    // Scalar-index dispatch, handler-aware flavour: the callable is invoked as
    // f(i, handler). The trailing return type removes this overload from
    // consideration unless f(0, Gpu::Handler{}) is a valid expression.
    template <typename Func, typename Index>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_scalar_handler (Func const& f, Index i, Gpu::Handler const& handler)
        noexcept -> decltype(f(0,Gpu::Handler{}))
    {
        return f(i, handler);
    }


    // call_f_intvect_inner


    // 1D overload: the single index is padded with two zeros, i.e. the call is
    // f(iv[0], 0, 0, args...). Only viable when f(0, 0, 0, args...) is
    // well-formed. The index_sequence tag is part of the signature but is not
    // used by this overload.
    template <typename Func, std::size_t...Idx, class...Extra>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Idx...> /*unused*/, Func const& f, IntVectND<1> iv, Extra...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        return f(iv[0], 0, 0, args...);
    }


    // 2D overload: the two indices are padded with one zero, i.e. the call is
    // f(iv[0], iv[1], 0, args...). Only viable when f(0, 0, 0, args...) is
    // well-formed. The index_sequence tag is part of the signature but is not
    // used by this overload.
    template <typename Func, std::size_t...Idx, class...Extra>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Idx...> /*unused*/, Func const& f, IntVectND<2> iv, Extra...args)
        noexcept -> decltype(f(0, 0, 0, args...))
    {
        return f(iv[0], iv[1], 0, args...);
    }


    // Generic overload for callables that accept the index vector as a single
    // object: the call is f(iv, args...). Only viable when that expression is
    // well-formed. The index_sequence tag is not used here.
    template <typename Func, int dim, std::size_t...Idx, class...Extra>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Idx...> /*unused*/, Func const& f, IntVectND<dim> iv, Extra...args)
        noexcept -> decltype(f(iv, args...))
    {
        return f(iv, args...);
    }


    // Generic overload for callables that take one argument per index: the
    // index vector is unpacked through the index_sequence pack, so the call is
    // f(iv[Idx]..., args...). Only viable when that expression is well-formed.
    template <typename Func, int dim, std::size_t...Idx, class...Extra>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_inner (std::index_sequence<Idx...>, Func const& f, IntVectND<dim> iv, Extra...args)
        noexcept -> decltype(f(iv[Idx]..., args...))
    {
        return f(iv[Idx]..., args...);
    }


    // call_f_intvect


    // Entry point for callables that take only the cell index: builds the
    // index_sequence<0..dim-1> tag and defers to call_f_intvect_inner, which
    // selects how iv is handed to f. SFINAE-disabled when no inner overload
    // accepts (f, iv).
    template <typename Func, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect (Func const& f, IntVectND<dim> iv)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        return call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv);
    }


    // call_f_intvect_engine


    // Entry point for callables that take the cell index plus a RandomEngine:
    // builds the index_sequence<0..dim-1> tag and defers to
    // call_f_intvect_inner with the engine as the trailing argument.
    template <typename Func, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_engine (Func const& f, IntVectND<dim> iv, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, engine))
    {
        return call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv, engine);
    }


    // call_f_intvect_handler


    // Cell-index dispatch, handler-free flavour: the Gpu::Handler argument is
    // discarded and f is called with the index only (via
    // call_f_intvect_inner). Viable only when the inner call without a handler
    // is well-formed.
    template <typename Func, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (Func const& f, IntVectND<dim> iv, Gpu::Handler const& /*unused*/)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
    {
        return call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv);
    }


    // Cell-index dispatch, handler-aware flavour: f is called with the index
    // followed by the handler (via call_f_intvect_inner). Viable only when the
    // inner call with a trailing Gpu::Handler is well-formed.
    template <typename Func, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_handler (Func const& f, IntVectND<dim> iv, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, Gpu::Handler{}))
    {
        return call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv, handler);
    }


    // call_f_intvect_ncomp


    // Calls f once per component at a fixed cell: for n = 0 .. ncomp-1 the
    // inner dispatcher is invoked with (iv, n). Viable only when the inner
    // call with a trailing integer is well-formed.
    // NOTE(review): the declared return type is whatever that inner call
    // yields, but no value is ever returned here; this is only sound when f
    // returns void -- confirm against callers.
    template <typename Func, typename Comp, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp (Func const& f, IntVectND<dim> iv, Comp ncomp)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        Comp icomp = 0;
        while (icomp < ncomp) {
            call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv, icomp);
            ++icomp;
        }
    }


    // call_f_intvect_ncomp_engine


    // Calls f once per component at a fixed cell, passing the random engine
    // along: for n = 0 .. ncomp-1 the inner dispatcher is invoked with
    // (iv, n, engine). Viable only when the inner call with a trailing integer
    // and engine is well-formed.
    // NOTE(review): the declared return type is whatever that inner call
    // yields, but no value is ever returned here; this is only sound when f
    // returns void -- confirm against callers.
    template <typename Func, typename Comp, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_engine (Func const& f, IntVectND<dim> iv, Comp ncomp, RandomEngine engine)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, engine))
    {
        Comp icomp = 0;
        while (icomp < ncomp) {
            call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv, icomp, engine);
            ++icomp;
        }
    }


    // call_f_intvect_ncomp_handler


    // Per-component dispatch, handler-free flavour: the Gpu::Handler argument
    // is discarded and, for n = 0 .. ncomp-1, the inner dispatcher is invoked
    // with (iv, n). Viable only when the inner call without a handler is
    // well-formed.
    // NOTE(review): the declared return type is whatever that inner call
    // yields, but no value is ever returned here; this is only sound when f
    // returns void -- confirm against callers.
    template <typename Func, typename Comp, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (Func const& f, IntVectND<dim> iv, Comp ncomp, Gpu::Handler const& /*unused*/)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0))
    {
        Comp icomp = 0;
        while (icomp < ncomp) {
            call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv, icomp);
            ++icomp;
        }
    }


    // Per-component dispatch, handler-aware flavour: for n = 0 .. ncomp-1 the
    // inner dispatcher is invoked with (iv, n, handler). Viable only when the
    // inner call with a trailing integer and Gpu::Handler is well-formed.
    // NOTE(review): the declared return type is whatever that inner call
    // yields, but no value is ever returned here; this is only sound when f
    // returns void -- confirm against callers.
    template <typename Func, typename Comp, int dim>
    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
    auto call_f_intvect_ncomp_handler (Func const& f, IntVectND<dim> iv, Comp ncomp, Gpu::Handler const& handler)
        noexcept -> decltype(call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv, 0, Gpu::Handler{}))
    {
        Comp icomp = 0;
        while (icomp < ncomp) {
            call_f_intvect_inner(std::make_index_sequence<dim>{}, f, iv, icomp, handler);
            ++icomp;
        }
    }


}


#ifdef AMREX_USE_SYCL


// Submits f to the SYCL queue owned by `stream` as a single task.
// A sycl::exception raised by the submission is converted into amrex::Abort.
template <typename L>
void single_task (gpuStream_t stream, L const& f)
{
    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through instead of a by-value capture -- confirm
    // against its definition.
    detail::SyclKernelDevPtr<L> kernel_holder(f, stream);
    L const* staged_f = kernel_holder.template get<0>();
    amrex::ignore_unused(staged_f); // unused when the small-kernel branch is taken

    auto& sycl_queue = *(stream.queue);
    try {
        sycl_queue.submit([&] (sycl::handler& cgh) {
            if constexpr (!detail::is_big_kernel<L>()) {
                cgh.single_task(f);
            } else {
                cgh.single_task(*staged_f);
            }
        });
    } catch (sycl::exception const& err) {
        amrex::Abort(std::string("single_task: ")+err.what()+"!!!!!");
    }
}


template<typename L>

void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,

             gpuStream_t stream, L const& f)

{

    detail::SyclKernelDevPtr<L> skdp(f, stream);

    L const* pf = skdp.template get<0>();

    amrex::ignore_unused(pf);


    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;

    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)

        / sizeof(unsigned long long);

    auto& q = *(stream.queue);

    try {

        q.submit([&] (sycl::handler& h) {

            sycl::local_accessor<unsigned long long>

                shared_data(sycl::range<1>(shared_mem_numull), h);

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(nthreads_per_block)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                if constexpr (detail::is_big_kernel<L>()) {

                    (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});

                } else {

                    f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");

    }

}


template<typename L>

void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f)

{

    detail::SyclKernelDevPtr<L> skdp(f, stream);

    L const* pf = skdp.template get<0>();

    amrex::ignore_unused(pf);


    const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;

    auto& q = *(stream.queue);

    try {

        q.submit([&] (sycl::handler& h) {

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(nthreads_per_block)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                if constexpr (detail::is_big_kernel<L>()) {

                    (*pf)(item);

                } else {

                    f(item);

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");

    }

}


template <int MT, typename L>

void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,

             L const& f)

{

    detail::SyclKernelDevPtr<L> skdp(f, stream);

    L const* pf = skdp.template get<0>();

    amrex::ignore_unused(pf);


    const auto nthreads_total = MT * std::size_t(nblocks);

    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)

        / sizeof(unsigned long long);

    auto& q = *(stream.queue);

    try {

        q.submit([&] (sycl::handler& h) {

            sycl::local_accessor<unsigned long long>

                shared_data(sycl::range<1>(shared_mem_numull), h);

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(MT)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_work_group_size(MT)]]

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                if constexpr (detail::is_big_kernel<L>()) {

                    (*pf)(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});

                } else {

                    f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");

    }

}


template <int MT, typename L>

void launch (int nblocks, gpuStream_t stream, L const& f)

{

    detail::SyclKernelDevPtr<L> skdp(f, stream);

    L const* pf = skdp.template get<0>();

    amrex::ignore_unused(pf);


    const auto nthreads_total = MT * std::size_t(nblocks);

    auto& q = *(stream.queue);

    try {

        q.submit([&] (sycl::handler& h) {

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(MT)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_work_group_size(MT)]]

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                if constexpr (detail::is_big_kernel<L>()) {

                    (*pf)(item);

                } else {

                    f(item);

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");

    }

}


template<int MT, typename T, typename L>

void launch (T const& n, L const& f)

{

    if (amrex::isEmpty(n)) { return; }


    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());

    L const* pf = skdp.template get<0>();

    amrex::ignore_unused(pf);


    const auto ec = Gpu::makeExecutionConfig<MT>(n);

    const auto nthreads_per_block = ec.numThreads.x;

    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;

    auto& q = Gpu::Device::streamQueue();

    try {

        q.submit([&] (sycl::handler& h) {

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(nthreads_per_block)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_work_group_size(MT)]]

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {

                    if constexpr (detail::is_big_kernel<L>()) {

                        (*pf)(i);

                    } else {

                        f(i);

                    }

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");

    }

}


// Runs f for every index in [0, n) on the current stream queue.
// Returns immediately when amrex::isEmpty(n). Each work-item starts at its
// global id and advances by the global range until it passes n. f may take
// either (index) or (index, Gpu::Handler); call_f_scalar_handler picks the
// matching form. When info.hasReduction() is set, the handler additionally
// carries a work-group-local buffer of Gpu::Device::warp_size
// unsigned long long elements and an active-thread count.
// A sycl::exception raised by the submission is converted into amrex::Abort.
template <int MT, std::integral T, typename L>
void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f)
{
    if (amrex::isEmpty(n)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L> kernel_holder(f, Gpu::gpuStream());
    L const* staged_f = kernel_holder.template get<0>();
    amrex::ignore_unused(staged_f);

    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(n);
    const auto group_size = exec_cfg.numThreads.x;
    const auto global_size = std::size_t(group_size) * exec_cfg.numBlocks.x;
    auto& sycl_queue = Gpu::Device::streamQueue();
    try {
        if (!info.hasReduction()) {
            sycl_queue.submit([&] (sycl::handler& cgh) {
                const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                     sycl::range<1>(group_size));
                cgh.parallel_for(launch_range,
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    const std::size_t count = std::size_t(n);
                    const std::size_t stride = item.get_global_range(0);
                    for (std::size_t i = item.get_global_id(0); i < count; i += stride) {
                        if constexpr (detail::is_big_kernel<L>()) {
                            detail::call_f_scalar_handler(*staged_f, T(i), Gpu::Handler{&item});
                        } else {
                            detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item});
                        }
                    }
                });
            });
        } else {
            sycl_queue.submit([&] (sycl::handler& cgh) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), cgh);
                const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                     sycl::range<1>(group_size));
                cgh.parallel_for(launch_range,
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    auto local_ptr = shared_data.get_multi_ptr<sycl::access::decorated::yes>().get();
                    const std::size_t count = std::size_t(n);
                    const std::size_t stride = item.get_global_range(0);
                    for (std::size_t i = item.get_global_id(0); i < count; i += stride) {
                        // (count - i + local id) is the number of indices left
                        // starting from this group's first work-item on this
                        // pass; capped at the group size.
                        int n_active_threads = amrex::min(count-i+item.get_local_id(0),
                                                          item.get_local_range(0));
                        if constexpr (detail::is_big_kernel<L>()) {
                            detail::call_f_scalar_handler(*staged_f, T(i),
                                                          Gpu::Handler{&item, local_ptr, n_active_threads});
                        } else {
                            detail::call_f_scalar_handler(f, T(i),
                                                          Gpu::Handler{&item, local_ptr, n_active_threads});
                        }
                    }
                });
            });
        }
    } catch (sycl::exception const& err) {
        amrex::Abort(std::string("ParallelFor: ")+err.what()+"!!!!!");
    }
}


// Runs f for every cell of `box` on the current stream queue.
// Returns immediately when amrex::isEmpty(box). Cells are enumerated through
// a BoxIndexerND: each work-item starts at its global id and advances by the
// global range until it passes indexer.numPts(), converting every linear
// cell number to an index vector with indexer.intVect(). f may or may not
// take a trailing Gpu::Handler; call_f_intvect_handler picks the matching
// form. When info.hasReduction() is set, the handler additionally carries a
// work-group-local buffer of Gpu::Device::warp_size unsigned long long
// elements and an active-thread count.
// A sycl::exception raised by the submission is converted into amrex::Abort.
template <int MT, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L> kernel_holder(f, Gpu::gpuStream());
    L const* staged_f = kernel_holder.template get<0>();
    amrex::ignore_unused(staged_f);

    const BoxIndexerND<dim> indexer(box);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto group_size = exec_cfg.numThreads.x;
    const auto global_size = std::size_t(group_size) * exec_cfg.numBlocks.x;
    auto& sycl_queue = Gpu::Device::streamQueue();
    try {
        if (!info.hasReduction()) {
            sycl_queue.submit([&] (sycl::handler& cgh) {
                const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                     sycl::range<1>(group_size));
                cgh.parallel_for(launch_range,
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    const std::uint64_t stride = item.get_global_range(0);
                    for (std::uint64_t icell = item.get_global_id(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        if constexpr (detail::is_big_kernel<L>()) {
                            detail::call_f_intvect_handler(*staged_f,iv,Gpu::Handler{&item});
                        } else {
                            detail::call_f_intvect_handler(f,iv,Gpu::Handler{&item});
                        }
                    }
                });
            });
        } else {
            sycl_queue.submit([&] (sycl::handler& cgh) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), cgh);
                const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                     sycl::range<1>(group_size));
                cgh.parallel_for(launch_range,
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    auto local_ptr = shared_data.get_multi_ptr<sycl::access::decorated::yes>().get();
                    const std::uint64_t stride = item.get_global_range(0);
                    for (std::uint64_t icell = item.get_global_id(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        // (numPts - icell + local id) is the number of cells
                        // left starting from this group's first work-item on
                        // this pass; capped at the group size.
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        if constexpr (detail::is_big_kernel<L>()) {
                            detail::call_f_intvect_handler(*staged_f, iv,
                                                           Gpu::Handler{&item, local_ptr, n_active_threads});
                        } else {
                            detail::call_f_intvect_handler(f, iv,
                                                           Gpu::Handler{&item, local_ptr, n_active_threads});
                        }
                    }
                });
            });
        }
    } catch (sycl::exception const& err) {
        amrex::Abort(std::string("ParallelFor: ")+err.what()+"!!!!!");
    }
}


// Runs f for every (cell, component) pair of `box` x [0, ncomp) on the current
// stream queue. Returns immediately when amrex::isEmpty(box). Work is
// distributed over cells only: each work-item starts at its global id and
// advances by the global range until it passes indexer.numPts(), and for
// every cell it owns call_f_intvect_ncomp_handler loops over the components.
// f may or may not take a trailing Gpu::Handler. When info.hasReduction() is
// set, the handler additionally carries a work-group-local buffer of
// Gpu::Device::warp_size unsigned long long elements and an active-thread
// count. A sycl::exception raised by the submission is converted into
// amrex::Abort.
template <int MT, std::integral T, typename L, int dim>
void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L> kernel_holder(f, Gpu::gpuStream());
    L const* staged_f = kernel_holder.template get<0>();
    amrex::ignore_unused(staged_f);

    const BoxIndexerND<dim> indexer(box);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(box.numPts());
    const auto group_size = exec_cfg.numThreads.x;
    const auto global_size = std::size_t(group_size) * exec_cfg.numBlocks.x;
    auto& sycl_queue = Gpu::Device::streamQueue();
    try {
        if (!info.hasReduction()) {
            sycl_queue.submit([&] (sycl::handler& cgh) {
                const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                     sycl::range<1>(group_size));
                cgh.parallel_for(launch_range,
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    const std::uint64_t stride = item.get_global_range(0);
                    for (std::uint64_t icell = item.get_global_id(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        if constexpr (detail::is_big_kernel<L>()) {
                            detail::call_f_intvect_ncomp_handler(*staged_f,iv,ncomp,Gpu::Handler{&item});
                        } else {
                            detail::call_f_intvect_ncomp_handler(f,iv,ncomp,Gpu::Handler{&item});
                        }
                    }
                });
            });
        } else {
            sycl_queue.submit([&] (sycl::handler& cgh) {
                sycl::local_accessor<unsigned long long>
                    shared_data(sycl::range<1>(Gpu::Device::warp_size), cgh);
                const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                     sycl::range<1>(group_size));
                cgh.parallel_for(launch_range,
                [=] (sycl::nd_item<1> item)
                [[sycl::reqd_work_group_size(MT)]]
                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                {
                    auto local_ptr = shared_data.get_multi_ptr<sycl::access::decorated::yes>().get();
                    const std::uint64_t stride = item.get_global_range(0);
                    for (std::uint64_t icell = item.get_global_id(0);
                         icell < indexer.numPts(); icell += stride) {
                        auto iv = indexer.intVect(icell);
                        // (numPts - icell + local id) is the number of cells
                        // left starting from this group's first work-item on
                        // this pass; capped at the group size.
                        int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                          std::uint64_t(item.get_local_range(0)));
                        if constexpr (detail::is_big_kernel<L>()) {
                            detail::call_f_intvect_ncomp_handler(*staged_f, iv, ncomp,
                                                                 Gpu::Handler{&item, local_ptr, n_active_threads});
                        } else {
                            detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                                                                 Gpu::Handler{&item, local_ptr, n_active_threads});
                        }
                    }
                });
            });
        }
    } catch (sycl::exception const& err) {
        amrex::Abort(std::string("ParallelFor: ")+err.what()+"!!!!!");
    }
}


// Runs f(i, engine) for every index i in [0, n) on the current stream queue,
// giving each work-item a RandomEngine.
// Returns immediately when amrex::isEmpty(n). The number of work-groups is
// capped at Gpu::Device::maxBlocksPerLaunch(); each work-item starts at its
// global id and advances by the global range until it passes n. A work-item
// loads its engine state from the descriptor's accessor by global id before
// the loop and stores it back afterwards. The call blocks on
// q.wait_and_throw() before returning.
// A sycl::exception is converted into amrex::Abort; the message is prefixed
// with this function's own name so the failing API can be identified.
template <std::integral T, typename L>
void ParallelForRNG (T n, L const& f)
{
    if (amrex::isEmpty(n)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
    L const* pf = skdp.template get<0>();
    amrex::ignore_unused(pf);

    const auto ec = Gpu::ExecutionConfig(n);
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                // Work on a private copy of this work-item's engine state.
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) {
                    if constexpr (detail::is_big_kernel<L>()) {
                        (*pf)(T(i),rand_eng);
                    } else {
                        f(T(i),rand_eng);
                    }
                }
                // Write the state back for later launches.
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelForRNG: ")+ex.what()+"!!!!!");
    }
}


// Runs f for every cell of `box` on the current stream queue, giving each
// work-item a RandomEngine as the trailing argument
// (via call_f_intvect_engine).
// Returns immediately when amrex::isEmpty(box). The number of work-groups is
// capped at Gpu::Device::maxBlocksPerLaunch(); each work-item starts at its
// global id and advances by the global range until it passes
// indexer.numPts(). A work-item loads its engine state from the descriptor's
// accessor by global id before the loop and stores it back afterwards. The
// call blocks on q.wait_and_throw() before returning.
// A sycl::exception is converted into amrex::Abort; the message is prefixed
// with this function's own name so the failing API can be identified.
template <typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
    L const* pf = skdp.template get<0>();
    amrex::ignore_unused(pf);

    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                // Work on a private copy of this work-item's engine state.
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    if constexpr (detail::is_big_kernel<L>()) {
                        detail::call_f_intvect_engine(*pf,iv,rand_eng);
                    } else {
                        detail::call_f_intvect_engine(f,iv,rand_eng);
                    }
                }
                // Write the state back for later launches.
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelForRNG: ")+ex.what()+"!!!!!");
    }
}


// Runs f for every (cell, component) pair of `box` x [0, ncomp) on the current
// stream queue, giving each work-item a RandomEngine as the trailing argument
// (via call_f_intvect_ncomp_engine).
// Returns immediately when amrex::isEmpty(box). Work is distributed over
// cells only; the component loop runs inside each work-item. The number of
// work-groups is capped at Gpu::Device::maxBlocksPerLaunch(); each work-item
// starts at its global id and advances by the global range until it passes
// indexer.numPts(). A work-item loads its engine state from the descriptor's
// accessor by global id before the loop and stores it back afterwards. The
// call blocks on q.wait_and_throw() before returning.
// A sycl::exception is converted into amrex::Abort; the message is prefixed
// with this function's own name so the failing API can be identified.
template <std::integral T, typename L, int dim>
void ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages a copy of f that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
    L const* pf = skdp.template get<0>();
    amrex::ignore_unused(pf);

    const BoxIndexerND<dim> indexer(box);
    const auto ec = Gpu::ExecutionConfig(box.numPts());
    const auto nthreads_per_block = ec.numThreads.x;
    const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
    auto& q = Gpu::Device::streamQueue();
    auto& engdescr = *(getRandEngineDescriptor());
    try {
        q.submit([&] (sycl::handler& h) {
            auto engine_acc = engdescr.get_access(h);
            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                             sycl::range<1>(nthreads_per_block)),
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const tid = item.get_global_id(0);
                // Work on a private copy of this work-item's engine state.
                auto engine = engine_acc.load(tid);
                RandomEngine rand_eng{&engine};
                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                     icell < indexer.numPts(); icell += stride) {
                    auto iv = indexer.intVect(icell);
                    if constexpr (detail::is_big_kernel<L>()) {
                        detail::call_f_intvect_ncomp_engine(*pf,iv,ncomp,rand_eng);
                    } else {
                        detail::call_f_intvect_ncomp_engine(f,iv,ncomp,rand_eng);
                    }
                }
                // Write the state back for later launches.
                engine_acc.store(engine, tid);
            });
        });
        q.wait_and_throw(); // because next launch might be on a different queue
    } catch (sycl::exception const& ex) {
        amrex::Abort(std::string("ParallelForRNG: ")+ex.what()+"!!!!!");
    }
}


// Fused launch over two boxes: runs f1 on every cell of box1 and f2 on every
// cell of box2 in a single kernel on the current stream queue.
// Returns immediately only when BOTH boxes are empty. The launch is sized for
// the larger of the two point counts; each work-item starts at its global id
// and advances by the global range, and for every linear cell number it
// calls f1 and/or f2 depending on which indexers that number is valid for.
// The KernelInfo argument is not used.
// A sycl::exception raised by the submission is converted into amrex::Abort.
template <int MT, typename L1, typename L2, int dim>
void ParallelFor (Gpu::KernelInfo const& /*info*/, BoxND<dim> const& box1, BoxND<dim> const& box2, L1 const& f1, L2 const& f2)
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }

    // NOTE(review): SyclKernelDevPtr presumably stages copies of f1/f2 that
    // "big" kernels are run through -- confirm against its definition.
    detail::SyclKernelDevPtr<L1,L2> kernel_holder(f1, f2, Gpu::gpuStream());
    L1 const* staged_f1 = kernel_holder.template get<0>();
    L2 const* staged_f2 = kernel_holder.template get<1>();
    amrex::ignore_unused(staged_f1,staged_f2);

    const BoxIndexerND<dim> indexer1(box1);
    const BoxIndexerND<dim> indexer2(box2);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(), box2.numPts()));
    const auto group_size = exec_cfg.numThreads.x;
    const auto global_size = std::size_t(group_size) * exec_cfg.numBlocks.x;
    auto& sycl_queue = Gpu::Device::streamQueue();
    try {
        sycl_queue.submit([&] (sycl::handler& cgh) {
            const sycl::nd_range<1> launch_range(sycl::range<1>(global_size),
                                                 sycl::range<1>(group_size));
            cgh.parallel_for(launch_range,
            [=] (sycl::nd_item<1> item)
            [[sycl::reqd_work_group_size(MT)]]
            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
            {
                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
                std::uint64_t const stride = item.get_global_range(0);
                for (std::uint64_t icell = item.get_global_id(0); icell < ncells; icell += stride) {
                    if (icell < indexer1.numPts()) {
                        auto iv = indexer1.intVect(icell);
                        if constexpr (detail::is_big_kernel<L1,L2>()) {
                            detail::call_f_intvect(*staged_f1,iv);
                        } else {
                            detail::call_f_intvect(f1,iv);
                        }
                    }
                    if (icell < indexer2.numPts()) {
                        auto iv = indexer2.intVect(icell);
                        if constexpr (detail::is_big_kernel<L1,L2>()) {
                            detail::call_f_intvect(*staged_f2,iv);
                        } else {
                            detail::call_f_intvect(f2,iv);
                        }
                    }
                }
            });
        });
    } catch (sycl::exception const& err) {
        amrex::Abort(std::string("ParallelFor: ")+err.what()+"!!!!!");
    }
}


// SYCL backend: apply three functors to three boxes with one kernel submission.
//
// Returns immediately when all three boxes are empty. Otherwise a 1-D
// nd_range kernel is submitted to Gpu::Device::streamQueue(). The launch size
// comes from Gpu::makeExecutionConfig<MT> evaluated on the largest of the
// three point counts. Inside the kernel each work-item walks linear cell
// indices and, for every index that lies below a box's point count, converts
// it to an IntVect with that box's indexer and calls the matching functor
// through detail::call_f_intvect.
//
// When detail::is_big_kernel<L1,L2,L3>() is true, the functors are invoked
// through the pointers handed out by SyclKernelDevPtr (presumably copies that
// live on the device -- confirm against detail::SyclKernelDevPtr); otherwise
// the by-value captured f1/f2/f3 are used.
//
// A sycl::exception thrown during submission is converted to amrex::Abort.
template <int MT, typename L1, typename L2, typename L3, int dim>

void ParallelFor (Gpu::KernelInfo const& /*info*/,

                  BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,

                  L1 const& f1, L2 const& f2, L3 const& f3)

{

    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }


    detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3, Gpu::gpuStream());

    L1 const* pf1 = skdp.template get<0>();

    L2 const* pf2 = skdp.template get<1>();

    L3 const* pf3 = skdp.template get<2>();

    // The pointers are only referenced in the is_big_kernel branch below.
    amrex::ignore_unused(pf1,pf2,pf3);


    const BoxIndexerND<dim> indexer1(box1);

    const BoxIndexerND<dim> indexer2(box2);

    const BoxIndexerND<dim> indexer3(box3);

    // Size the launch for the box with the most points.
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));

    const auto nthreads_per_block = ec.numThreads.x;

    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;

    auto& q = Gpu::Device::streamQueue();

    try {

        q.submit([&] (sycl::handler& h) {

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(nthreads_per_block)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_work_group_size(MT)]]

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());

                // Each work-item starts at its global id and advances by the global range.
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);

                     icell < ncells; icell += stride) {

                    // Box 1: only indices below its own point count are valid.
                    if (icell < indexer1.numPts()) {

                        auto iv = indexer1.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2,L3>()) {

                            detail::call_f_intvect(*pf1,iv);

                        } else {

                            detail::call_f_intvect(f1,iv);

                        }

                    }

                    // Box 2.
                    if (icell < indexer2.numPts()) {

                        auto iv = indexer2.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2,L3>()) {

                            detail::call_f_intvect(*pf2,iv);

                        } else {

                            detail::call_f_intvect(f2,iv);

                        }

                    }

                    // Box 3.
                    if (icell < indexer3.numPts()) {

                        auto iv = indexer3.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2,L3>()) {

                            detail::call_f_intvect(*pf3,iv);

                        } else {

                            detail::call_f_intvect(f3,iv);

                        }

                    }

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");

    }

}


// SYCL backend: apply two functors, each with its own component count, to two
// boxes with one kernel submission.
//
// Returns immediately when both boxes are empty. The launch size comes from
// Gpu::makeExecutionConfig<MT> evaluated on the larger of the two point
// counts. Inside the kernel each work-item walks linear cell indices; for
// every index below a box's point count the index is converted to an IntVect
// and passed, together with that box's ncomp, to
// detail::call_f_intvect_ncomp (presumably this loops over the components
// [0, ncomp) -- confirm against its definition).
//
// When detail::is_big_kernel<L1,L2>() is true the functors are reached
// through the pointers handed out by SyclKernelDevPtr; otherwise the by-value
// captured f1/f2 are used.
//
// A sycl::exception thrown during submission is converted to amrex::Abort.
template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>

void ParallelFor (Gpu::KernelInfo const& /*info*/,

                  BoxND<dim> const& box1, T1 ncomp1, L1 const& f1,

                  BoxND<dim> const& box2, T2 ncomp2, L2 const& f2)

{

    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }


    detail::SyclKernelDevPtr<L1,L2> skdp(f1, f2, Gpu::gpuStream());

    L1 const* pf1 = skdp.template get<0>();

    L2 const* pf2 = skdp.template get<1>();

    // The pointers are only referenced in the is_big_kernel branch below.
    amrex::ignore_unused(pf1,pf2);


    const BoxIndexerND<dim> indexer1(box1);

    const BoxIndexerND<dim> indexer2(box2);

    // Size the launch for the box with the most points.
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));

    const auto nthreads_per_block = ec.numThreads.x;

    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;

    auto& q = Gpu::Device::streamQueue();

    try {

        q.submit([&] (sycl::handler& h) {

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(nthreads_per_block)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_work_group_size(MT)]]

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());

                // Each work-item starts at its global id and advances by the global range.
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);

                     icell < ncells; icell += stride) {

                    // Box 1: only indices below its own point count are valid.
                    if (icell < indexer1.numPts()) {

                        auto iv = indexer1.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2>()) {

                            detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);

                        } else {

                            detail::call_f_intvect_ncomp(f1,iv,ncomp1);

                        }

                    }

                    // Box 2.
                    if (icell < indexer2.numPts()) {

                        auto iv = indexer2.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2>()) {

                            detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);

                        } else {

                            detail::call_f_intvect_ncomp(f2,iv,ncomp2);

                        }

                    }

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");

    }

}


// SYCL backend: apply three functors, each with its own component count, to
// three boxes with one kernel submission.
//
// Returns immediately when all three boxes are empty. The launch size comes
// from Gpu::makeExecutionConfig<MT> evaluated on the largest of the three
// point counts. Inside the kernel each work-item walks linear cell indices;
// for every index below a box's point count the index is converted to an
// IntVect and passed, together with that box's ncomp, to
// detail::call_f_intvect_ncomp (presumably this loops over the components
// [0, ncomp) -- confirm against its definition).
//
// When detail::is_big_kernel<L1,L2,L3>() is true the functors are reached
// through the pointers handed out by SyclKernelDevPtr; otherwise the by-value
// captured f1/f2/f3 are used.
//
// A sycl::exception thrown during submission is converted to amrex::Abort.
template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>

void ParallelFor (Gpu::KernelInfo const& /*info*/,

                  BoxND<dim> const& box1, T1 ncomp1, L1 const& f1,

                  BoxND<dim> const& box2, T2 ncomp2, L2 const& f2,

                  BoxND<dim> const& box3, T3 ncomp3, L3 const& f3)

{

    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }


    detail::SyclKernelDevPtr<L1,L2,L3> skdp(f1, f2, f3, Gpu::gpuStream());

    L1 const* pf1 = skdp.template get<0>();

    L2 const* pf2 = skdp.template get<1>();

    L3 const* pf3 = skdp.template get<2>();

    // The pointers are only referenced in the is_big_kernel branch below.
    amrex::ignore_unused(pf1,pf2,pf3);


    const BoxIndexerND<dim> indexer1(box1);

    const BoxIndexerND<dim> indexer2(box2);

    const BoxIndexerND<dim> indexer3(box3);

    // Size the launch for the box with the most points.
    const auto ec = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));

    const auto nthreads_per_block = ec.numThreads.x;

    const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x;

    auto& q = Gpu::Device::streamQueue();

    try {

        q.submit([&] (sycl::handler& h) {

            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),

                                             sycl::range<1>(nthreads_per_block)),

            [=] (sycl::nd_item<1> item)

            [[sycl::reqd_work_group_size(MT)]]

            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]

            {

                auto const ncells = amrex::max(indexer1.numPts(), indexer2.numPts(), indexer3.numPts());

                // Each work-item starts at its global id and advances by the global range.
                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);

                     icell < ncells; icell += stride) {

                    // Box 1: only indices below its own point count are valid.
                    if (icell < indexer1.numPts()) {

                        auto iv = indexer1.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2,L3>()) {

                            detail::call_f_intvect_ncomp(*pf1,iv,ncomp1);

                        } else {

                            detail::call_f_intvect_ncomp(f1,iv,ncomp1);

                        }

                    }

                    // Box 2.
                    if (icell < indexer2.numPts()) {

                        auto iv = indexer2.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2,L3>()) {

                            detail::call_f_intvect_ncomp(*pf2,iv,ncomp2);

                        } else {

                            detail::call_f_intvect_ncomp(f2,iv,ncomp2);

                        }

                    }

                    // Box 3.
                    if (icell < indexer3.numPts()) {

                        auto iv = indexer3.intVect(icell);

                        if constexpr (detail::is_big_kernel<L1,L2,L3>()) {

                            detail::call_f_intvect_ncomp(*pf3,iv,ncomp3);

                        } else {

                            detail::call_f_intvect_ncomp(f3,iv,ncomp3);

                        }

                    }

                }

            });

        });

    } catch (sycl::exception const& ex) {

        amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!");

    }

}


#else

// CUDA or HIP


// CUDA/HIP backend: launch `f` once on `stream` through AMREX_LAUNCH_KERNEL,
// then run AMREX_GPU_ERROR_CHECK().
// NOTE(review): the macro arguments (Gpu::Device::warp_size, 1, 1, 0) look
// like (launch bound, blocks, threads per block, shared-memory bytes) --
// confirm against the AMREX_LAUNCH_KERNEL definition.
template <typename L>


void single_task (gpuStream_t stream, L const& f)

{

    AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, f);

    AMREX_GPU_ERROR_CHECK();

}


// CUDA/HIP backend: launch `f` on `stream` through AMREX_LAUNCH_KERNEL with
// `nblocks`, MT and `shared_mem_bytes`, then run AMREX_GPU_ERROR_CHECK().
// NOTE(review): MT is passed both as the first macro argument and as the
// third; presumably these are the launch bound and the threads per block --
// confirm against the AMREX_LAUNCH_KERNEL definition.
template <int MT, typename L>


void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,

             L const& f)

{

    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, f);

    AMREX_GPU_ERROR_CHECK();

}


// CUDA/HIP backend: same launch as the overload that takes shared_mem_bytes,
// but with 0 passed in the shared-memory position of AMREX_LAUNCH_KERNEL.
// AMREX_GPU_ERROR_CHECK() runs after the launch.
template <int MT, typename L>


void launch (int nblocks, gpuStream_t stream, L const& f)

{

    AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, f);

    AMREX_GPU_ERROR_CHECK();

}


// CUDA/HIP backend: launch `f` on `stream` with run-time `nblocks`,
// `nthreads_per_block` and `shared_mem_bytes` through
// AMREX_LAUNCH_KERNEL_NOBOUND, then run AMREX_GPU_ERROR_CHECK().
// NOTE(review): unlike the MT overloads no compile-time thread count is
// passed; presumably the NOBOUND macro omits launch bounds -- confirm.
template<typename L>


void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,

             gpuStream_t stream, L const& f)

{

    AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes, stream, f);

    AMREX_GPU_ERROR_CHECK();

}


// CUDA/HIP backend: convenience overload that forwards to
// launch(nblocks, nthreads_per_block, shared_mem_bytes, stream, f) with 0 for
// the shared-memory size.
template<typename L>


void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept

{

    launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));

}


// CUDA/HIP backend: call f(i) for i in [0, n) on the stream returned by
// Gpu::gpuStream().
//
// Nothing is launched when amrex::isEmpty(n) is true. Otherwise one kernel is
// launched per entry returned by Gpu::makeNExecutionConfigs<MT>(n); every
// thread of a launch derives a launch-local index from blockIdx/threadIdx and
// calls f with that index shifted by the entry's start_idx, provided the
// launch-local index is below the count remaining from start_idx.
// AMREX_GPU_ERROR_CHECK() runs once after all launches.
template<int MT, std::integral T, typename L>
void launch (T const& n, L const& f)
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }

    const auto& launch_cfgs = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& cfg : launch_cfgs) {
        const T offset = T(cfg.start_idx);
        const T remaining = n - offset;
        AMREX_LAUNCH_KERNEL(MT, cfg.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            // The launch-local index will not overflow, even though nblocks*MT might.
            auto local_idx = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            // Threads past the end of this launch's range have no work.
            if (local_idx >= remaining) { return; }
            f(local_idx+offset);
        });
    }

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: call f once per cell of `box`, passing a BoxND built from
// that single cell (lower and upper corner are the same IntVect, index type
// taken from box.ixType()).
//
// Nothing is launched for an empty box. Otherwise one kernel is launched per
// entry returned by Gpu::makeNExecutionConfigs<MT>(box); each thread maps its
// linear cell number to an IntVect through a BoxIndexerND.
// AMREX_GPU_ERROR_CHECK() runs once after all launches.
template<int MT, int dim, typename L>
void launch (BoxND<dim> const& box, L const& f)
{
    if (box.isEmpty()) { return; }

    const auto& launch_cfgs = Gpu::makeNExecutionConfigs<MT>(box);
    const BoxIndexerND<dim> cell_indexer(box);
    const auto index_type = box.ixType();

    for (auto const& cfg : launch_cfgs) {
        const auto first_cell = std::uint64_t(cfg.start_idx);
        AMREX_LAUNCH_KERNEL(MT, cfg.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + first_cell;
            // Threads beyond the last cell of the box have no work.
            if (cell >= cell_indexer.numPts()) { return; }
            auto iv = cell_indexer.intVect(cell);
            f(BoxND<dim>(iv,iv,index_type));
        });
    }

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: 1-D ParallelFor over [0, n).
//
// Nothing is launched when amrex::isEmpty(n) is true. Otherwise one kernel is
// launched per entry returned by Gpu::makeNExecutionConfigs<MT>(n). Each
// in-range thread calls f through detail::call_f_scalar_handler with its
// global index and a Gpu::Handler built from
//   min(remaining - local_idx + threadIdx.x, MT),
// i.e. the number of indices still in range counted from the start of the
// thread's block, capped at MT.
// AMREX_GPU_ERROR_CHECK() runs once after all launches.
template <int MT, std::integral T, typename L>
requires (MaybeDeviceRunnable<L>::value)
void
ParallelFor (Gpu::KernelInfo const&, T n, L const& f)
{
    static_assert(sizeof(T) >= 2);
    if (amrex::isEmpty(n)) { return; }

    const auto& launch_cfgs = Gpu::makeNExecutionConfigs<MT>(n);
    for (auto const& cfg : launch_cfgs) {
        const T offset = T(cfg.start_idx);
        const T remaining = n - offset;
        AMREX_LAUNCH_KERNEL(MT, cfg.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            // The launch-local index will not overflow, even though nblocks*MT might.
            auto local_idx = T(MT)*T(blockIdx.x)+T(threadIdx.x);
            // Threads past the end of this launch's range have no work.
            if (local_idx >= remaining) { return; }
            auto const n_active = amrex::min(std::uint64_t(remaining-local_idx)+std::uint64_t(threadIdx.x),
                                             std::uint64_t(MT));
            detail::call_f_scalar_handler(f, local_idx+offset, Gpu::Handler(n_active));
        });
    }

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: ParallelFor over the cells of one box.
//
// Nothing is launched for an empty box. Otherwise one kernel is launched per
// entry returned by Gpu::makeNExecutionConfigs<MT>(box). Each in-range thread
// converts its linear cell number to an IntVect and calls f through
// detail::call_f_intvect_handler, together with a Gpu::Handler built from
//   min(numPts - cell + threadIdx.x, MT),
// i.e. the number of cells still in range counted from the start of the
// thread's block, capped at MT.
// AMREX_GPU_ERROR_CHECK() runs once after all launches.
template <int MT, typename L, int dim>
requires (MaybeDeviceRunnable<L>::value)
void
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    const BoxIndexerND<dim> cell_indexer(box);
    const auto& launch_cfgs = Gpu::makeNExecutionConfigs<MT>(box);

    for (auto const& cfg : launch_cfgs) {
        const auto first_cell = std::uint64_t(cfg.start_idx);
        AMREX_LAUNCH_KERNEL(MT, cfg.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + first_cell;
            // Threads beyond the last cell of the box have no work.
            if (cell >= cell_indexer.numPts()) { return; }
            auto iv = cell_indexer.intVect(cell);
            auto const n_active = amrex::min(cell_indexer.numPts()-cell+std::uint64_t(threadIdx.x),
                                             std::uint64_t(MT));
            detail::call_f_intvect_handler(f, iv, Gpu::Handler(n_active));
        });
    }

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: ParallelFor over the cells of one box with a component
// count.
//
// Nothing is launched for an empty box. Otherwise one kernel is launched per
// entry returned by Gpu::makeNExecutionConfigs<MT>(box). Each in-range thread
// converts its linear cell number to an IntVect and hands (f, iv, ncomp) plus
// a Gpu::Handler to detail::call_f_intvect_ncomp_handler (presumably that
// helper loops over the components [0, ncomp) -- confirm against its
// definition). The Handler is built from min(numPts - cell + threadIdx.x, MT).
// AMREX_GPU_ERROR_CHECK() runs once after all launches.
template <int MT, std::integral T, typename L, int dim>
requires (MaybeDeviceRunnable<L>::value)
void
ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    const BoxIndexerND<dim> cell_indexer(box);
    const auto& launch_cfgs = Gpu::makeNExecutionConfigs<MT>(box);

    for (auto const& cfg : launch_cfgs) {
        const auto first_cell = std::uint64_t(cfg.start_idx);
        AMREX_LAUNCH_KERNEL(MT, cfg.nblocks, MT, 0, Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept {
            auto cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x + first_cell;
            // Threads beyond the last cell of the box have no work.
            if (cell >= cell_indexer.numPts()) { return; }
            auto iv = cell_indexer.intVect(cell);
            auto const n_active = amrex::min(cell_indexer.numPts()-cell+std::uint64_t(threadIdx.x),
                                             std::uint64_t(MT));
            detail::call_f_intvect_ncomp_handler(f, iv, ncomp, Gpu::Handler(n_active));
        });
    }

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: call f(i, engine) for i in [0, n), giving every thread its
// own random-number state.
//
// Nothing is launched when amrex::isEmpty(n) is true. A single kernel is
// launched whose block count is the smaller of the ExecutionConfig's block
// count and Gpu::Device::maxBlocksPerLaunch(). Each thread wraps the state at
// position `thread_id` of the array returned by getRandState() in a
// RandomEngine and then visits indices thread_id, thread_id + stride, ...
// where stride = AMREX_GPU_MAX_THREADS * gridDim.x.
// The stream is synchronized before returning.
template <std::integral T, typename L>
requires (MaybeDeviceRunnable<L>::value)
void
ParallelForRNG (T n, L const& f)
{
    if (amrex::isEmpty(n)) { return; }

    randState_t* rng_states = getRandState();
    const auto exec_cfg = Gpu::ExecutionConfig(n);

    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(exec_cfg.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        const Long thread_id = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        const Long stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x;
        RandomEngine engine{rng_states + thread_id};
        for (Long i = thread_id; i < Long(n); i += stride) {
            f(T(i),engine);
        }
    });

    // Synchronize so that multiple streams do not use the RNG at the same time.
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: call f on every cell of `box` with a per-thread
// RandomEngine.
//
// Nothing is launched for an empty box. A single kernel is launched whose
// block count is the smaller of the ExecutionConfig's block count and
// Gpu::Device::maxBlocksPerLaunch(). Each thread wraps the state at position
// `thread_id` of the array returned by getRandState() in a RandomEngine and
// visits cells thread_id, thread_id + stride, ... (stride =
// AMREX_GPU_MAX_THREADS * gridDim.x), calling f through
// detail::call_f_intvect_engine.
// The stream is synchronized before returning.
template <typename L, int dim>
requires (MaybeDeviceRunnable<L>::value)
void
ParallelForRNG (BoxND<dim> const& box, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    randState_t* rng_states = getRandState();
    const BoxIndexerND<dim> cell_indexer(box);
    const auto exec_cfg = Gpu::ExecutionConfig(box.numPts());

    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(exec_cfg.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const thread_id = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        const std::uint64_t stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
        RandomEngine engine{rng_states + thread_id};
        for (std::uint64_t cell = thread_id; cell < cell_indexer.numPts(); cell += stride) {
            auto iv = cell_indexer.intVect(cell);
            detail::call_f_intvect_engine(f, iv, engine);
        }
    });

    // Synchronize so that multiple streams do not use the RNG at the same time.
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: call f on every cell of `box`, with a component count and
// a per-thread RandomEngine.
//
// Nothing is launched for an empty box. A single kernel is launched whose
// block count is the smaller of the ExecutionConfig's block count and
// Gpu::Device::maxBlocksPerLaunch(). Each thread wraps the state at position
// `thread_id` of the array returned by getRandState() in a RandomEngine and
// visits cells thread_id, thread_id + stride, ... (stride =
// AMREX_GPU_MAX_THREADS * gridDim.x), handing (f, iv, ncomp, engine) to
// detail::call_f_intvect_ncomp_engine.
// The stream is synchronized before returning.
template <std::integral T, typename L, int dim>
requires (MaybeDeviceRunnable<L>::value)
void
ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f)
{
    if (amrex::isEmpty(box)) { return; }

    randState_t* rng_states = getRandState();
    const BoxIndexerND<dim> cell_indexer(box);
    const auto exec_cfg = Gpu::ExecutionConfig(box.numPts());

    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                        amrex::min(exec_cfg.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                        exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const thread_id = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
        const std::uint64_t stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
        RandomEngine engine{rng_states + thread_id};
        for (std::uint64_t cell = thread_id; cell < cell_indexer.numPts(); cell += stride) {
            auto iv = cell_indexer.intVect(cell);
            detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
        }
    });

    // Synchronize so that multiple streams do not use the RNG at the same time.
    Gpu::streamSynchronize();
    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: apply f1 to the cells of box1 and f2 to the cells of box2
// in a single kernel launch.
//
// Nothing is launched when both boxes are empty. The launch is sized by
// Gpu::makeExecutionConfig<MT> for the larger of the two point counts. Each
// thread starts at MT*blockIdx.x + threadIdx.x and advances by MT*gridDim.x;
// for every linear index it calls the functor of each box whose point count
// exceeds that index.
template <int MT, typename L1, typename L2, int dim>
requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
void
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2)
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }

    const BoxIndexerND<dim> idx_a(box1);
    const BoxIndexerND<dim> idx_b(box2);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));

    AMREX_LAUNCH_KERNEL(MT, exec_cfg.numBlocks, exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const npts_a = idx_a.numPts();
        auto const npts_b = idx_b.numPts();
        auto const npts_max = std::max(npts_a, npts_b);
        const std::uint64_t stride = std::uint64_t(MT)*gridDim.x;
        for (std::uint64_t cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x;
             cell < npts_max; cell += stride) {
            if (cell < npts_a) {
                auto iv = idx_a.intVect(cell);
                detail::call_f_intvect(f1, iv);
            }
            if (cell < npts_b) {
                auto iv = idx_b.intVect(cell);
                detail::call_f_intvect(f2, iv);
            }
        }
    });

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: apply f1, f2 and f3 to the cells of box1, box2 and box3 in
// a single kernel launch.
//
// Nothing is launched when all three boxes are empty. The launch is sized by
// Gpu::makeExecutionConfig<MT> for the largest of the three point counts.
// Each thread starts at MT*blockIdx.x + threadIdx.x and advances by
// MT*gridDim.x; for every linear index it calls the functor of each box whose
// point count exceeds that index.
template <int MT, typename L1, typename L2, typename L3, int dim>
requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
          MaybeDeviceRunnable<L3>::value)
void
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,
             L1&& f1, L2&& f2, L3&& f3)
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }

    const BoxIndexerND<dim> idx_a(box1);
    const BoxIndexerND<dim> idx_b(box2);
    const BoxIndexerND<dim> idx_c(box3);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));

    AMREX_LAUNCH_KERNEL(MT, exec_cfg.numBlocks, exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const npts_a = idx_a.numPts();
        auto const npts_b = idx_b.numPts();
        auto const npts_c = idx_c.numPts();
        auto const npts_max = std::max({npts_a, npts_b, npts_c});
        const std::uint64_t stride = std::uint64_t(MT)*gridDim.x;
        for (std::uint64_t cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x;
             cell < npts_max; cell += stride) {
            if (cell < npts_a) {
                auto iv = idx_a.intVect(cell);
                detail::call_f_intvect(f1, iv);
            }
            if (cell < npts_b) {
                auto iv = idx_b.intVect(cell);
                detail::call_f_intvect(f2, iv);
            }
            if (cell < npts_c) {
                auto iv = idx_c.intVect(cell);
                detail::call_f_intvect(f3, iv);
            }
        }
    });

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: apply f1 (with ncomp1) to the cells of box1 and f2 (with
// ncomp2) to the cells of box2 in a single kernel launch.
//
// Nothing is launched when both boxes are empty. The launch is sized by
// Gpu::makeExecutionConfig<MT> for the larger of the two point counts. Each
// thread starts at MT*blockIdx.x + threadIdx.x and advances by MT*gridDim.x;
// for every linear index below a box's point count it passes the cell's
// IntVect and that box's component count to detail::call_f_intvect_ncomp.
template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>
requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)
void
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }

    const BoxIndexerND<dim> idx_a(box1);
    const BoxIndexerND<dim> idx_b(box2);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(std::max(box1.numPts(),box2.numPts()));

    AMREX_LAUNCH_KERNEL(MT, exec_cfg.numBlocks, exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const npts_a = idx_a.numPts();
        auto const npts_b = idx_b.numPts();
        auto const npts_max = std::max(npts_a, npts_b);
        const std::uint64_t stride = std::uint64_t(MT)*gridDim.x;
        for (std::uint64_t cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x;
             cell < npts_max; cell += stride) {
            if (cell < npts_a) {
                auto iv = idx_a.intVect(cell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (cell < npts_b) {
                auto iv = idx_b.intVect(cell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
        }
    });

    AMREX_GPU_ERROR_CHECK();
}


// CUDA/HIP backend: apply f1/f2/f3 (with ncomp1/ncomp2/ncomp3) to the cells of
// box1/box2/box3 in a single kernel launch.
//
// Nothing is launched when all three boxes are empty. The launch is sized by
// Gpu::makeExecutionConfig<MT> for the largest of the three point counts.
// Each thread starts at MT*blockIdx.x + threadIdx.x and advances by
// MT*gridDim.x; for every linear index below a box's point count it passes the
// cell's IntVect and that box's component count to
// detail::call_f_intvect_ncomp.
template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>
requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&
          MaybeDeviceRunnable<L3>::value)
void
ParallelFor (Gpu::KernelInfo const&,
             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
             BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
{
    if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }

    const BoxIndexerND<dim> idx_a(box1);
    const BoxIndexerND<dim> idx_b(box2);
    const BoxIndexerND<dim> idx_c(box3);
    const auto exec_cfg = Gpu::makeExecutionConfig<MT>(std::max({box1.numPts(),box2.numPts(),box3.numPts()}));

    AMREX_LAUNCH_KERNEL(MT, exec_cfg.numBlocks, exec_cfg.numThreads, 0, Gpu::gpuStream(),
    [=] AMREX_GPU_DEVICE () noexcept {
        auto const npts_a = idx_a.numPts();
        auto const npts_b = idx_b.numPts();
        auto const npts_c = idx_c.numPts();
        auto const npts_max = std::max({npts_a, npts_b, npts_c});
        const std::uint64_t stride = std::uint64_t(MT)*gridDim.x;
        for (std::uint64_t cell = std::uint64_t(MT)*blockIdx.x+threadIdx.x;
             cell < npts_max; cell += stride) {
            if (cell < npts_a) {
                auto iv = idx_a.intVect(cell);
                detail::call_f_intvect_ncomp(f1, iv, ncomp1);
            }
            if (cell < npts_b) {
                auto iv = idx_b.intVect(cell);
                detail::call_f_intvect_ncomp(f2, iv, ncomp2);
            }
            if (cell < npts_c) {
                auto iv = idx_c.intVect(cell);
                detail::call_f_intvect_ncomp(f3, iv, ncomp3);
            }
        }
    });

    AMREX_GPU_ERROR_CHECK();
}


#endif


// Convenience overload: runs `f` through single_task(stream, f) on the stream
// returned by Gpu::gpuStream().
template <typename L>

void single_task (L&& f) noexcept

{

    single_task(Gpu::gpuStream(), std::forward<L>(f));

}


// Convenience overload: forwards (n, f) to launch<MT> with
// MT = AMREX_GPU_MAX_THREADS.
template<typename T, typename L>

void launch (T const& n, L&& f) noexcept

{

    launch<AMREX_GPU_MAX_THREADS>(n, std::forward<L>(f));

}


// 1-D ParallelFor without an explicit thread count: forwards (info, n, f) to
// ParallelFor<MT> with MT = AMREX_GPU_MAX_THREADS.
template <std::integral T, typename L>

requires (MaybeDeviceRunnable<L>::value)

void


ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));

}


// Box ParallelFor without an explicit thread count: forwards (info, box, f)
// to ParallelFor<MT> with MT = AMREX_GPU_MAX_THREADS.
template <typename L, int dim>

requires (MaybeDeviceRunnable<L>::value)

void


ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));

}


// Box + component-count ParallelFor without an explicit thread count:
// forwards (info, box, ncomp, f) to ParallelFor<MT> with
// MT = AMREX_GPU_MAX_THREADS.
template <std::integral T, typename L, int dim>

requires (MaybeDeviceRunnable<L>::value)

void


ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));

}


// Two-box ParallelFor without an explicit thread count: forwards
// (info, box1, box2, f1, f2) to ParallelFor<MT> with
// MT = AMREX_GPU_MAX_THREADS.
template <typename L1, typename L2, int dim>

requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)

void


ParallelFor (Gpu::KernelInfo const& info,

             BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, std::forward<L1>(f1),

                                       std::forward<L2>(f2));

}


// Three-box ParallelFor without an explicit thread count: forwards
// (info, box1, box2, box3, f1, f2, f3) to ParallelFor<MT> with
// MT = AMREX_GPU_MAX_THREADS.
template <typename L1, typename L2, typename L3, int dim>

requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&

          MaybeDeviceRunnable<L3>::value)

void


ParallelFor (Gpu::KernelInfo const& info,

             BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,

             L1&& f1, L2&& f2, L3&& f3) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, box3, std::forward<L1>(f1),

                                       std::forward<L2>(f2), std::forward<L3>(f3));

}


// Two-box ParallelFor with per-box component counts and no explicit thread
// count: forwards all arguments to ParallelFor<MT> with
// MT = AMREX_GPU_MAX_THREADS.
template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>

requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value)

void


ParallelFor (Gpu::KernelInfo const& info,

             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,

             BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),

                                             box2, ncomp2, std::forward<L2>(f2));

}


// Three-box ParallelFor with per-box component counts and no explicit thread
// count: forwards all arguments to ParallelFor<MT> with
// MT = AMREX_GPU_MAX_THREADS.
template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>

requires (MaybeDeviceRunnable<L1>::value && MaybeDeviceRunnable<L2>::value &&

          MaybeDeviceRunnable<L3>::value)

void


ParallelFor (Gpu::KernelInfo const& info,

             BoxND<dim> const& box1, T1 ncomp1, L1&& f1,

             BoxND<dim> const& box2, T2 ncomp2, L2&& f2,

             BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, ncomp1, std::forward<L1>(f1),

                                             box2, ncomp2, std::forward<L2>(f2),

                                             box3, ncomp3, std::forward<L3>(f3));

}


// For(info, n, f): forwards to ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, f).
template <std::integral T, typename L>

void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, n,std::forward<L>(f));

}


// For<MT>(info, n, f): forwards to ParallelFor<MT>(info, n, f).
template <int MT, std::integral T, typename L>

void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept

{

    ParallelFor<MT>(info, n,std::forward<L>(f));

}


// For(info, box, f): forwards to ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, f).
template <typename L, int dim>

void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info, box,std::forward<L>(f));

}


// For<MT>(info, box, f): forwards to ParallelFor<MT>(info, box, f).
template <int MT, typename L, int dim>

void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f) noexcept

{

    ParallelFor<MT>(info, box,std::forward<L>(f));

}


// For(info, box, ncomp, f): forwards to
// ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, f).
template <std::integral T, typename L, int dim>

void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box,ncomp,std::forward<L>(f));

}


// For<MT>(info, box, ncomp, f): forwards to ParallelFor<MT>(info, box, ncomp, f).
template <int MT, std::integral T, typename L, int dim>

void For (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f) noexcept

{

    ParallelFor<MT>(info,box,ncomp,std::forward<L>(f));

}


// Two-box For: forwards (info, box1, box2, f1, f2) to
// ParallelFor<AMREX_GPU_MAX_THREADS>.
template <typename L1, typename L2, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));

}


// Two-box For<MT>: forwards (info, box1, box2, f1, f2) to ParallelFor<MT>.
template <int MT, typename L1, typename L2, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept

{

    ParallelFor<MT>(info,box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));

}


// Three-box For: forwards (info, box1, box2, box3, f1, f2, f3) to
// ParallelFor<AMREX_GPU_MAX_THREADS>.
template <typename L1, typename L2, typename L3, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,

          L1&& f1, L2&& f2, L3&& f3) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));

}


// Three-box For<MT>: forwards (info, box1, box2, box3, f1, f2, f3) to
// ParallelFor<MT>.
template <int MT, typename L1, typename L2, typename L3, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, BoxND<dim> const& box2, BoxND<dim> const& box3,

          L1&& f1, L2&& f2, L3&& f3) noexcept

{

    ParallelFor<MT>(info,box1,box2,box3,std::forward<L1>(f1),std::forward<L2>(f2),std::forward<L3>(f3));

}


// Two-box For with per-box component counts: forwards all arguments to
// ParallelFor<AMREX_GPU_MAX_THREADS>.
template <std::integral T1, std::integral T2, typename L1, typename L2, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,

          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));

}


// Two-box For<MT> with per-box component counts: forwards all arguments to
// ParallelFor<MT>.
template <int MT, std::integral T1, std::integral T2, typename L1, typename L2, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,

          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept

{

    ParallelFor<MT>(info,box1,ncomp1,std::forward<L1>(f1),box2,ncomp2,std::forward<L2>(f2));

}


// Three-box For with per-box component counts: forwards all arguments to
// ParallelFor<AMREX_GPU_MAX_THREADS>.
template <std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,

          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,

          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(info,

                box1,ncomp1,std::forward<L1>(f1),

                box2,ncomp2,std::forward<L2>(f2),

                box3,ncomp3,std::forward<L3>(f3));

}


// Three-box For<MT> with per-box component counts: forwards all arguments to
// ParallelFor<MT>.
template <int MT, std::integral T1, std::integral T2, std::integral T3, typename L1, typename L2, typename L3, int dim>

void For (Gpu::KernelInfo const& info,

          BoxND<dim> const& box1, T1 ncomp1, L1&& f1,

          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,

          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept

{

    ParallelFor<MT>(info,

                box1,ncomp1,std::forward<L1>(f1),

                box2,ncomp2,std::forward<L2>(f2),

                box3,ncomp3,std::forward<L3>(f3));

}


// ParallelFor(n, f): forwards to ParallelFor<AMREX_GPU_MAX_THREADS> with a
// default-constructed Gpu::KernelInfo.
template <std::integral T, typename L>


void ParallelFor (T n, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, n, std::forward<L>(f));

}


// ParallelFor<MT>(n, f): forwards to ParallelFor<MT> with a
// default-constructed Gpu::KernelInfo.
template <int MT, std::integral T, typename L>

void ParallelFor (T n, L&& f) noexcept

{

    ParallelFor<MT>(Gpu::KernelInfo{}, n, std::forward<L>(f));

}


// ParallelFor(box, f): forwards to ParallelFor<AMREX_GPU_MAX_THREADS> with a
// default-constructed Gpu::KernelInfo.
template <typename L, int dim>


void ParallelFor (BoxND<dim> const& box, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{}, box, std::forward<L>(f));

}


// ParallelFor<MT>(box, f): forwards to ParallelFor<MT> with a
// default-constructed Gpu::KernelInfo.
template <int MT, typename L, int dim>

void ParallelFor (BoxND<dim> const& box, L&& f) noexcept

{

    ParallelFor<MT>(Gpu::KernelInfo{}, box, std::forward<L>(f));

}


// ParallelFor(box, ncomp, f): forwards to ParallelFor<AMREX_GPU_MAX_THREADS>
// with a default-constructed Gpu::KernelInfo.
template <std::integral T, typename L, int dim>


void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));

}


// ParallelFor<MT>(box, ncomp, f): forwards to ParallelFor<MT> with a
// default-constructed Gpu::KernelInfo.
template <int MT, std::integral T, typename L, int dim>

void ParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept

{

    ParallelFor<MT>(Gpu::KernelInfo{},box,ncomp,std::forward<L>(f));

}


// Two-box ParallelFor(box1, box2, f1, f2): forwards to
// ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
// Gpu::KernelInfo.
template <typename L1, typename L2, int dim>

void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2, L1&& f1, L2&& f2) noexcept

{

    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},box1,box2,std::forward<L1>(f1),std::forward<L2>(f2));

}


/**
 * \brief Two-box ParallelFor with a caller-chosen MT.
 *
 * Delegates to ParallelFor<MT>(info, box1, box2, f1, f2) with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT, typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2,
                  L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, box2,
                    std::forward<L1>(f1),
                    std::forward<L2>(f2));
}


/**
 * \brief Three-box ParallelFor with default launch settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; the three boxes and three forwarded functors are
 * passed on unchanged.
 */
template <typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1,
                  BoxND<dim> const& box2,
                  BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, box2, box3,
                                       std::forward<L1>(f1),
                                       std::forward<L2>(f2),
                                       std::forward<L3>(f3));
}


/**
 * \brief Three-box ParallelFor with a caller-chosen MT.
 *
 * Delegates to ParallelFor<MT>(info, box1, box2, box3, f1, f2, f3) with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT, typename L1, typename L2, typename L3, int dim>
void ParallelFor (BoxND<dim> const& box1,
                  BoxND<dim> const& box2,
                  BoxND<dim> const& box3,
                  L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, box2, box3,
                    std::forward<L1>(f1),
                    std::forward<L2>(f2),
                    std::forward<L3>(f3));
}


/**
 * \brief ParallelFor over two (box, ncomp, functor) triples, default settings.
 *
 * Uses AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; the triples are passed on in the order given.
 */
template <std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief ParallelFor over two (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Delegates to the KernelInfo-taking ParallelFor<MT> overload with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT,
          std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief ParallelFor over three (box, ncomp, functor) triples, default settings.
 *
 * Uses AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; the triples are passed on in the order given and the
 * functors are perfectly forwarded.
 */
template <std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief ParallelFor over three (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Delegates to the KernelInfo-taking ParallelFor<MT> overload with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT,
          std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void ParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                  BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                  BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief For(n, f): same call as the default-settings ParallelFor(n, f).
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo, \p n and the perfectly forwarded \p f.
 */
template <std::integral T, typename L>
void For (T n, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       n,
                                       std::forward<L>(f));
}


/**
 * \brief For(n, f) with a caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo,
 * \p n and the perfectly forwarded \p f.
 */
template <int MT, std::integral T, typename L>
void For (T n, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    n,
                    std::forward<L>(f));
}


/**
 * \brief Single-box For with default launch settings.
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo, the box and the perfectly forwarded functor.
 */
template <typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box,
                                       std::forward<L>(f));
}


/**
 * \brief Single-box For with a caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo,
 * the box and the perfectly forwarded functor.
 */
template <int MT, typename L, int dim>
void For (BoxND<dim> const& box, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box,
                    std::forward<L>(f));
}


/**
 * \brief For over (box, ncomp) with default launch settings.
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo, passing box, ncomp and the forwarded functor.
 */
template <std::integral T, typename L, int dim>
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box, ncomp,
                                       std::forward<L>(f));
}


/**
 * \brief For over (box, ncomp) with a caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo,
 * passing box, ncomp and the forwarded functor.
 */
template <int MT, std::integral T, typename L, int dim>
void For (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box, ncomp,
                    std::forward<L>(f));
}


/**
 * \brief Two-box For with default launch settings.
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo, both boxes and both perfectly forwarded functors.
 */
template <typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2,
          L1&& f1, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, box2,
                                       std::forward<L1>(f1),
                                       std::forward<L2>(f2));
}


/**
 * \brief Two-box For with a caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo,
 * both boxes and both perfectly forwarded functors.
 */
template <int MT, typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, BoxND<dim> const& box2,
          L1&& f1, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, box2,
                    std::forward<L1>(f1),
                    std::forward<L2>(f2));
}


/**
 * \brief Three-box For with default launch settings.
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo, the three boxes and the three forwarded functors.
 */
template <typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1,
          BoxND<dim> const& box2,
          BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, box2, box3,
                                       std::forward<L1>(f1),
                                       std::forward<L2>(f2),
                                       std::forward<L3>(f3));
}


/**
 * \brief Three-box For with a caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo,
 * the three boxes and the three forwarded functors.
 */
template <int MT, typename L1, typename L2, typename L3, int dim>
void For (BoxND<dim> const& box1,
          BoxND<dim> const& box2,
          BoxND<dim> const& box3,
          L1&& f1, L2&& f2, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, box2, box3,
                    std::forward<L1>(f1),
                    std::forward<L2>(f2),
                    std::forward<L3>(f3));
}


/**
 * \brief For over two (box, ncomp, functor) triples, default settings.
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo; the triples are passed on in the order given.
 */
template <std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief For over two (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo;
 * the triples are passed on in the order given.
 */
template <int MT,
          std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief For over three (box, ncomp, functor) triples, default settings.
 *
 * Invokes ParallelFor<AMREX_GPU_MAX_THREADS> with a default-constructed
 * Gpu::KernelInfo; the triples are passed on in the order given.
 */
template <std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                       box1, ncomp1, std::forward<L1>(f1),
                                       box2, ncomp2, std::forward<L2>(f2),
                                       box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief For over three (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Invokes ParallelFor<MT> with a default-constructed Gpu::KernelInfo;
 * the triples are passed on in the order given.
 */
template <int MT,
          std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void For (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
          BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
          BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    ParallelFor<MT>(Gpu::KernelInfo{},
                    box1, ncomp1, std::forward<L1>(f1),
                    box2, ncomp2, std::forward<L2>(f2),
                    box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief Run \p f through ParallelFor or in a plain host loop (default MT).
 *
 * When Gpu::inLaunchRegion() returns true the call is handed to
 * ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, f). Otherwise f(i) is
 * invoked for i = 0 .. n-1 in a loop annotated with AMREX_PRAGMA_SIMD;
 * in AMREX_USE_SYCL builds that fallback calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <std::integral T, typename L>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, n, std::forward<L>(f));
        return;
    }

    // Not in a launch region: host fallback.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    AMREX_PRAGMA_SIMD
    for (T idx = 0; idx < n; ++idx) { f(idx); }
#endif
}


/**
 * \brief Run \p f through ParallelFor<MT> or in a plain host loop.
 *
 * When Gpu::inLaunchRegion() returns true the call is handed to
 * ParallelFor<MT>(info, n, f). Otherwise f(i) is invoked for
 * i = 0 .. n-1 in a loop annotated with AMREX_PRAGMA_SIMD; in
 * AMREX_USE_SYCL builds that fallback calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <int MT, std::integral T, typename L>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, n, std::forward<L>(f));
        return;
    }

    // Not in a launch region: host fallback.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    AMREX_PRAGMA_SIMD
    for (T idx = 0; idx < n; ++idx) { f(idx); }
#endif
}


/**
 * \brief HostDeviceParallelFor(n, f) with default launch settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo, then delegates to the KernelInfo-taking
 * HostDeviceParallelFor overload.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <std::integral T, typename L>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 n,
                                                 std::forward<L>(f));
}


/**
 * \brief HostDeviceParallelFor(n, f) with a caller-chosen MT.
 *
 * Delegates to HostDeviceParallelFor<MT>(info, n, f) with a
 * default-constructed Gpu::KernelInfo.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <int MT, std::integral T, typename L>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              n,
                              std::forward<L>(f));
}


/**
 * \brief Single-box HostDeviceParallelFor with default MT.
 *
 * When Gpu::inLaunchRegion() returns true the work goes to
 * ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, f). Otherwise the functor
 * is run through LoopConcurrentOnCpu(box, f), except in AMREX_USE_SYCL
 * builds where that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <typename L, int dim>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, std::forward<L>(f));
        return;
    }

    // Not in a launch region: host fallback.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box, std::forward<L>(f));
#endif
}


/**
 * \brief Single-box HostDeviceParallelFor with a caller-chosen MT.
 *
 * When Gpu::inLaunchRegion() returns true the work goes to
 * ParallelFor<MT>(info, box, f). Otherwise the functor is run through
 * LoopConcurrentOnCpu(box, f), except in AMREX_USE_SYCL builds where
 * that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <int MT, typename L, int dim>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L&& f)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, std::forward<L>(f));
        return;
    }

    // Not in a launch region: host fallback.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box, std::forward<L>(f));
#endif
}


/**
 * \brief HostDeviceParallelFor over (box, ncomp) with default MT.
 *
 * When Gpu::inLaunchRegion() returns true the work goes to
 * ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, f). Otherwise it
 * goes to LoopConcurrentOnCpu(box, ncomp, f), except in AMREX_USE_SYCL
 * builds where that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <std::integral T, typename L, int dim>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box, ncomp, std::forward<L>(f));
        return;
    }

    // Not in a launch region: host fallback.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f));
#endif
}


/**
 * \brief HostDeviceParallelFor over (box, ncomp) with a caller-chosen MT.
 *
 * When Gpu::inLaunchRegion() returns true the work goes to
 * ParallelFor<MT>(info, box, ncomp, f). Otherwise it goes to
 * LoopConcurrentOnCpu(box, ncomp, f), except in AMREX_USE_SYCL builds
 * where that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable<L>::value.
 */
template <int MT, std::integral T, typename L, int dim>
requires (MaybeHostDeviceRunnable<L>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L&& f)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box, ncomp, std::forward<L>(f));
        return;
    }

    // Not in a launch region: host fallback.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box, ncomp, std::forward<L>(f));
#endif
}


/**
 * \brief Two-box HostDeviceParallelFor with default MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to
 * ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2, f1, f2) is made.
 * Otherwise LoopConcurrentOnCpu is called for (box1, f1) and then for
 * (box2, f2); in AMREX_USE_SYCL builds that path calls amrex::Abort
 * instead.
 *
 * Constrained on MaybeHostDeviceRunnable for both L1 and L2.
 */
template <typename L1, typename L2, int dim>
requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2,
                       L1&& f1, L2&& f2)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info, box1, box2,
                                           std::forward<L1>(f1),
                                           std::forward<L2>(f2));
        return;
    }

    // Not in a launch region: host fallback, one box after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
}


/**
 * \brief Two-box HostDeviceParallelFor with a caller-chosen MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to
 * ParallelFor<MT>(info, box1, box2, f1, f2) is made. Otherwise
 * LoopConcurrentOnCpu is called for (box1, f1) and then for (box2, f2);
 * in AMREX_USE_SYCL builds that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable for both L1 and L2.
 */
template <int MT, typename L1, typename L2, int dim>
requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, BoxND<dim> const& box2,
                       L1&& f1, L2&& f2)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2,
                        std::forward<L1>(f1),
                        std::forward<L2>(f2));
        return;
    }

    // Not in a launch region: host fallback, one box after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
#endif
}


/**
 * \brief Three-box HostDeviceParallelFor with a caller-chosen MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to
 * ParallelFor<MT>(info, box1, box2, box3, f1, f2, f3) is made. Otherwise
 * LoopConcurrentOnCpu is called for (box1, f1), (box2, f2) and (box3, f3)
 * in that order; in AMREX_USE_SYCL builds that path calls amrex::Abort
 * instead.
 *
 * Constrained on MaybeHostDeviceRunnable for L1, L2 and L3.
 */
template <int MT, typename L1, typename L2, typename L3, int dim>
requires (MaybeHostDeviceRunnable<L1>::value &&
          MaybeHostDeviceRunnable<L2>::value &&
          MaybeHostDeviceRunnable<L3>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1,
                       BoxND<dim> const& box2,
                       BoxND<dim> const& box3,
                       L1&& f1, L2&& f2, L3&& f3)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info, box1, box2, box3,
                        std::forward<L1>(f1),
                        std::forward<L2>(f2),
                        std::forward<L3>(f3));
        return;
    }

    // Not in a launch region: host fallback, one box after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, std::forward<L2>(f2));
    LoopConcurrentOnCpu(box3, std::forward<L3>(f3));
#endif
}


/**
 * \brief HostDeviceParallelFor over two (box, ncomp, functor) triples, default MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to
 * ParallelFor<AMREX_GPU_MAX_THREADS> receives both triples. Otherwise
 * LoopConcurrentOnCpu is called for the first triple and then the second;
 * in AMREX_USE_SYCL builds that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable for both L1 and L2.
 */
template <std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                           box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2));
        return;
    }

    // Not in a launch region: host fallback, one triple after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
}


/**
 * \brief HostDeviceParallelFor over two (box, ncomp, functor) triples, caller-chosen MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to ParallelFor<MT>
 * receives both triples. Otherwise LoopConcurrentOnCpu is called for the
 * first triple and then the second; in AMREX_USE_SYCL builds that path
 * calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable for both L1 and L2.
 */
template <int MT,
          std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
requires (MaybeHostDeviceRunnable<L1>::value && MaybeHostDeviceRunnable<L2>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info,
                        box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2));
        return;
    }

    // Not in a launch region: host fallback, one triple after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
#endif
}


/**
 * \brief HostDeviceParallelFor over three (box, ncomp, functor) triples, default MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to
 * ParallelFor<AMREX_GPU_MAX_THREADS> receives all three triples.
 * Otherwise LoopConcurrentOnCpu is called once per triple, in order; in
 * AMREX_USE_SYCL builds that path calls amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable for L1, L2 and L3.
 */
template <std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
requires (MaybeHostDeviceRunnable<L1>::value &&
          MaybeHostDeviceRunnable<L2>::value &&
          MaybeHostDeviceRunnable<L3>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                           box1, ncomp1, std::forward<L1>(f1),
                                           box2, ncomp2, std::forward<L2>(f2),
                                           box3, ncomp3, std::forward<L3>(f3));
        return;
    }

    // Not in a launch region: host fallback, one triple after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
    LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
}


/**
 * \brief HostDeviceParallelFor over three (box, ncomp, functor) triples, caller-chosen MT.
 *
 * When Gpu::inLaunchRegion() returns true, one call to ParallelFor<MT>
 * receives all three triples. Otherwise LoopConcurrentOnCpu is called
 * once per triple, in order; in AMREX_USE_SYCL builds that path calls
 * amrex::Abort instead.
 *
 * Constrained on MaybeHostDeviceRunnable for L1, L2 and L3.
 */
template <int MT,
          std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
requires (MaybeHostDeviceRunnable<L1>::value &&
          MaybeHostDeviceRunnable<L2>::value &&
          MaybeHostDeviceRunnable<L3>::value)
void
HostDeviceParallelFor (Gpu::KernelInfo const& info,
                       BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                       BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                       BoxND<dim> const& box3, T3 ncomp3, L3&& f3)
{
    if (Gpu::inLaunchRegion()) {
        ParallelFor<MT>(info,
                        box1, ncomp1, std::forward<L1>(f1),
                        box2, ncomp2, std::forward<L2>(f2),
                        box3, ncomp3, std::forward<L3>(f3));
        return;
    }

    // Not in a launch region: host fallback, one triple after the other.
#ifdef AMREX_USE_SYCL
    amrex::Abort("amrex:: HOST_DEVICE disabled for Intel.  It takes too long to compile");
#else
    LoopConcurrentOnCpu(box1, ncomp1, std::forward<L1>(f1));
    LoopConcurrentOnCpu(box2, ncomp2, std::forward<L2>(f2));
    LoopConcurrentOnCpu(box3, ncomp3, std::forward<L3>(f3));
#endif
}


/**
 * \brief HostDeviceFor(info, n, f) with default MT.
 *
 * Passes info, n and the perfectly forwarded functor to
 * HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <std::integral T, typename L>
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 n,
                                                 std::forward<L>(f));
}


/**
 * \brief HostDeviceFor(info, n, f) with a caller-chosen MT.
 *
 * Passes info, n and the perfectly forwarded functor to
 * HostDeviceParallelFor<MT>.
 */
template <int MT, std::integral T, typename L>
void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              n,
                              std::forward<L>(f));
}


/**
 * \brief Single-box HostDeviceFor with default MT.
 *
 * Passes info, box and the perfectly forwarded functor to
 * HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box,
                                                 std::forward<L>(f));
}


/**
 * \brief Single-box HostDeviceFor with a caller-chosen MT.
 *
 * Passes info, box and the perfectly forwarded functor to
 * HostDeviceParallelFor<MT>.
 */
template <int MT, typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box,
                              std::forward<L>(f));
}


/**
 * \brief HostDeviceFor over (box, ncomp) with default MT.
 *
 * Passes info, box, ncomp and the perfectly forwarded functor to
 * HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <std::integral T, typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box, ncomp,
                                                 std::forward<L>(f));
}


/**
 * \brief HostDeviceFor over (box, ncomp) with a caller-chosen MT.
 *
 * Passes info, box, ncomp and the perfectly forwarded functor to
 * HostDeviceParallelFor<MT>.
 */
template <int MT, std::integral T, typename L, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box, ncomp,
                              std::forward<L>(f));
}


/**
 * \brief Two-box HostDeviceFor with default MT.
 *
 * Passes info, both boxes and both perfectly forwarded functors to
 * HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2,
                    L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, box2,
                                                 std::forward<L1>(f1),
                                                 std::forward<L2>(f2));
}


/**
 * \brief Two-box HostDeviceFor with a caller-chosen MT.
 *
 * Passes info, both boxes and both perfectly forwarded functors to
 * HostDeviceParallelFor<MT>.
 */
template <int MT, typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, BoxND<dim> const& box2,
                    L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, box2,
                              std::forward<L1>(f1),
                              std::forward<L2>(f2));
}


/**
 * \brief Three-box HostDeviceFor with default MT.
 *
 * Passes info, the three boxes and the three perfectly forwarded functors
 * to HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1,
                    BoxND<dim> const& box2,
                    BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, box2, box3,
                                                 std::forward<L1>(f1),
                                                 std::forward<L2>(f2),
                                                 std::forward<L3>(f3));
}


/**
 * \brief Three-box HostDeviceFor with a caller-chosen MT.
 *
 * Passes info, the three boxes and the three perfectly forwarded functors
 * to HostDeviceParallelFor<MT>.
 */
template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1,
                    BoxND<dim> const& box2,
                    BoxND<dim> const& box3,
                    L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, box2, box3,
                              std::forward<L1>(f1),
                              std::forward<L2>(f2),
                              std::forward<L3>(f3));
}


/**
 * \brief HostDeviceFor over two (box, ncomp, functor) triples, default MT.
 *
 * Passes info and both triples, in order, to
 * HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief HostDeviceFor over two (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Passes info and both triples, in order, to HostDeviceParallelFor<MT>.
 */
template <int MT,
          std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief HostDeviceFor over three (box, ncomp, functor) triples, default MT.
 *
 * Passes info and the three triples, in order, to
 * HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>.
 */
template <std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(info,
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief HostDeviceFor over three (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Passes info and the three triples, in order, to
 * HostDeviceParallelFor<MT>.
 */
template <int MT,
          std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void HostDeviceFor (Gpu::KernelInfo const& info,
                    BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                    BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                    BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(info,
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief HostDeviceParallelFor(n, f) with default launch settings.
 *
 * This overload has no requires clause of its own. It supplies
 * AMREX_GPU_MAX_THREADS as MT and a default-constructed Gpu::KernelInfo,
 * then delegates to the KernelInfo-taking HostDeviceParallelFor.
 */
template <std::integral T, typename L>
void HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 n,
                                                 std::forward<L>(f));
}


/**
 * \brief HostDeviceParallelFor(n, f) with a caller-chosen MT.
 *
 * This overload has no requires clause of its own. It delegates to
 * HostDeviceParallelFor<MT>(info, n, f) with a default-constructed
 * Gpu::KernelInfo.
 */
template <int MT, std::integral T, typename L>
void HostDeviceParallelFor (T n, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              n,
                              std::forward<L>(f));
}


/**
 * \brief Single-box HostDeviceParallelFor with default launch settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo, then delegates to the KernelInfo-taking overload.
 */
template <typename L, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box,
                                                 std::forward<L>(f));
}


/**
 * \brief Single-box HostDeviceParallelFor with a caller-chosen MT.
 *
 * Delegates to HostDeviceParallelFor<MT>(info, box, f) with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT, typename L, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box,
                              std::forward<L>(f));
}


/**
 * \brief HostDeviceParallelFor over (box, ncomp) with default launch settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo, then delegates to the KernelInfo-taking overload.
 */
template <std::integral T, typename L, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box, ncomp,
                                                 std::forward<L>(f));
}


/**
 * \brief HostDeviceParallelFor over (box, ncomp) with a caller-chosen MT.
 *
 * Delegates to HostDeviceParallelFor<MT>(info, box, ncomp, f) with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT, std::integral T, typename L, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box, T ncomp, L&& f) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box, ncomp,
                              std::forward<L>(f));
}


/**
 * \brief Two-box HostDeviceParallelFor with default launch settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; boxes and forwarded functors are passed on unchanged.
 */
template <typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2,
                            L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, box2,
                                                 std::forward<L1>(f1),
                                                 std::forward<L2>(f2));
}


/**
 * \brief Two-box HostDeviceParallelFor with a caller-chosen MT.
 *
 * Delegates to HostDeviceParallelFor<MT>(info, box1, box2, f1, f2) with a
 * default-constructed Gpu::KernelInfo.
 */
template <int MT, typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, BoxND<dim> const& box2,
                            L1&& f1, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, box2,
                              std::forward<L1>(f1),
                              std::forward<L2>(f2));
}


/**
 * \brief Three-box HostDeviceParallelFor with default launch settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; the three boxes and three forwarded functors are
 * passed on unchanged.
 */
template <typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1,
                            BoxND<dim> const& box2,
                            BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, box2, box3,
                                                 std::forward<L1>(f1),
                                                 std::forward<L2>(f2),
                                                 std::forward<L3>(f3));
}


/**
 * \brief Three-box HostDeviceParallelFor with a caller-chosen MT.
 *
 * Delegates to the KernelInfo-taking HostDeviceParallelFor<MT> overload
 * with a default-constructed Gpu::KernelInfo.
 */
template <int MT, typename L1, typename L2, typename L3, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1,
                            BoxND<dim> const& box2,
                            BoxND<dim> const& box3,
                            L1&& f1, L2&& f2, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, box2, box3,
                              std::forward<L1>(f1),
                              std::forward<L2>(f2),
                              std::forward<L3>(f3));
}


/**
 * \brief HostDeviceParallelFor over two (box, ncomp, functor) triples, default settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; both triples are passed on in the order given.
 */
template <std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief HostDeviceParallelFor over two (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Delegates to the KernelInfo-taking HostDeviceParallelFor<MT> overload
 * with a default-constructed Gpu::KernelInfo.
 */
template <int MT,
          std::integral T1, std::integral T2,
          typename L1, typename L2, int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2));
}


/**
 * \brief HostDeviceParallelFor over three (box, ncomp, functor) triples, default settings.
 *
 * Supplies AMREX_GPU_MAX_THREADS as MT and a default-constructed
 * Gpu::KernelInfo; the three triples are passed on in the order given.
 */
template <std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<AMREX_GPU_MAX_THREADS>(Gpu::KernelInfo{},
                                                 box1, ncomp1, std::forward<L1>(f1),
                                                 box2, ncomp2, std::forward<L2>(f2),
                                                 box3, ncomp3, std::forward<L3>(f3));
}


/**
 * \brief HostDeviceParallelFor over three (box, ncomp, functor) triples, caller-chosen MT.
 *
 * Delegates to the KernelInfo-taking HostDeviceParallelFor<MT> overload
 * with a default-constructed Gpu::KernelInfo.
 */
template <int MT,
          std::integral T1, std::integral T2, std::integral T3,
          typename L1, typename L2, typename L3,
          int dim>
void HostDeviceParallelFor (BoxND<dim> const& box1, T1 ncomp1, L1&& f1,
                            BoxND<dim> const& box2, T2 ncomp2, L2&& f2,
                            BoxND<dim> const& box3, T3 ncomp3, L3&& f3) noexcept
{
    HostDeviceParallelFor<MT>(Gpu::KernelInfo{},
                              box1, ncomp1, std::forward<L1>(f1),
                              box2, ncomp2, std::forward<L2>(f2),
                              box3, ncomp3, std::forward<L3>(f3));
}


}


#endif

AMREX_PRAGMA_SIMD
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:85

AMREX_FORCE_INLINE
#define AMREX_FORCE_INLINE
Definition AMReX_Extension.H:124

AMREX_GPU_ERROR_CHECK
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151

AMREX_LAUNCH_KERNEL
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37

AMREX_LAUNCH_KERNEL_NOBOUND
#define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:39

AMREX_GPU_DEVICE
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18

amrex::BoxND
A Rectangular Domain on an Integer Lattice.
Definition AMReX_Box.H:54

amrex::BoxND::isEmpty
__host__ __device__ bool isEmpty() const noexcept
Checks if it is an empty BoxND.
Definition AMReX_Box.H:223

amrex::BoxND::numPts
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:385

amrex::BoxND::ixType
__host__ __device__ IndexTypeND< dim > ixType() const noexcept
Return the indexing type.
Definition AMReX_Box.H:148

amrex::Gpu::Device::maxBlocksPerLaunch
static unsigned int maxBlocksPerLaunch() noexcept
Definition AMReX_GpuDevice.H:239

amrex::Gpu::Device::warp_size
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236

amrex::Gpu::KernelInfo
Definition AMReX_GpuKernelInfo.H:8

amrex::Long
amrex_long Long
Definition AMReX_INT.H:30

amrex::min
__host__ __device__ constexpr const T & min(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:31

amrex::max
__host__ __device__ constexpr const T & max(const T &a, const T &b) noexcept
Definition AMReX_Algorithm.H:53

amrex::Gpu::Range
__host__ __device__ range_detail::range_impl< T > Range(T const &b) noexcept
Definition AMReX_GpuRange.H:128

amrex::Gpu::streamSynchronize
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310

amrex::Gpu::inLaunchRegion
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:88

amrex::Gpu::gpuStream
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291

amrex
Definition AMReX_Amr.cpp:50

amrex::ignore_unused
__host__ __device__ void ignore_unused(const Ts &...)
No-op helper that marks variables as intentionally unused.
Definition AMReX.H:259

amrex::HostDeviceParallelFor
void HostDeviceParallelFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:726

amrex::Order::F
@ F

amrex::RunOn::Gpu
@ Gpu

amrex::gpuStream_t
cudaStream_t gpuStream_t
Definition AMReX_GpuControl.H:79

amrex::getRandState
randState_t * getRandState()
Definition AMReX_RandomEngine.H:65

amrex::ParallelFor
void ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:202

amrex::HostDeviceFor
void HostDeviceFor(T n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:829

amrex::launch
void launch(T const &n, L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:122

amrex::isEmpty
bool isEmpty(T n) noexcept
Definition AMReX_GpuRange.H:15

amrex::single_task
void single_task(L &&f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1239

amrex::ParallelForRNG
AMREX_ATTRIBUTE_FLATTEN_FOR void ParallelForRNG(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:1151

amrex::randState_t
curandState_t randState_t
Definition AMReX_RandomEngine.H:58

amrex::LoopConcurrentOnCpu
void LoopConcurrentOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:388

amrex::Abort
void Abort(const std::string &msg)
Print a fatal-error message to stderr and abort execution.
Definition AMReX.cpp:241

amrex::For
AMREX_ATTRIBUTE_FLATTEN_FOR void For(T n, L const &f) noexcept
Definition AMReX_GpuLaunchFunctsC.H:136

amrex::get
__host__ __device__ constexpr int get(IntVectND< dim > const &iv) noexcept
Get I'th element of IntVectND<dim>
Definition AMReX_IntVect.H:1334

amrex::BoxIndexerND
Utility that maps flattened point indices back to IntVectND coordinates.
Definition AMReX_Box.H:2494

amrex::BoxIndexerND::intVect
__host__ __device__ IntVectND< dim > intVect(std::uint64_t icell) const
Convert flattened point index icell to its IntVectND coordinate.
Definition AMReX_Box.H:2517

amrex::BoxIndexerND::numPts
__host__ __device__ std::uint64_t numPts() const
Return the number of points covered by the indexed box.
Definition AMReX_Box.H:2552

amrex::Gpu::ExecutionConfig
Definition AMReX_GpuLaunch.H:121

amrex::Gpu::Handler
Definition AMReX_GpuTypes.H:86

amrex::RandomEngine
Definition AMReX_RandomEngine.H:72