amrex/doxygen/AMReX__GpuParallelReduce_8H_source.html

#ifndef AMREX_GPU_PARALLEL_REDUCE_H_

#define AMREX_GPU_PARALLEL_REDUCE_H_

#include <AMReX_Config.H>


#include <AMReX_GpuContainers.H>

#include <AMReX_INT.H>

#include <AMReX_ParallelDescriptor.H>

#include <AMReX_ParallelReduce.H>


#include <cstddef>


//

// GPU-aware MPI collectives that operate in place on a Gpu::DeviceVector.

//

// These overloads complement the pointer/scalar overloads in

// AMReX_ParallelReduce.H and AMReX_ParallelDescriptor.H. They live in a

// separate header (rather than in those low-level headers) because they need

// the Gpu container/copy machinery (AMReX_GpuContainers.H), which would bloat

// both of those headers. This header is also pulled into the AMReX_Gpu.H

// umbrella for convenience.

//

// When AMReX is configured with GPU-aware MPI (ParallelDescriptor::UseGpuAwareMpi())

// the device buffer is handed to MPI directly, otherwise the data is staged

// through host (pinned) memory for the collective.

//


namespace amrex {


namespace ParallelAllReduce {


/**
 * \brief In-place sum reduction of a device vector over all ranks of \p comm.
 *
 * Forwards to the pointer overload Sum(T*, int, MPI_Comm). When both
 * AMREX_USE_MPI and AMREX_USE_GPU are defined and
 * ParallelDescriptor::UseGpuAwareMpi() is false, the data is first copied
 * into a Gpu::PinnedVector, reduced there, and copied back into \p v.
 * In every other configuration the pointer returned by v.data() is handed
 * to the pointer overload directly.
 *
 * The element count is converted to int, so v.size() has to fit in an int.
 *
 * \param v    vector to reduce; its contents are replaced by the result
 * \param comm communicator the reduction runs on
 */
template <typename T>
void Sum (Gpu::DeviceVector<T>& v, MPI_Comm comm)
{
    const int count = static_cast<int>(v.size());

#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
    if (ParallelDescriptor::UseGpuAwareMpi()) {
        // GPU-aware MPI: reduce straight out of the vector's own buffer
        Sum(v.data(), count, comm);
    } else {
        // GPU-unaware MPI: round-trip through a pinned host buffer
        Gpu::PinnedVector<T> staging(v.size());
        Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), staging.begin());
        Sum(staging.data(), count, comm);
        Gpu::copy(Gpu::hostToDevice, staging.begin(), staging.end(), v.begin());
    }
#else
    // no staging path in this configuration: use the vector's buffer as is
    Sum(v.data(), count, comm);
#endif
}


} // namespace ParallelAllReduce


namespace ParallelReduce {


/**
 * \brief In-place sum reduction of a device vector onto rank \p root of \p comm.
 *
 * Forwards to the pointer overload Sum(T*, int, int, MPI_Comm). When both
 * AMREX_USE_MPI and AMREX_USE_GPU are defined and
 * ParallelDescriptor::UseGpuAwareMpi() is false, every rank copies its data
 * into a Gpu::PinnedVector for the reduction, and only the rank equal to
 * \p root copies the host buffer back into \p v afterwards. In every other
 * configuration the pointer returned by v.data() is handed to the pointer
 * overload directly.
 *
 * The element count is converted to int, so v.size() has to fit in an int.
 *
 * \param v    vector holding this rank's contribution
 * \param root rank that receives the reduced values
 * \param comm communicator the reduction runs on
 */
template <typename T>
void Sum (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
{
    const int count = static_cast<int>(v.size());

#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
    if (ParallelDescriptor::UseGpuAwareMpi()) {
        // GPU-aware MPI: reduce straight out of the vector's own buffer
        Sum(v.data(), count, root, comm);
    } else {
        // GPU-unaware MPI: all ranks contribute through a pinned host buffer
        Gpu::PinnedVector<T> staging(v.size());
        Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), staging.begin());
        Sum(staging.data(), count, root, comm);

        // the host buffer is copied back on the root rank only
        const bool on_root = (ParallelDescriptor::MyProc(comm) == root);
        if (on_root) {
            Gpu::copy(Gpu::hostToDevice, staging.begin(), staging.end(), v.begin());
        }
    }
#else
    // no staging path in this configuration: use the vector's buffer as is
    Sum(v.data(), count, root, comm);
#endif
}


} // namespace ParallelReduce


namespace ParallelDescriptor {


/**
 * \brief Broadcast the contents of a device vector from rank \p root over \p comm.
 *
 * The vector is never resized here: each rank uses its own v.size() as the
 * broadcast length, so every rank must have allocated \p v to the root's
 * length before calling. In AMREX_DEBUG builds this is verified by first
 * broadcasting the root's length and asserting that it equals the local one;
 * other builds perform no check.
 * NOTE(review): without that check a length mismatch presumably results in
 * mismatched collective calls (a rank with an empty vector returns before
 * the data broadcast) -- confirm that callers honor the contract.
 *
 * When AMREX_USE_GPU is defined and UseGpuAwareMpi() is false, the data is
 * staged through a Gpu::PinnedVector: the root copies its data to the host
 * buffer before the broadcast and all other ranks copy the host buffer back
 * into \p v afterwards. Otherwise the pointer returned by v.data() is handed
 * to the pointer overload of Bcast directly.
 *
 * Without AMREX_USE_MPI the function does nothing.
 *
 * \param v    source on the root rank, destination on every other rank
 * \param root rank whose data is broadcast
 * \param comm communicator the broadcast runs on
 */
template <typename T>
void Bcast (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
{
#ifdef AMREX_USE_MPI
    auto const n = v.size();

#ifdef AMREX_DEBUG
    // verify the pre-allocation contract (the length broadcast happens on every
    // rank, so it is collectively safe and cannot deadlock)
    Long n_root = static_cast<Long>(n);
    Bcast(&n_root, std::size_t(1), root, comm);
    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(n_root == static_cast<Long>(n),
        "ParallelDescriptor::Bcast(Gpu::DeviceVector): receiver must be pre-allocated to the root's length");
#endif

    // trivial case: empty vector, nothing to broadcast
    if (n == 0) { return; }

    // GPU-unaware case
#ifdef AMREX_USE_GPU
    if (!UseGpuAwareMpi()) {
        Gpu::PinnedVector<T> hv(n);
        const bool is_root = (MyProc(comm) == root);
        // only the root needs to stage its data to host before the broadcast
        if (is_root) {
            Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
        }
        Bcast(hv.data(), static_cast<std::size_t>(n), root, comm);
        // only the receivers need to copy the broadcast result back to device
        if (!is_root) {
            Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
        }
        return;
    }
#endif

    // GPU-aware case (also the only path when AMREX_USE_GPU is not defined)
    Bcast(v.data(), static_cast<std::size_t>(n), root, comm);

#else  // AMREX_USE_MPI
    amrex::ignore_unused(v, root, comm);
#endif
}


} // namespace ParallelDescriptor


} // namespace amrex


#endif /*AMREX_GPU_PARALLEL_REDUCE_H_*/

AMREX_ALWAYS_ASSERT_WITH_MESSAGE
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49

AMReX_GpuContainers.H

AMReX_INT.H

AMReX_ParallelDescriptor.H

AMReX_ParallelReduce.H

amrex::PODVector
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308

amrex::PODVector::size
size_type size() const noexcept
Definition AMReX_PODVector.H:648

amrex::PODVector::begin
iterator begin() noexcept
Definition AMReX_PODVector.H:674

amrex::PODVector::end
iterator end() noexcept
Definition AMReX_PODVector.H:678

amrex::PODVector::data
T * data() noexcept
Definition AMReX_PODVector.H:666

amrex::Long
amrex_long Long
Definition AMReX_INT.H:30

amrex::ParallelDescriptor::MyProc
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128

amrex::ParallelDescriptor::Bcast
void Bcast(Gpu::DeviceVector< T > &v, int root, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:100

amrex::ParallelAllReduce::Sum
void Sum(Gpu::DeviceVector< T > &v, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:34

amrex::ParallelReduce::Sum
void Sum(Gpu::DeviceVector< T > &v, int root, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:58

amrex::Gpu::copy
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:128

amrex::Gpu::deviceToHost
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106

amrex::Gpu::hostToDevice
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105

amrex::ParallelDescriptor::UseGpuAwareMpi
bool UseGpuAwareMpi()
Definition AMReX_ParallelDescriptor.H:113

amrex::mpidatatypes::MPI_Comm
int MPI_Comm
Definition AMReX_ccse-mpi.H:51

amrex
Definition AMReX_Amr.cpp:50

amrex::ignore_unused
__host__ __device__ void ignore_unused(const Ts &...)
No-op helper that marks variables as intentionally unused.
Definition AMReX.H:259