Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_GpuParallelReduce.H
Go to the documentation of this file.
1#ifndef AMREX_GPU_PARALLEL_REDUCE_H_
2#define AMREX_GPU_PARALLEL_REDUCE_H_
3#include <AMReX_Config.H>
4
6#include <AMReX_INT.H>
9
10#include <cstddef>
11
12//
13// GPU-aware MPI collectives that operate in place on a Gpu::DeviceVector.
14//
15// These overloads complement the pointer/scalar overloads in
16// AMReX_ParallelReduce.H and AMReX_ParallelDescriptor.H. They live in a
17// separate header (rather than in those low-level headers) because they need
18// the Gpu container/copy machinery (AMReX_GpuContainers.H) that would bloat the
19// ParallelReduce.H headers. This header is also pulled into the AMReX_Gpu.H
20// umbrella for convenience.
21//
22// When AMReX is configured with GPU-aware MPI (ParallelDescriptor::UseGpuAwareMpi())
23// the device buffer is handed to MPI directly, otherwise the data is staged
24// through host (pinned) memory for the collective.
25//
26
27namespace amrex {
28
29namespace ParallelAllReduce {
30
32
33template <typename T>
35{
36 // GPU-unaware case
37#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
40 Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
41 Sum(hv.data(), static_cast<int>(hv.size()), comm);
42 Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
43 return;
44 }
45#endif
46
47 // GPU-aware case
48 Sum(v.data(), static_cast<int>(v.size()), comm);
49}
50
51} // namespace ParallelAllReduce
52
53namespace ParallelReduce {
54
56
57template <typename T>
58void Sum (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
59{
60 // GPU-unaware case
61#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
64 // every rank stages its contribution to host for the reduction
65 Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
66 Sum(hv.data(), static_cast<int>(hv.size()), root, comm);
67 // only the root receives the reduced result, so only it copies back
68 if (ParallelDescriptor::MyProc(comm) == root) {
69 Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
70 }
71 return;
72 }
73#endif
74
75 // GPU-aware case
76 Sum(v.data(), static_cast<int>(v.size()), root, comm);
77}
78
79} // namespace ParallelReduce
80
81namespace ParallelDescriptor {
82
84
99template <typename T>
100void Bcast (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
101{
102#ifdef AMREX_USE_MPI
103 auto const n = v.size();
104
105#ifdef AMREX_DEBUG
106 // verify the pre-allocation contract (the length broadcast happens on every
107 // rank, so it is collectively safe and cannot deadlock)
108 Long n_root = static_cast<Long>(n);
109 Bcast(&n_root, std::size_t(1), root, comm);
110 AMREX_ALWAYS_ASSERT_WITH_MESSAGE(n_root == static_cast<Long>(n),
111 "ParallelDescriptor::Bcast(Gpu::DeviceVector): receiver must be pre-allocated to the root's length");
112#endif
113
114 // trivial case: 1 rank
115 if (n == 0) { return; }
116
117 // GPU-unaware case
118#ifdef AMREX_USE_GPU
119 if (!UseGpuAwareMpi()) {
121 const bool is_root = (MyProc(comm) == root);
122 // only the root needs to stage its data to host before the broadcast
123 if (is_root) {
124 Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
125 }
126 Bcast(hv.data(), static_cast<std::size_t>(n), root, comm);
127 // only the receivers need to copy the broadcast result back to device
128 if (!is_root) {
129 Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
130 }
131 return;
132 }
133#endif
134
135 // GPU-aware case
136 Bcast(v.data(), static_cast<std::size_t>(n), root, comm);
137
138#else // AMREX_USE_MPI
139 amrex::ignore_unused(v, root, comm);
140#endif
141}
142
143} // namespace ParallelDescriptor
144
145} // namespace amrex
146
147#endif /*AMREX_GPU_PARALLEL_REDUCE_H_*/
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
size_type size() const noexcept
Definition AMReX_PODVector.H:648
iterator begin() noexcept
Definition AMReX_PODVector.H:674
iterator end() noexcept
Definition AMReX_PODVector.H:678
T * data() noexcept
Definition AMReX_PODVector.H:666
amrex_long Long
Definition AMReX_INT.H:30
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
void Bcast(Gpu::DeviceVector< T > &v, int root, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:100
void Sum(Gpu::DeviceVector< T > &v, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:34
void Sum(Gpu::DeviceVector< T > &v, int root, MPI_Comm comm)
Definition AMReX_GpuParallelReduce.H:58
void copy(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:128
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
bool UseGpuAwareMpi()
Definition AMReX_ParallelDescriptor.H:113
int MPI_Comm
Definition AMReX_ccse-mpi.H:51
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139