Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_CSR.H
Go to the documentation of this file.
1#ifndef AMREX_CSR_H_
2#define AMREX_CSR_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_Gpu.H>
6#include <AMReX_INT.H>
7#include <AMReX_OpenMP.H>
8
9#if defined(AMREX_USE_CUDA)
10#include <cub/cub.cuh> // for Clang
11#endif
12
13#include <algorithm>
14#include <concepts>
15#include <climits>
16#include <type_traits>
17
18namespace amrex {
19
33template <typename T>
34struct CsrView {
35 using U = std::conditional_t<std::is_const_v<T>, Long const, Long>;
36 T* AMREX_RESTRICT mat = nullptr;
39 Long nnz = 0;
41};
42
49template <typename T, template <typename> class V>
50struct CSR {
51 V<T> mat;
52 V<Long> col_index;
53 V<Long> row_offset;
54 Long nnz = 0;
55
57 [[nodiscard]] Long nrows () const {
58 return row_offset.empty() ? Long(0) : Long(row_offset.size())-1;
59 }
60
70 void resize (Long num_rows, Long num_non_zeros) {
71 mat.resize(num_non_zeros);
72 col_index.resize(num_non_zeros);
73 row_offset.resize(num_rows+1);
74 nnz = num_non_zeros;
75 }
76
79 return CsrView<T>{mat.data(), col_index.data(), row_offset.data(),
80 nnz, Long(row_offset.size())-1};
81 }
82
84 [[nodiscard]] CsrView<T const> view () const {
85 return CsrView<T const>{.mat = mat.data(),
86 .col_index = col_index.data(),
87 .row_offset = row_offset.data(),
88 .nnz = nnz,
89 .nrows = Long(row_offset.size())-1};
90 }
91
93 [[nodiscard]] CsrView<T const> const_view () const {
94 return CsrView<T const>{mat.data(), col_index.data(), row_offset.data(),
95 nnz, Long(row_offset.size())-1};
96 }
97
101 void sort ();
102
107};
108
109template <typename C, typename T, template<typename> class AD, template<typename> class AS>
110requires (std::same_as<C,Gpu::HostToDevice> ||
111 std::same_as<C,Gpu::DeviceToHost> ||
112 std::same_as<C,Gpu::DeviceToDevice>)
120void duplicateCSR (C c, CSR<T,AD>& dst, CSR<T,AS> const& src)
121{
122 dst.mat.resize(src.mat.size());
123 dst.col_index.resize(src.col_index.size());
124 dst.row_offset.resize(src.row_offset.size());
126 src.mat.begin(),
127 src.mat.end(),
128 dst.mat.begin());
130 src.col_index.begin(),
131 src.col_index.end(),
132 dst.col_index.begin());
134 src.row_offset.begin(),
135 src.row_offset.end(),
136 dst.row_offset.begin());
137 dst.nnz = src.nnz;
138}
139
140template <typename T, template <typename> class V>
142{
143 if (nnz <= 0) { return; }
144
145#ifdef AMREX_USE_GPU
146
147#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
148
149 // The function is synchronous. If that is no longer the case, we might
150 // need to update SpMatrix::define.
151
152 constexpr int nthreads = 256;
153 constexpr int nwarps_per_block = nthreads / Gpu::Device::warp_size;
154
155 AMREX_ALWAYS_ASSERT((nrows()+nwarps_per_block-1) < Long(std::numeric_limits<int>::max()));
156
157 auto nr = int(nrows());
158 int nblocks = (nr + nwarps_per_block-1) / nwarps_per_block;
159 auto const& stream = Gpu::gpuStream();
160
161 auto* pmat = mat.data();
162 auto* pcol = col_index.data();
163 auto* prow = row_offset.data();
164
165 Gpu::Buffer<int> needs_fallback({0});
166 auto* d_needs_fallback = needs_fallback.data();
167
168 amrex::launch_global<nthreads><<<nblocks, nthreads, 0, stream>>>
169 ([=] AMREX_GPU_DEVICE () noexcept
170 {
171 int wid = int(threadIdx.x)/Gpu::Device::warp_size;
172 int r = int(blockIdx.x)*nwarps_per_block + wid;
173 if (r >= nr) return;
174
175 Long const b = prow[r];
176 Long const e = prow[r+1];
177 auto const len = int(e - b);
178
179 if (len <= 1) return;
180
181 int lane = int(threadIdx.x) - wid * Gpu::Device::warp_size;
182
183 bool sorted = true;
184 for (Long i = lane + 1; i < len; i += Gpu::Device::warp_size) {
185 sorted = sorted && (pcol[b+i-1] <= pcol[b+i]);
186 }
187#if defined(AMREX_USE_CUDA)
188 if (__all_sync(0xffffffff, sorted)) { return; }
189#else
190 if (__all(sorted)) { return; }
191#endif
192
193 constexpr int ITEMS_PER_THREAD = AMREX_HIP_OR_CUDA(2,4);
194 constexpr int ITEMS_PER_WARP = Gpu::Device::warp_size * ITEMS_PER_THREAD;
195
196 if (len <= ITEMS_PER_WARP)
197 {
198#if defined(AMREX_USE_CUDA)
199 using WarpSort = cub::WarpMergeSort<Long, ITEMS_PER_THREAD, Gpu::Device::warp_size, T>;
200 __shared__ typename WarpSort::TempStorage temp_storage[nwarps_per_block];
201#elif defined(AMREX_USE_HIP)
202 using WarpSort = rocprim::warp_sort<Long, Gpu::Device::warp_size, T>;
203 __shared__ typename WarpSort::storage_type temp_storage[nwarps_per_block];
204#endif
205
206 Long keys[ITEMS_PER_THREAD];
207 T values[ITEMS_PER_THREAD];
208
209 #pragma unroll
210 for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
211 int idx = lane * ITEMS_PER_THREAD + i;
212 if (idx < len) {
213 keys[i] = pcol[b + idx];
214 values[i] = pmat[b + idx];
215 } else {
216 keys[i] = std::numeric_limits<Long>::max();
217 values[i] = T{};
218 }
219 }
220
222 WarpSort{}.sort(keys, values, temp_storage[wid]),
223 WarpSort(temp_storage[wid]).Sort(
224 keys, values, [](Long x, Long y) {return x < y;}));
225
226 #pragma unroll
227 for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
228 int idx = lane * ITEMS_PER_THREAD + i;
229 if (idx < len) {
230 pcol[b + idx] = keys[i];
231 pmat[b + idx] = values[i];
232 }
233 }
234 } else {
235 if (lane == 0) {
236 Gpu::Atomic::AddNoRet(d_needs_fallback, 1);
237 }
238 }
239 });
240
241 auto* h_needs_fallback = needs_fallback.copyToHost();
242
243 if (*h_needs_fallback)
244 {
245 V<Long> col_index_out(col_index.size());
246 V<T> mat_out(mat.size());
247 auto* d_col_out = col_index_out.data();
248 auto* d_val_out = mat_out.data();
249
250 std::size_t temp_bytes = 0;
251
253 rocprim::segmented_radix_sort_pairs,
254 cub::DeviceSegmentedRadixSort::SortPairs)
255 (nullptr, temp_bytes, pcol, d_col_out, pmat, d_val_out,
256 nnz, nr, prow, prow+1, 0, int(sizeof(Long)*CHAR_BIT),
257 stream));
258
259 auto* d_temp = (void*) The_Arena()->alloc(temp_bytes);
260
262 rocprim::segmented_radix_sort_pairs,
263 cub::DeviceSegmentedRadixSort::SortPairs)
264 (d_temp, temp_bytes, pcol, d_col_out, pmat, d_val_out,
265 nnz, nr, prow, prow+1, 0, int(sizeof(Long)*CHAR_BIT),
266 stream));
267
268 std::swap(col_index, col_index_out);
269 std::swap(mat, mat_out);
270
272 The_Arena()->free(d_temp);
273 }
274
275 // let's test both by print matrix out to see if it's sorted.
276
278
279#elif defined(AMREX_USE_SYCL)
280
281 // xxxxx TODO SYCL: Let's not worry about performance for now.
283 duplicateCSR(Gpu::deviceToHost, h_csr, *this);
285 h_csr.sort_on_host();
286 duplicateCSR(Gpu::hostToDevice, *this, h_csr);
288
289#endif
290
291#else
292
293 sort_on_host();
294
295#endif
296}
297
298template <typename T, template <typename> class V>
300{
301 if (nnz <= 0) { return; }
302
303 constexpr int SMALL = 128;
304
305 Long nr = nrows();
306
307#ifdef AMREX_USE_OMP
308#pragma omp parallel
309#endif
310 {
311 V<Long> lcols;
312 V<T > lvals;
313 V<int > perm;
314
315 Long scols[SMALL];
316 T svals[SMALL];
317
318#ifdef AMREX_USE_OMP
319#pragma omp for
320#endif
321 for (Long r = 0; r < nr; ++r) {
322 Long const b = row_offset[r ];
323 Long const e = row_offset[r+1];
324 auto const len = int(e - b);
325
326 if (len <= 1) { continue; }
327
328 bool sorted = true;
329 for (int i = 1; i < len; ++i) {
330 if (col_index[b+i-1] > col_index[b+i]) {
331 sorted = false;
332 break;
333 }
334 }
335 if (sorted) { continue; }
336
337 if (len <= SMALL) {
338 // Insertion sort using arrays on stack
339 for (int i = 0; i < len; ++i) {
340 scols[i] = col_index[b+i];
341 svals[i] = mat [b+i];
342 }
343 for (int i = 1; i < len; ++i) {
344 auto c = scols[i];
345 auto v = svals[i];
346 auto j = i;
347 while (j > 0 && scols[j-1] > c) {
348 scols[j] = scols[j-1];
349 svals[j] = svals[j-1];
350 --j;
351 }
352 scols[j] = c;
353 svals[j] = v;
354 }
355 for (int i = 0; i < len; ++i) {
356 col_index[b+i] = scols[i];
357 mat [b+i] = svals[i];
358 }
359 } else {
360 lcols.resize(len);
361 lvals.resize(len);
362 perm.resize(len);
363
364 for (int i = 0; i < len; ++i) {
365 lcols[i] = col_index[b+i];
366 lvals[i] = mat [b+i];
367 perm [i] = i;
368 }
369
370 std::sort(perm.begin(), perm.end(),
371 [&] (int i0, int i1) {
372 return lcols[i0] < lcols[i1];
373 });
374
375 for (int out = 0; out < len; ++out) {
376 auto const in = perm[out];
377 col_index[b+out] = lcols[in];
378 mat [b+out] = lvals[in];
379 }
380 }
381 }
382 }
383}
384
385}
386
387#endif
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_RESTRICT
Definition AMReX_Extension.H:32
#define AMREX_HIP_OR_CUDA(a, b)
Definition AMReX_GpuControl.H:17
#define AMREX_GPU_SAFE_CALL(call)
Definition AMReX_GpuError.H:63
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
Definition AMReX_GpuBuffer.H:24
T const * data() const noexcept
Definition AMReX_GpuBuffer.H:51
static constexpr int warp_size
Definition AMReX_GpuDevice.H:236
amrex_long Long
Definition AMReX_INT.H:30
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
__host__ __device__ AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
Definition AMReX_GpuAtomic.H:283
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
void duplicateCSR(C c, CSR< T, AD > &dst, CSR< T, AS > const &src)
Definition AMReX_CSR.H:120
const int[]
Definition AMReX_BLProfiler.cpp:1664
Owning CSR container backed by AMReX resizable vectors.
Definition AMReX_CSR.H:50
V< Long > row_offset
Definition AMReX_CSR.H:53
Long nrows() const
Number of logical rows represented by the CSR offset array.
Definition AMReX_CSR.H:57
Long nnz
Definition AMReX_CSR.H:54
void sort()
Sort each row by column index. Uses GPU acceleration when possible.
Definition AMReX_CSR.H:141
CsrView< T > view()
Mutable view of the underlying buffers.
Definition AMReX_CSR.H:78
void sort_on_host()
Host-only fallback that sorts column indices row by row.
Definition AMReX_CSR.H:299
CsrView< T const > view() const
Const view of the underlying buffers.
Definition AMReX_CSR.H:84
CsrView< T const > const_view() const
Convenience alias for view() const.
Definition AMReX_CSR.H:93
void resize(Long num_rows, Long num_non_zeros)
Resize the storage to accommodate num_rows and num_non_zeros entries.
Definition AMReX_CSR.H:70
V< Long > col_index
Definition AMReX_CSR.H:52
V< T > mat
Definition AMReX_CSR.H:51
Lightweight non-owning CSR view that can point to host or device buffers.
Definition AMReX_CSR.H:34
std::conditional_t< std::is_const_v< T >, Long const, Long > U
Definition AMReX_CSR.H:35
T *__restrict__ mat
Definition AMReX_CSR.H:36
Long nrows
Definition AMReX_CSR.H:40
Long nnz
Definition AMReX_CSR.H:39
U *__restrict__ row_offset
Definition AMReX_CSR.H:38
U *__restrict__ col_index
Definition AMReX_CSR.H:37