Block-Structured AMR Software Framework
 
AMReX_SpMatrix.H
#ifndef AMREX_SP_MATRIX_H_
#define AMREX_SP_MATRIX_H_
#include <AMReX_Config.H>

#include <AMReX_AlgPartition.H>
#include <AMReX_AlgVector.H>
#include <AMReX_Gpu.H>
#include <AMReX_INT.H>
#include <AMReX_Scan.H>

#include <algorithm>
#include <fstream>
#include <limits>
#include <map>
#include <string>
#include <type_traits>

namespace amrex {

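// Illustrative usage sketch (assumes an AlgPartition `part`, built elsewhere,
// that describes how the global rows are split over the MPI ranks):
//
//     SpMatrix<double> A(part, 3);   // room for 3 nonzeros per local row
//     A.setVal([=] AMREX_GPU_DEVICE (Long row, Long* cols, double* vals) {
//         // fill the global column indices and values of global row `row`
//     });
//     AlgVector<double> x, y;
//     x.define(part);  y.define(part);
//     SpMV(y, A, x);                 // y = A*x (declared as a friend below)
//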
template <typename T, template<typename> class Allocator = DefaultAllocator>
class SpMatrix
{
public:
    using value_type = T;

    SpMatrix () = default;

    SpMatrix (AlgPartition partition, int nnz);

    SpMatrix (SpMatrix const&) = delete;
    SpMatrix& operator= (SpMatrix const&) = delete;

    SpMatrix (SpMatrix &&) = default;
    SpMatrix& operator= (SpMatrix &&) = default;

    ~SpMatrix () = default;

    void define (AlgPartition partition, int nnz);

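    //! This overload adopts user-provided CSR data: mat and col_index hold nnz
    //! entries, row_index holds numLocalRows()+1 row offsets, and all three are
    //! copied from device memory. Column indices are global.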
    void define (AlgPartition partition, T const* mat, Long const* col_index,
                 Long nnz, Long const* row_index);

    [[nodiscard]] AlgPartition const& partition () const { return m_partition; }

    [[nodiscard]] Long numLocalRows () const { return m_row_end - m_row_begin; }
    [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); }
    [[nodiscard]] Long numLocalNonZero () const { return m_data.nnz; }

    //! Inclusive global index begin.
    [[nodiscard]] Long globalRowBegin () const { return m_row_begin; }
    //! Exclusive global index end.
    [[nodiscard]] Long globalRowEnd () const { return m_row_end; }

    [[nodiscard]] T const* data () const { return m_data.mat.data(); }
    [[nodiscard]] T * data () { return m_data.mat.data(); }
    [[nodiscard]] Long const* columnIndex () const { return m_data.col_index.data(); }
    [[nodiscard]] Long * columnIndex () { return m_data.col_index.data(); }
    [[nodiscard]] Long const* rowOffset () const { return m_data.row_offset.data(); }
    [[nodiscard]] Long * rowOffset () { return m_data.row_offset.data(); }

    void printToFile (std::string const& file) const;

    template <typename F>
    void setVal (F const& f);

    [[nodiscard]] AlgVector<T> const& diagonalVector () const;

    template <typename U> friend void SpMV(AlgVector<U>& y, SpMatrix<U> const& A, AlgVector<U> const& x);

    //! Private function, but public for cuda.
    void define_doit (int nnz);

#ifdef AMREX_USE_MPI
    //! Private function, but public for cuda.
    void prepare_comm ();
    void pack_buffer (AlgVector<T> const& v);
    void unpack_buffer (AlgVector<T>& v);
#endif

private:

    void startComm (AlgVector<T> const& x);
    void finishComm (AlgVector<T>& y);

    template <class U> using DVec = PODVector<U,Allocator<U> >;

    template <template <typename> class V>
    struct CSR {
        V<T> mat;
        V<Long> col_index;
        V<Long> row_offset;
        Long nnz = -1;
    };
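    // CSR convention: row i owns entries [row_offset[i], row_offset[i+1]) of
    // mat/col_index. For example, a 2x3 local block [[a,0,b],[0,c,0]] is stored
    // as mat = {a,b,c}, col_index = {0,2,1}, row_offset = {0,2,3}.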

    AlgPartition m_partition;
    Long m_row_begin = 0;
    Long m_row_end = 0;
    CSR<DVec> m_data;

    mutable AlgVector<T> m_diagonal;

#ifdef AMREX_USE_MPI
    CSR<DVec> m_data_remote;

#ifdef AMREX_USE_GPU
    Gpu::PinnedVector<Long> m_remote_cols;
#endif

    DVec<Long> m_rtol;

    Vector<int> m_send_to;
    Vector<int> m_send_counts;
    Gpu::DeviceVector<Long> m_send_indices;

    Vector<int> m_recv_from;
    Vector<int> m_recv_counts;

    Vector<MPI_Request> m_send_reqs;
    T* m_send_buffer = nullptr;
    Long m_total_counts_send = 0;

    Vector<MPI_Request> m_recv_reqs;
    T* m_recv_buffer = nullptr;
    Long m_total_counts_recv = 0;

    bool m_comm_prepared = false;
#endif

    bool m_shifted = false;
};

template <typename T, template<typename> class Allocator>
SpMatrix<T,Allocator>::SpMatrix (AlgPartition partition, int nnz)
    : m_partition(std::move(partition)),
      m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
      m_row_end(m_partition[ParallelDescriptor::MyProc()+1])
{
    static_assert(std::is_floating_point<T>::value, "SpMatrix is for floating point type only");
    define_doit(nnz);
}

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::define (AlgPartition partition, int nnz)
{
    m_partition = std::move(partition);
    m_row_begin = m_partition[ParallelDescriptor::MyProc()];
    m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
    define_doit(nnz);
}

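// define_doit allocates storage for a fixed number of nonzeros (nnz) per local
// row, so the row offsets are simply row_offset[i] = i*nnz.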
template <typename T, template<typename> class Allocator>
void
SpMatrix<T,Allocator>::define_doit (int nnz)
{
    Long nlocalrows = this->numLocalRows();
    Long total_nnz = nlocalrows*nnz;
    m_data.mat.resize(total_nnz);
    m_data.col_index.resize(total_nnz);
    m_data.row_offset.resize(nlocalrows+1);
    m_data.nnz = total_nnz;

    auto* poffset = m_data.row_offset.data();
    ParallelFor(nlocalrows+1, [=] AMREX_GPU_DEVICE (Long lrow) noexcept
    {
        poffset[lrow] = lrow*nnz;
    });
}

template <typename T, template<typename> class Allocator>
void
SpMatrix<T,Allocator>::define (AlgPartition partition, T const* mat,
                               Long const* col_index, Long nnz,
                               Long const* row_index)
{
    m_partition = std::move(partition);
    m_row_begin = m_partition[ParallelDescriptor::MyProc()];
    m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
    Long nlocalrows = this->numLocalRows();
    m_data.mat.resize(nnz);
    m_data.col_index.resize(nnz);
    m_data.row_offset.resize(nlocalrows+1);
    m_data.nnz = nnz;
    Gpu::copyAsync(Gpu::deviceToDevice, mat, mat+nnz, m_data.mat.begin());
    Gpu::copyAsync(Gpu::deviceToDevice, col_index, col_index+nnz,
                   m_data.col_index.begin());
    Gpu::copyAsync(Gpu::deviceToDevice, row_index, row_index+nlocalrows+1,
                   m_data.row_offset.begin());
    Gpu::streamSynchronize();
}

template <typename T, template<typename> class Allocator>
void
SpMatrix<T,Allocator>::printToFile (std::string const& file) const
{
    // xxxxx TODO: This function only prints the square part of the local rows,
    // not the full rows.

#ifdef AMREX_USE_GPU
    CSR<Gpu::PinnedVector> csr;
    csr.mat.resize(m_data.mat.size());
    csr.col_index.resize(m_data.col_index.size());
    csr.row_offset.resize(m_data.row_offset.size());
    Gpu::copyAsync(Gpu::deviceToHost, m_data.mat.begin(), m_data.mat.end(), csr.mat.begin());
    Gpu::copyAsync(Gpu::deviceToHost, m_data.col_index.begin(), m_data.col_index.end(), csr.col_index.begin());
    Gpu::copyAsync(Gpu::deviceToHost, m_data.row_offset.begin(), m_data.row_offset.end(), csr.row_offset.begin());
    csr.nnz = m_data.nnz;
    Gpu::streamSynchronize();
#else
    auto const& csr = m_data;
#endif

    const Long lrow_begin = m_partition[ParallelDescriptor::MyProc()];
    std::ofstream ofs(file+"."+std::to_string(ParallelDescriptor::MyProc()));
    ofs << m_row_begin << " " << m_row_end << " " << csr.nnz << "\n";
    for (Long i = 0, nrows = numLocalRows(); i < nrows; ++i) {
        Long nnz_row = csr.row_offset[i+1] - csr.row_offset[i];
        T const* mat = csr.mat.data() + csr.row_offset[i];
        Long const* col = csr.col_index.data() + csr.row_offset[i];
        for (Long j = 0; j < nnz_row; ++j) {
            ofs << i+lrow_begin << " " << col[j] << " " << mat[j] << "\n";
        }
    }
}

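// setVal calls f once for every local row on the device. Sketch of the expected
// callback (parameter names illustrative):
//
//     f(Long global_row, Long* cols, T* vals)
//
// where cols/vals point at that row's slots, i.e. rowOffset()[lrow+1] -
// rowOffset()[lrow] entries (exactly nnz per row after define(partition, nnz)).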
template <typename T, template<typename> class Allocator>
template <typename F>
void SpMatrix<T,Allocator>::setVal (F const& f)
{
    // xxxxx TODO: We can try to optimize this later by using shared memory.

    Long nlocalrows = this->numLocalRows();
    Long rowbegin = this->globalRowBegin();
    auto* pmat = m_data.mat.data();
    auto* pcolindex = m_data.col_index.data();
    auto* prowoffset = m_data.row_offset.data();
    ParallelFor(nlocalrows, [=] AMREX_GPU_DEVICE (int lrow) noexcept
    {
        f(rowbegin+lrow, pcolindex+prowoffset[lrow], pmat+prowoffset[lrow]);
    });
}

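// The diagonal is extracted lazily on the first call and cached in m_diagonal.
// After the matrix has been split for MPI (m_shifted == true), the local column
// indices are already in local numbering, so no offset is subtracted.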
template <typename T, template<typename> class Allocator>
AlgVector<T> const& SpMatrix<T,Allocator>::diagonalVector () const
{
    if (m_diagonal.empty()) {
        m_diagonal.define(this->partition());
        auto* AMREX_RESTRICT p = m_diagonal.data();
        auto const* AMREX_RESTRICT mat = m_data.mat.data();
        auto const* AMREX_RESTRICT col = m_data.col_index.data();
        auto const* AMREX_RESTRICT row = m_data.row_offset.data();
        auto offset = m_shifted ? Long(0) : m_row_begin;
        Long nrows = this->numLocalRows();
        ParallelFor(nrows, [=] AMREX_GPU_DEVICE (Long i)
        {
            T d = 0;
            for (Long j = row[i]; j < row[i+1]; ++j) {
                if (i == col[j] - offset) {
                    d = mat[j];
                    break;
                }
            }
            p[i] = d;
        });
    }
    return m_diagonal;
}

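// startComm begins the halo exchange needed by SpMV: it packs the locally owned
// x values requested by other ranks and posts nonblocking receives/sends for
// the x values referenced by the remote (off-diagonal) part of the matrix.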
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::startComm (AlgVector<T> const& x)
{
#ifndef AMREX_USE_MPI
    amrex::ignore_unused(x);
#else
    if (this->partition().numActiveProcs() <= 1) { return; }

    this->prepare_comm();

    auto const mpi_tag = ParallelDescriptor::SeqNum();
    auto const mpi_t_type = ParallelDescriptor::Mpi_typemap<T>::type(); // NOLINT(readability-qualified-auto)
    auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)

    auto const nrecvs = int(m_recv_from.size());
    if (nrecvs > 0) {
        m_recv_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_total_counts_recv);
        m_recv_reqs.resize(nrecvs, MPI_REQUEST_NULL);
        auto* p_recv = m_recv_buffer;
        for (int irecv = 0; irecv < nrecvs; ++irecv) {
            BL_MPI_REQUIRE(MPI_Irecv(p_recv,
                                     m_recv_counts[irecv], mpi_t_type,
                                     m_recv_from[irecv], mpi_tag, mpi_comm,
                                     &(m_recv_reqs[irecv])));
            p_recv += m_recv_counts[irecv];
        }
        AMREX_ASSERT(p_recv == m_recv_buffer + m_total_counts_recv);
    }

    auto const nsends = int(m_send_to.size());
    if (nsends > 0) {
        m_send_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_total_counts_send);

        pack_buffer(x);
        Gpu::streamSynchronize();

        m_send_reqs.resize(nsends, MPI_REQUEST_NULL);
        auto* p_send = m_send_buffer;
        for (int isend = 0; isend < nsends; ++isend) {
            auto count = m_send_counts[isend];
            BL_MPI_REQUIRE(MPI_Isend(p_send, count, mpi_t_type, m_send_to[isend],
                                     mpi_tag, mpi_comm, &(m_send_reqs[isend])));
            p_send += count;
        }
        AMREX_ASSERT(p_send == m_send_buffer + m_total_counts_send);
    }
#endif
}

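// finishComm completes the exchange started by startComm: wait for the
// receives, add the remote (off-diagonal) contributions to y via unpack_buffer,
// wait for the sends, then release the communication buffers.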
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::finishComm (AlgVector<T>& y)
{
    if (this->numLocalRows() == 0) { return; }

#ifndef AMREX_USE_MPI
    amrex::ignore_unused(y);
#else
    if (this->partition().numActiveProcs() <= 1) { return; }

    if ( ! m_recv_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_recv_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_recv_reqs.size()),
                                   m_recv_reqs.data(),
                                   mpi_statuses.data()));
    }

    unpack_buffer(y);

    if ( ! m_send_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_send_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_send_reqs.size()),
                                   m_send_reqs.data(),
                                   mpi_statuses.data()));
    }

    Gpu::streamSynchronize();
    The_Comms_Arena()->free(m_send_buffer);
    The_Comms_Arena()->free(m_recv_buffer);
    m_send_reqs.clear();
    m_recv_reqs.clear();
#endif
}

#ifdef AMREX_USE_MPI

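// prepare_comm runs once per matrix and sets up the SpMV communication pattern:
// it splits the CSR data into a purely local square block and a remote block,
// trims empty rows from the remote block (recording the remote-to-local row map
// m_rtol), determines which global columns must be received from which ranks,
// and exchanges those index lists so every rank also knows what it must send.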
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::prepare_comm ()
{
    if (m_comm_prepared) { return; }

    // This function needs to be safe when nnz is zero.

    // xxxxx TODO: check there is no int overflow.

    const int nprocs = ParallelContext::NProcsSub();

    // First, we need to split the matrix into two parts, a square matrix
    // for pure local operations and another part for remote operations.

    Long all_nnz = m_data.nnz;
    Long local_nnz;
    Gpu::DeviceVector<Long> pfsum(all_nnz);
    auto* p_pfsum = pfsum.data();
    auto row_begin = m_row_begin;
    auto row_end = m_row_end;
    if (m_data.nnz < Long(std::numeric_limits<int>::max())) {
        auto const* pcol = m_data.col_index.data();
        local_nnz = Scan::PrefixSum<int>(int(all_nnz),
            [=] AMREX_GPU_DEVICE (int i) -> int {
                return (pcol[i] >= row_begin &&
                        pcol[i] < row_end); },
            [=] AMREX_GPU_DEVICE (int i, int const& x) {
                p_pfsum[i] = x; },
            Scan::Type::exclusive, Scan::retSum);
    } else {
        auto const* pcol = m_data.col_index.data();
        local_nnz = Scan::PrefixSum<Long>(all_nnz,
            [=] AMREX_GPU_DEVICE (Long i) -> Long {
                return (pcol[i] >= row_begin &&
                        pcol[i] < row_end); },
            [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
                p_pfsum[i] = x; },
            Scan::Type::exclusive, Scan::retSum);
    }

    m_data.nnz = local_nnz;
    Long remote_nnz = all_nnz - local_nnz;
    m_data_remote.nnz = remote_nnz;

    Vector<Vector<Long>> unique_remote_cols_vv(nprocs);
    Vector<Long> unique_remote_cols_v;

    if (local_nnz != all_nnz) {
        m_data_remote.mat.resize(remote_nnz);
        m_data_remote.col_index.resize(remote_nnz);
        DVec<T> new_mat(local_nnz);
        DVec<Long> new_col(local_nnz);
        auto const* pmat = m_data.mat.data();
        auto const* pcol = m_data.col_index.data();
        auto* pmat_l = new_mat.data();
        auto* pcol_l = new_col.data();
        auto* pmat_r = m_data_remote.mat.data();
        auto* pcol_r = m_data_remote.col_index.data();
        ParallelFor(all_nnz, [=] AMREX_GPU_DEVICE (Long i)
        {
            auto ps = p_pfsum[i];
            auto local = (pcol[i] >= row_begin &&
                          pcol[i] < row_end);
            if (local) {
                pmat_l[ps] = pmat[i];
                pcol_l[ps] = pcol[i] - row_begin; // shift the column index to local
            } else {
                pmat_r[i-ps] = pmat[i];
                pcol_r[i-ps] = pcol[i];
            }
        });
        m_shifted = true;
        auto noffset = Long(m_data.row_offset.size());
        auto* pro = m_data.row_offset.data();
        m_data_remote.row_offset.resize(noffset);
        auto* pro_r = m_data_remote.row_offset.data();
        ParallelFor(noffset, [=] AMREX_GPU_DEVICE (Long i)
        {
            if (i < noffset-1) {
                auto ro_l = p_pfsum[pro[i]];
                pro_r[i] = pro[i] - ro_l;
                pro[i] = ro_l;
            } else {
                pro[i] = local_nnz;
                pro_r[i] = remote_nnz;
            }
        });
        Gpu::streamSynchronize();
        m_data.mat.swap(new_mat);
        m_data.col_index.swap(new_col);

        // In the remote part, it's expected that some rows don't have
        // non-zeros. So we trim them off.
        {
            Long old_size = m_data_remote.row_offset.size();
            m_rtol.resize(old_size-1);
            auto* p_rtol = m_rtol.data();
            DVec<Long> trimmed_row_offset(old_size);
            auto const* p_ro = m_data_remote.row_offset.data();
            auto* p_tro = trimmed_row_offset.data();
            Long new_size;
            if (old_size < Long(std::numeric_limits<int>::max())) {
                // This is basically std::unique.
                new_size = Scan::PrefixSum<int>(int(old_size),
                    [=] AMREX_GPU_DEVICE (int i) -> int {
                        if (i+1 < old_size) {
                            return (p_ro[i+1] > p_ro[i]);
                        } else {
                            return 1;
                        }
                    },
                    [=] AMREX_GPU_DEVICE (int i, int const& x) {
                        if (i == 0) {
                            p_tro[0] = 0;
                        } else if (p_ro[i] > p_ro[i-1]) {
                            p_tro[x] = p_ro[i];
                        }
                        if ((i+1 < old_size) &&
                            p_ro[i+1] > p_ro[i])
                        {
                            p_rtol[x] = i;
                        }
                    },
                    Scan::Type::exclusive, Scan::retSum);
            } else {
                // This is basically std::unique.
                new_size = Scan::PrefixSum<Long>(old_size,
                    [=] AMREX_GPU_DEVICE (Long i) -> Long {
                        if (i+1 < old_size) {
                            return (p_ro[i+1] > p_ro[i]);
                        } else {
                            return 1;
                        }
                    },
                    [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
                        if (i == 0) {
                            p_tro[0] = 0;
                        } else if (p_ro[i] > p_ro[i-1]) {
                            p_tro[x] = p_ro[i];
                        }
                        if ((i+1 < old_size) &&
                            p_ro[i+1] > p_ro[i])
                        {
                            p_rtol[x] = i;
                        }
                    },
                    Scan::Type::exclusive, Scan::retSum);
            }

            m_rtol.resize(new_size-1);
            trimmed_row_offset.resize(new_size);
#ifdef AMREX_USE_GPU
            m_rtol.shrink_to_fit();
            trimmed_row_offset.shrink_to_fit();
#endif
            m_data_remote.row_offset.swap(trimmed_row_offset);
        }

#ifdef AMREX_USE_GPU
        m_remote_cols.resize(m_data_remote.col_index.size());
        Gpu::copyAsync(Gpu::deviceToHost, m_data_remote.col_index.begin(),
                       m_data_remote.col_index.end(),
                       m_remote_cols.begin());
        Gpu::streamSynchronize();
#else
        auto const& m_remote_cols = m_data_remote.col_index;
#endif

        unique_remote_cols_v.resize(m_remote_cols.size());
        std::partial_sort_copy(m_remote_cols.begin(),
                               m_remote_cols.end(),
                               unique_remote_cols_v.begin(),
                               unique_remote_cols_v.end());
        amrex::RemoveDuplicates(unique_remote_cols_v);

        m_total_counts_recv = Long(unique_remote_cols_v.size());

        // Note that amrex::RemoveDuplicates sorts the data.
        auto const& rows = this->m_partition.dataVector();
        auto it = rows.cbegin();
        for (auto c : unique_remote_cols_v) {
            it = std::find_if(it, rows.cend(), [&] (auto x) { return x > c; });
            if (it != rows.cend()) {
                int iproc = int(std::distance(rows.cbegin(),it)) - 1;
                unique_remote_cols_vv[iproc].push_back(c);
            } else {
                amrex::Abort("SpMatrix::prepare_comm: how did this happen?");
            }
        }
    }

    // Need to make plans for MPI
    auto const mpi_tag = ParallelDescriptor::SeqNum();
    auto const mpi_int = ParallelDescriptor::Mpi_typemap<int>::type(); // NOLINT(readability-qualified-auto)
    auto const mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type(); // NOLINT(readability-qualified-auto)
    auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)

    amrex::Vector<int> need_from(nprocs);
    for (int iproc = 0; iproc < nprocs; ++iproc) {
        need_from[iproc] = unique_remote_cols_vv[iproc].empty() ? 0 : 1;
    }
    amrex::Vector<int> reduce_scatter_counts(nprocs,1);
    int nsends = 0;
    BL_MPI_REQUIRE(MPI_Reduce_scatter
                   (need_from.data(), &nsends, reduce_scatter_counts.data(),
                    mpi_int, MPI_SUM, mpi_comm));

    // nsends is the number of processes that need data from me.

    Vector<MPI_Request> mpi_requests;
    for (int iproc = 0; iproc < nprocs; ++iproc) {
        if ( ! unique_remote_cols_vv[iproc].empty()) {
            mpi_requests.push_back(MPI_REQUEST_NULL);
            // I need to let other processes know what I need from them.
            BL_MPI_REQUIRE(MPI_Isend(unique_remote_cols_vv[iproc].data(),
                                     int(unique_remote_cols_vv[iproc].size()),
                                     mpi_long, iproc, mpi_tag, mpi_comm,
                                     &(mpi_requests.back())));
            m_recv_from.push_back(iproc);
            m_recv_counts.push_back(int(unique_remote_cols_vv[iproc].size()));
        }
    }

    Vector<Vector<Long>> send_indices(nsends);
    m_total_counts_send = 0;
    for (int isend = 0; isend < nsends; ++isend) {
        MPI_Status mpi_status;
        BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
        int receiver = mpi_status.MPI_SOURCE;
        int count;
        BL_MPI_REQUIRE(MPI_Get_count(&mpi_status, mpi_long, &count));
        m_send_to.push_back(receiver);
        m_send_counts.push_back(count);
        send_indices[isend].resize(count);
        BL_MPI_REQUIRE(MPI_Recv(send_indices[isend].data(), count, mpi_long,
                                receiver, mpi_tag, mpi_comm, &mpi_status));
        m_total_counts_send += count;
    }

    m_send_indices.resize(m_total_counts_send);
    Gpu::PinnedVector<Long> send_indices_all;
    send_indices_all.reserve(m_total_counts_send);
    for (auto const& vl : send_indices) {
        for (auto x : vl) {
            send_indices_all.push_back(x);
        }
    }
    Gpu::copyAsync(Gpu::hostToDevice, send_indices_all.begin(), send_indices_all.end(),
                   m_send_indices.begin());
    Gpu::streamSynchronize();

    Vector<MPI_Status> mpi_statuses(mpi_requests.size());
    BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
                               mpi_statuses.data()));

    // Now we convert the remote indices from global to local.
    std::map<Long,Long> gtol;
    for (Long i = 0, N = Long(unique_remote_cols_v.size()); i < N; ++i) {
        gtol[unique_remote_cols_v[i]] = i;
    }
#ifdef AMREX_USE_GPU
    auto& cols = m_remote_cols;
#else
    auto& cols = m_data_remote.col_index;
#endif
    for (auto& c : cols) {
        c = gtol[c];
    }

#ifdef AMREX_USE_GPU
    Gpu::copyAsync(Gpu::hostToDevice, m_remote_cols.begin(), m_remote_cols.end(),
                   m_data_remote.col_index.data());
#endif

    m_comm_prepared = true;
}

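// pack_buffer gathers the owned entries of v listed in m_send_indices into the
// contiguous send buffer posted by startComm.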
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::pack_buffer (AlgVector<T> const& v)
{
    auto* pdst = m_send_buffer;
    auto* pidx = m_send_indices.data();
    auto const& vv = v.view();
    auto const nsends = Long(m_send_indices.size());
    ParallelFor(nsends, [=] AMREX_GPU_DEVICE (Long i)
    {
        pdst[i] = vv(pidx[i]);
    });
}

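// unpack_buffer applies the remote part of the matrix: for each nonempty remote
// row (mapped back to a local row through m_rtol) it accumulates
// sum_j mat[j] * recv[col[j]] into v.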
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::unpack_buffer (AlgVector<T>& v)
{
    auto const& csr = m_data_remote;
    if (csr.nnz > 0) {
        T const* AMREX_RESTRICT mat = csr.mat.data();
        auto const* AMREX_RESTRICT col = csr.col_index.data();
        auto const* AMREX_RESTRICT row = csr.row_offset.data();

        auto const* rtol = m_rtol.data();

        auto const* AMREX_RESTRICT px = m_recv_buffer;
        auto* AMREX_RESTRICT py = v.data();

        auto const nrr = Long(csr.row_offset.size())-1;
        ParallelFor(nrr, [=] AMREX_GPU_DEVICE (Long i)
        {
            T r = 0;
            for (Long j = row[i]; j < row[i+1]; ++j) {
                r += mat[j] * px[col[j]];
            }
            py[rtol[i]] += r;
        });
    }
}

#endif

}

#endif