Block-Structured AMR Software Framework
AMReX_SpMatrix.H
#ifndef AMREX_SP_MATRIX_H_
#define AMREX_SP_MATRIX_H_
#include <AMReX_Config.H>

#include <AMReX_AlgPartition.H>
#include <AMReX_AlgVector.H>
#include <AMReX_Gpu.H>
#include <AMReX_INT.H>
#include <AMReX_Scan.H>

#include <fstream>
#include <string>
#include <type_traits>

namespace amrex {

template <typename T, template<typename> class Allocator = DefaultAllocator>
class SpMatrix
{
public:
    using value_type = T;

    SpMatrix () = default;

    SpMatrix (AlgPartition partition, int nnz);

    SpMatrix (SpMatrix const&) = delete;
    SpMatrix& operator= (SpMatrix const&) = delete;

    SpMatrix (SpMatrix &&) = default;
    SpMatrix& operator= (SpMatrix &&) = default;

    ~SpMatrix () = default;

    void define (AlgPartition partition, int nnz);

    [[nodiscard]] AlgPartition const& partition () const { return m_partition; }

    [[nodiscard]] Long numLocalRows () const { return m_row_end - m_row_begin; }
    [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); }
    [[nodiscard]] Long numLocalNonZero () const { return m_data.nnz; }

    //! Inclusive global index begin.
    [[nodiscard]] Long globalRowBegin () const { return m_row_begin; }
    //! Exclusive global index end.
    [[nodiscard]] Long globalRowEnd () const { return m_row_end; }

    [[nodiscard]] T const* data () const { return m_data.mat.data(); }
    [[nodiscard]] T * data () { return m_data.mat.data(); }
    [[nodiscard]] Long const* columnIndex () const { return m_data.col_index.data(); }
    [[nodiscard]] Long * columnIndex () { return m_data.col_index.data(); }
    [[nodiscard]] Long const* rowOffset () const { return m_data.row_offset.data(); }
    [[nodiscard]] Long * rowOffset () { return m_data.row_offset.data(); }

    void printToFile (std::string const& file) const;

    template <typename F>
    void setVal (F const& f);

    [[nodiscard]] AlgVector<T> const& diagonalVector () const;

    template <typename U> friend void SpMV(AlgVector<U>& y, SpMatrix<U> const& A, AlgVector<U> const& x);

    //! Private function, but public for cuda.
    void define_doit (int nnz);

#ifdef AMREX_USE_MPI
    //! Private function, but public for cuda.
    void prepare_comm ();
    void pack_buffer (AlgVector<T> const& v);
    void unpack_buffer (AlgVector<T>& v);
#endif

private:

    void startComm (AlgVector<T> const& x);
    void finishComm (AlgVector<T>& y);

    template <class U> using DVec = PODVector<U,Allocator<U> >;

    template <template <typename> class V>
    struct CSR {
        V<T> mat;
        V<Long> col_index;
        V<Long> row_offset;
        Long nnz = -1;
    };

    AlgPartition m_partition;
    Long m_row_begin = 0;
    Long m_row_end = 0;
    CSR<DVec> m_data;

    // Cached lazily by the const diagonalVector() accessor, hence mutable.
    mutable AlgVector<T> m_diagonal;

#ifdef AMREX_USE_MPI
    CSR<DVec> m_data_remote;

#ifdef AMREX_USE_GPU
    Gpu::PinnedVector<Long> m_remote_cols;
#endif

    DVec<Long> m_rtol;

    Vector<int> m_send_to;
    Vector<int> m_send_counts;
    Gpu::DeviceVector<Long> m_send_indices;

    Vector<int> m_recv_from;
    Vector<int> m_recv_counts;

    Vector<MPI_Request> m_send_reqs;
    T* m_send_buffer = nullptr;
    Long m_total_counts_send = 0;

    Vector<MPI_Request> m_recv_reqs;
    T* m_recv_buffer = nullptr;
    Long m_total_counts_recv = 0;

    bool m_comm_prepared = false;
#endif

    bool m_shifted = false;
};
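// -----------------------------------------------------------------------------
// Usage sketch: assembling a 1D Laplacian with 3 reserved nonzeros per row and
// applying it with the SpMV friend function (implemented elsewhere in AMReX).
// The AlgPartition/AlgVector constructors shown are assumptions made for the
// sake of the example; see their headers for the exact interfaces.
//
//   Long n_global = ...;                      // total number of rows
//   AlgPartition partition(n_global);         // rows divided among the ranks
//   SpMatrix<Real> A(partition, 3);           // reserve 3 nonzeros per row
//   A.setVal([=] AMREX_GPU_DEVICE (Long row, Long* col, Real* val)
//   {
//       col[0] = row;
//       val[0] = Real(2.);
//       col[1] = (row > 0) ? row-1 : row;
//       val[1] = (row > 0) ? Real(-1.) : Real(0.);
//       col[2] = (row+1 < n_global) ? row+1 : row;
//       val[2] = (row+1 < n_global) ? Real(-1.) : Real(0.);
//   });
//   AlgVector<Real> x(partition), y(partition);
//   // ... fill x ...
//   SpMV(y, A, x);                            // y = A*x
// -----------------------------------------------------------------------------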

template <typename T, template<typename> class Allocator>
SpMatrix<T,Allocator>::SpMatrix (AlgPartition partition, int nnz)
    : m_partition(std::move(partition)),
      m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
      m_row_end(m_partition[ParallelDescriptor::MyProc()+1])
{
    static_assert(std::is_floating_point<T>::value, "SpMatrix is for floating point type only");
    define_doit(nnz);
}

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::define (AlgPartition partition, int nnz)
{
    m_partition = std::move(partition);
    m_row_begin = m_partition[ParallelDescriptor::MyProc()];
    m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
    define_doit(nnz);
}

template <typename T, template<typename> class Allocator>
void
SpMatrix<T,Allocator>::define_doit (int nnz)
{
    Long nlocalrows = this->numLocalRows();
    Long total_nnz = nlocalrows*nnz;
    m_data.mat.resize(total_nnz);
    m_data.col_index.resize(total_nnz);
    m_data.row_offset.resize(nlocalrows+1);
    m_data.nnz = total_nnz;

    auto* poffset = m_data.row_offset.data();
    ParallelFor(nlocalrows+1, [=] AMREX_GPU_DEVICE (Long lrow) noexcept
    {
        poffset[lrow] = lrow*nnz;
    });
}
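// -----------------------------------------------------------------------------
// Note: define()/define_doit() reserve a fixed number of nonzeros per local
// row, so the CSR row offsets are simply multiples of nnz.  For example, with
// 4 local rows and nnz = 3,
//
//     row_offset = {0, 3, 6, 9, 12}
//
// and mat/col_index each hold 12 slots, 3 per row.  Rows that need fewer
// entries can leave the extra slots as zero-valued padding.
// -----------------------------------------------------------------------------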

template <typename T, template<typename> class Allocator>
void
SpMatrix<T,Allocator>::printToFile (std::string const& file) const
{
    // xxxxx TODO: This function only prints square part of the local rows,
    // not the full rows.

#ifdef AMREX_USE_GPU
    CSR<Gpu::PinnedVector> csr; // host (pinned) copy of the device CSR data
    csr.mat.resize(m_data.mat.size());
    csr.col_index.resize(m_data.col_index.size());
    csr.row_offset.resize(m_data.row_offset.size());
    Gpu::copyAsync(Gpu::deviceToHost, m_data.mat.begin(), m_data.mat.end(), csr.mat.begin());
    Gpu::copyAsync(Gpu::deviceToHost, m_data.col_index.begin(), m_data.col_index.end(), csr.col_index.begin());
    Gpu::copyAsync(Gpu::deviceToHost, m_data.row_offset.begin(), m_data.row_offset.end(), csr.row_offset.begin());
    csr.nnz = m_data.nnz;
    Gpu::streamSynchronize();
#else
    auto const& csr = m_data;
#endif

    const Long lrow_begin = m_partition[ParallelDescriptor::MyProc()];
    std::ofstream ofs(file+"."+std::to_string(ParallelDescriptor::MyProc()));
    ofs << m_row_begin << " " << m_row_end << " " << csr.nnz << "\n";
    for (Long i = 0, nrows = numLocalRows(); i < nrows; ++i) {
        Long nnz_row = csr.row_offset[i+1] - csr.row_offset[i];
        T const* mat = csr.mat.data() + csr.row_offset[i];
        Long const* col = csr.col_index.data() + csr.row_offset[i];
        for (Long j = 0; j < nnz_row; ++j) {
            ofs << i+lrow_begin << " " << col[j] << " " << mat[j] << "\n";
        }
    }
}
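// -----------------------------------------------------------------------------
// Note: printToFile() writes one file per rank, named "<file>.<rank>".  The
// first line holds the global row range and the local number of nonzeros;
// every following line is one "row column value" triplet.  For example, a rank
// owning rows [0,2) of the 1D Laplacian sketched above would write
//
//     0 2 6
//     0 0 2
//     0 1 -1
//     ...
// -----------------------------------------------------------------------------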

template <typename T, template<typename> class Allocator>
template <typename F>
void SpMatrix<T,Allocator>::setVal (F const& f)
{
    // xxxxx TODO: We can try to optimize this later by using shared memory.

    Long nlocalrows = this->numLocalRows();
    Long rowbegin = this->globalRowBegin();
    auto* pmat = m_data.mat.data();
    auto* pcolindex = m_data.col_index.data();
    auto* prowoffset = m_data.row_offset.data();
    ParallelFor(nlocalrows, [=] AMREX_GPU_DEVICE (int lrow) noexcept
    {
        f(rowbegin+lrow, pcolindex+prowoffset[lrow], pmat+prowoffset[lrow]);
    });
}
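// -----------------------------------------------------------------------------
// Note: the callback given to setVal() is invoked once per local row (on the
// device in GPU builds) as f(global_row, row_cols, row_vals), where row_cols
// and row_vals point at that row's reserved slots.  A minimal sketch, where
// nnz_per_row stands for whatever value was passed to define():
//
//   A.setVal([=] AMREX_GPU_DEVICE (Long row, Long* col, Real* val)
//   {
//       for (int k = 0; k < nnz_per_row; ++k) {
//           col[k] = /* global column */;
//           val[k] = /* matrix entry  */;
//       }
//   });
// -----------------------------------------------------------------------------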

template <typename T, template<typename> class Allocator>
AlgVector<T> const& SpMatrix<T,Allocator>::diagonalVector () const
{
    if (m_diagonal.empty()) {
        m_diagonal.define(this->partition());
        auto* AMREX_RESTRICT p = m_diagonal.data();
        auto const* AMREX_RESTRICT mat = m_data.mat.data();
        auto const* AMREX_RESTRICT col = m_data.col_index.data();
        auto const* AMREX_RESTRICT row = m_data.row_offset.data();
        auto offset = m_shifted ? Long(0) : m_row_begin;
        Long nrows = this->numLocalRows();
        ParallelFor(nrows, [=] AMREX_GPU_DEVICE (Long i)
        {
            T d = 0;
            for (Long j = row[i]; j < row[i+1]; ++j) {
                if (i == col[j] - offset) {
                    d = mat[j];
                    break;
                }
            }
            p[i] = d;
        });
    }
    return m_diagonal;
}
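// -----------------------------------------------------------------------------
// Note: the diagonal is extracted lazily on first use and cached in m_diagonal
// (declared mutable so this const accessor can fill it).  Once prepare_comm()
// has shifted the local column indices (m_shifted == true), the diagonal of
// local row i sits at column i; otherwise it sits at global column
// i + m_row_begin.  That is what the "offset" variable above accounts for.
// -----------------------------------------------------------------------------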

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::startComm (AlgVector<T> const& x)
{
#ifndef AMREX_USE_MPI
    amrex::ignore_unused(x);
#else
    if (this->partition().numActiveProcs() <= 1) { return; }

    this->prepare_comm();

    auto const mpi_tag = ParallelDescriptor::SeqNum();
    auto const mpi_t_type = ParallelDescriptor::Mpi_typemap<T>::type(); // NOLINT(readability-qualified-auto)
    auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)

    auto const nrecvs = int(m_recv_from.size());
    if (nrecvs > 0) {
        m_recv_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_total_counts_recv);
        m_recv_reqs.resize(nrecvs, MPI_REQUEST_NULL);
        auto* p_recv = m_recv_buffer;
        for (int irecv = 0; irecv < nrecvs; ++irecv) {
            BL_MPI_REQUIRE(MPI_Irecv(p_recv,
                                     m_recv_counts[irecv], mpi_t_type,
                                     m_recv_from[irecv], mpi_tag, mpi_comm,
                                     &(m_recv_reqs[irecv])));
            p_recv += m_recv_counts[irecv];
        }
        AMREX_ASSERT(p_recv == m_recv_buffer + m_total_counts_recv);
    }

    auto const nsends = int(m_send_to.size());
    if (nsends > 0) {
        m_send_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_total_counts_send);

        pack_buffer(x);
        Gpu::streamSynchronize();

        m_send_reqs.resize(nsends, MPI_REQUEST_NULL);
        auto* p_send = m_send_buffer;
        for (int isend = 0; isend < nsends; ++isend) {
            auto count = m_send_counts[isend];
            BL_MPI_REQUIRE(MPI_Isend(p_send, count, mpi_t_type, m_send_to[isend],
                                     mpi_tag, mpi_comm, &(m_send_reqs[isend])));
            p_send += count;
        }
        AMREX_ASSERT(p_send == m_send_buffer + m_total_counts_send);
    }
#endif
}

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::finishComm (AlgVector<T>& y)
{
    if (this->numLocalRows() == 0) { return; }

#ifndef AMREX_USE_MPI
    amrex::ignore_unused(y);
#else
    if (this->partition().numActiveProcs() <= 1) { return; }

    if ( ! m_recv_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_recv_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_recv_reqs.size()),
                                   m_recv_reqs.data(),
                                   mpi_statuses.data()));
    }

    unpack_buffer(y);

    if ( ! m_send_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_send_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_send_reqs.size()),
                                   m_send_reqs.data(),
                                   mpi_statuses.data()));
    }

    Gpu::streamSynchronize();
    The_Comms_Arena()->free(m_send_buffer);
    The_Comms_Arena()->free(m_recv_buffer);
    m_send_reqs.clear();
    m_recv_reqs.clear();
#endif
}
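// -----------------------------------------------------------------------------
// Note: startComm()/finishComm() implement the halo exchange a distributed
// matrix-vector product needs; the SpMV friend function presumably drives them
// in this order:
//
//   1. startComm(x): post MPI_Irecv for the remote entries of x this rank
//      needs, pack the locally owned entries other ranks need into
//      m_send_buffer, and post MPI_Isend.
//   2. Apply the purely local (square, column-shifted) block m_data to the
//      local entries of x.
//   3. finishComm(y): wait for the receives, accumulate the remote block
//      m_data_remote times the received entries into y, wait for the sends,
//      and release the communication buffers.
// -----------------------------------------------------------------------------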

#ifdef AMREX_USE_MPI

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::prepare_comm ()
{
    if (m_comm_prepared) { return; }

    // This function needs to be safe when nnz is zero.

    // xxxxx TODO: check there is no int overflow.

    const int nprocs = ParallelContext::NProcsSub();

    // First, we need to split the matrix into two parts, a square matrix
    // for pure local operations and another part for remote operations.

    Long all_nnz = m_data.nnz;
    Long local_nnz;
    Gpu::DeviceVector<Long> pfsum(all_nnz);
    auto* p_pfsum = pfsum.data();
    auto row_begin = m_row_begin;
    auto row_end = m_row_end;
    if (m_data.nnz < Long(std::numeric_limits<int>::max())) {
        auto const* pcol = m_data.col_index.data();
        local_nnz = Scan::PrefixSum<int>(int(all_nnz),
                                         [=] AMREX_GPU_DEVICE (int i) -> int {
                                             return (pcol[i] >= row_begin &&
                                                     pcol[i] < row_end); },
                                         [=] AMREX_GPU_DEVICE (int i, int const& x) {
                                             p_pfsum[i] = x; },
                                         Scan::Type::exclusive, Scan::retSum);
    } else {
        auto const* pcol = m_data.col_index.data();
        local_nnz = Scan::PrefixSum<Long>(all_nnz,
                                          [=] AMREX_GPU_DEVICE (Long i) -> Long {
                                              return (pcol[i] >= row_begin &&
                                                      pcol[i] < row_end); },
                                          [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
                                              p_pfsum[i] = x; },
                                          Scan::Type::exclusive, Scan::retSum);
    }
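    // The exclusive prefix sum above counts, for each nonzero, how many
    // *local* nonzeros (column inside [row_begin, row_end)) precede it, and
    // returns the total as local_nnz.  For example, for the local/remote
    // classification {1, 0, 1, 1, 0},
    //
    //     p_pfsum = {0, 1, 1, 2, 3}   and   local_nnz = 3,
    //
    // so local entry i moves to slot p_pfsum[i] of the local CSR and remote
    // entry i moves to slot i - p_pfsum[i] of the remote CSR below.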

    m_data.nnz = local_nnz;
    Long remote_nnz = all_nnz - local_nnz;
    m_data_remote.nnz = remote_nnz;

    Vector<Vector<Long>> unique_remote_cols_vv(nprocs);
    Vector<Long> unique_remote_cols_v;

    if (local_nnz != all_nnz) {
        m_data_remote.mat.resize(remote_nnz);
        m_data_remote.col_index.resize(remote_nnz);
        DVec<T> new_mat(local_nnz);
        DVec<Long> new_col(local_nnz);
        auto const* pmat = m_data.mat.data();
        auto const* pcol = m_data.col_index.data();
        auto* pmat_l = new_mat.data();
        auto* pcol_l = new_col.data();
        auto* pmat_r = m_data_remote.mat.data();
        auto* pcol_r = m_data_remote.col_index.data();
        ParallelFor(all_nnz, [=] AMREX_GPU_DEVICE (Long i)
        {
            auto ps = p_pfsum[i];
            auto local = (pcol[i] >= row_begin &&
                          pcol[i] < row_end);
            if (local) {
                pmat_l[ps] = pmat[i];
                pcol_l[ps] = pcol[i] - row_begin; // shift the column index to local
            } else {
                pmat_r[i-ps] = pmat[i];
                pcol_r[i-ps] = pcol[i];
            }
        });
        m_shifted = true;
        auto noffset = Long(m_data.row_offset.size());
        auto* pro = m_data.row_offset.data();
        m_data_remote.row_offset.resize(noffset);
        auto* pro_r = m_data_remote.row_offset.data();
        ParallelFor(noffset, [=] AMREX_GPU_DEVICE (Long i)
        {
            if (i < noffset-1) {
                auto ro_l = p_pfsum[pro[i]];
                pro_r[i] = pro[i] - ro_l;
                pro[i] = ro_l;
            } else {
                pro[i] = local_nnz;
                pro_r[i] = remote_nnz;
            }
        });
        Gpu::streamSynchronize();
        m_data.mat.swap(new_mat);
        m_data.col_index.swap(new_col);

        // In the remote part, it's expected that some rows don't have
        // non-zeros. So we trim them off.
        {
            Long old_size = m_data_remote.row_offset.size();
            m_rtol.resize(old_size-1);
            auto* p_rtol = m_rtol.data();
            DVec<Long> trimmed_row_offset(old_size);
            auto const* p_ro = m_data_remote.row_offset.data();
            auto* p_tro = trimmed_row_offset.data();
            Long new_size;
            if (old_size < Long(std::numeric_limits<int>::max())) {
                // This is basically std::unique.
                new_size = Scan::PrefixSum<int>(int(old_size),
                    [=] AMREX_GPU_DEVICE (int i) -> int {
                        if (i+1 < old_size) {
                            return (p_ro[i+1] > p_ro[i]);
                        } else {
                            return 1;
                        }
                    },
                    [=] AMREX_GPU_DEVICE (int i, int const& x) {
                        if (i == 0) {
                            p_tro[0] = 0;
                        } else if (p_ro[i] > p_ro[i-1]) {
                            p_tro[x] = p_ro[i];
                        }
                        if ((i+1 < old_size) &&
                            p_ro[i+1] > p_ro[i])
                        {
                            p_rtol[x] = i;
                        }
                    },
                    Scan::Type::exclusive, Scan::retSum);
            } else {
                // This is basically std::unique.
                new_size = Scan::PrefixSum<Long>(old_size,
                    [=] AMREX_GPU_DEVICE (Long i) -> Long {
                        if (i+1 < old_size) {
                            return (p_ro[i+1] > p_ro[i]);
                        } else {
                            return 1;
                        }
                    },
                    [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
                        if (i == 0) {
                            p_tro[0] = 0;
                        } else if (p_ro[i] > p_ro[i-1]) {
                            p_tro[x] = p_ro[i];
                        }
                        if ((i+1 < old_size) &&
                            p_ro[i+1] > p_ro[i])
                        {
                            p_rtol[x] = i;
                        }
                    },
                    Scan::Type::exclusive, Scan::retSum);
            }

            m_rtol.resize(new_size-1);
            trimmed_row_offset.resize(new_size);
#ifdef AMREX_USE_GPU
            m_rtol.shrink_to_fit();
            trimmed_row_offset.shrink_to_fit();
#endif
            m_data_remote.row_offset.swap(trimmed_row_offset);
        }
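        // After the trim, the remote CSR keeps only rows that actually have
        // remote nonzeros, and m_rtol maps each trimmed row back to its local
        // row index.  For example, with 4 local rows and remote
        // row_offset = {0, 0, 2, 2, 3},
        //
        //     trimmed row_offset = {0, 2, 3}   and   m_rtol = {1, 3},
        //
        // i.e. trimmed row 0 is local row 1 (two remote nonzeros) and trimmed
        // row 1 is local row 3 (one remote nonzero).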

#ifdef AMREX_USE_GPU
        m_remote_cols.resize(m_data_remote.col_index.size());
        Gpu::copyAsync(Gpu::deviceToHost, m_data_remote.col_index.begin(),
                       m_data_remote.col_index.end(),
                       m_remote_cols.begin());
        Gpu::streamSynchronize();
#else
        auto const& m_remote_cols = m_data_remote.col_index;
#endif

        unique_remote_cols_v.resize(m_remote_cols.size());
        std::partial_sort_copy(m_remote_cols.begin(),
                               m_remote_cols.end(),
                               unique_remote_cols_v.begin(),
                               unique_remote_cols_v.end());
        amrex::RemoveDuplicates(unique_remote_cols_v);

        m_total_counts_recv = Long(unique_remote_cols_v.size());

        // Note that amrex::RemoveDuplicates sorts the data.
        auto const& rows = this->m_partition.dataVector();
        auto it = rows.cbegin();
        for (auto c : unique_remote_cols_v) {
            it = std::find_if(it, rows.cend(), [&] (auto x) { return x > c; });
            if (it != rows.cend()) {
                int iproc = int(std::distance(rows.cbegin(),it)) - 1;
                unique_remote_cols_vv[iproc].push_back(c);
            } else {
                amrex::Abort("SpMatrix::prepare_comm: how did this happen?");
            }
        }
    }

    // Need to make plans for MPI
    auto const mpi_tag = ParallelDescriptor::SeqNum();
    auto const mpi_int = ParallelDescriptor::Mpi_typemap<int>::type(); // NOLINT(readability-qualified-auto)
    auto const mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type(); // NOLINT(readability-qualified-auto)
    auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)

    amrex::Vector<int> need_from(nprocs);
    for (int iproc = 0; iproc < nprocs; ++iproc) {
        need_from[iproc] = unique_remote_cols_vv[iproc].empty() ? 0 : 1;
    }
    amrex::Vector<int> reduce_scatter_counts(nprocs,1);
    int nsends = 0;
    BL_MPI_REQUIRE(MPI_Reduce_scatter
                   (need_from.data(), &nsends, reduce_scatter_counts.data(),
                    mpi_int, MPI_SUM, mpi_comm));

    // nsends is the number of processes that need data from me.

    Vector<MPI_Request> mpi_requests;
    for (int iproc = 0; iproc < nprocs; ++iproc) {
        if ( ! unique_remote_cols_vv[iproc].empty()) {
            mpi_requests.push_back(MPI_REQUEST_NULL);
            // I need to let other processes know what I need from them.
            BL_MPI_REQUIRE(MPI_Isend(unique_remote_cols_vv[iproc].data(),
                                     int(unique_remote_cols_vv[iproc].size()),
                                     mpi_long, iproc, mpi_tag, mpi_comm,
                                     &(mpi_requests.back())));
            m_recv_from.push_back(iproc);
            m_recv_counts.push_back(int(unique_remote_cols_vv[iproc].size()));
        }
    }

    Vector<Vector<Long>> send_indices(nsends);
    m_total_counts_send = 0;
    for (int isend = 0; isend < nsends; ++isend) {
        MPI_Status mpi_status;
        BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
        int receiver = mpi_status.MPI_SOURCE;
        int count;
        BL_MPI_REQUIRE(MPI_Get_count(&mpi_status, mpi_long, &count));
        m_send_to.push_back(receiver);
        m_send_counts.push_back(count);
        send_indices[isend].resize(count);
        BL_MPI_REQUIRE(MPI_Recv(send_indices[isend].data(), count, mpi_long,
                                receiver, mpi_tag, mpi_comm, &mpi_status));
        m_total_counts_send += count;
    }

    m_send_indices.resize(m_total_counts_send);
    Gpu::PinnedVector<Long> send_indices_all;
    send_indices_all.reserve(m_total_counts_send);
    for (auto const& vl : send_indices) {
        for (auto x : vl) {
            send_indices_all.push_back(x);
        }
    }
    Gpu::copyAsync(Gpu::hostToDevice, send_indices_all.begin(), send_indices_all.end(),
                   m_send_indices.begin());
    Gpu::streamSynchronize();

    Vector<MPI_Status> mpi_statuses(mpi_requests.size());
    BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
                               mpi_statuses.data()));

    // Now we convert the remote indices from global to local.
    std::map<Long,Long> gtol;
    for (Long i = 0, N = Long(unique_remote_cols_v.size()); i < N; ++i) {
        gtol[unique_remote_cols_v[i]] = i;
    }
#ifdef AMREX_USE_GPU
    auto& cols = m_remote_cols;
#else
    auto& cols = m_data_remote.col_index;
#endif
    for (auto& c : cols) {
        c = gtol[c];
    }

#ifdef AMREX_USE_GPU
    Gpu::copyAsync(Gpu::hostToDevice, m_remote_cols.begin(), m_remote_cols.end(),
                   m_data_remote.col_index.data());
#endif

    m_comm_prepared = true;
}
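// -----------------------------------------------------------------------------
// Note: after prepare_comm() returns,
//   * m_data holds the square local block with column indices shifted to local
//     numbering (m_shifted == true whenever a remote part exists),
//   * m_data_remote holds the off-processor block, trimmed to rows with remote
//     nonzeros (m_rtol maps them back to local rows) and with column indices
//     renumbered 0..m_total_counts_recv-1 so they index the receive buffer,
//   * m_send_to/m_send_counts/m_send_indices describe which locally owned
//     entries of x go to which ranks, and m_recv_from/m_recv_counts describe
//     the matching receives.
// This is the state that startComm(), pack_buffer() and unpack_buffer() use.
// -----------------------------------------------------------------------------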

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::pack_buffer (AlgVector<T> const& v)
{
    auto* pdst = m_send_buffer;
    auto* pidx = m_send_indices.data();
    auto const& vv = v.view();
    auto const nsends = Long(m_send_indices.size());
    ParallelFor(nsends, [=] AMREX_GPU_DEVICE (Long i)
    {
        pdst[i] = vv(pidx[i]);
    });
}

template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::unpack_buffer (AlgVector<T>& v)
{
    auto const& csr = m_data_remote;
    if (csr.nnz > 0) {
        T const* AMREX_RESTRICT mat = csr.mat.data();
        auto const* AMREX_RESTRICT col = csr.col_index.data();
        auto const* AMREX_RESTRICT row = csr.row_offset.data();

        auto const* rtol = m_rtol.data();

        auto const* AMREX_RESTRICT px = m_recv_buffer;
        auto * AMREX_RESTRICT py = v.data();

        auto const nrr = Long(csr.row_offset.size())-1;
        ParallelFor(nrr, [=] AMREX_GPU_DEVICE (Long i)
        {
            T r = 0;
            for (Long j = row[i]; j < row[i+1]; ++j) {
                r += mat[j] * px[col[j]];
            }
            py[rtol[i]] += r;
        });
    }
}
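// -----------------------------------------------------------------------------
// Note: pack_buffer() gathers the entries of x listed in m_send_indices into
// m_send_buffer, and unpack_buffer() applies the remote block to the received
// entries, accumulating into the rows recorded in m_rtol:
//
//     y[rtol[i]] += sum over j in [row[i], row[i+1]) of mat[j]*recv_buffer[col[j]]
//
// so the full product is the local contribution (computed elsewhere) plus this
// remote contribution.
// -----------------------------------------------------------------------------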

#endif

}

#endif