1#ifndef AMREX_SP_MATRIX_H_
2#define AMREX_SP_MATRIX_H_
3#include <AMReX_Config.H>
11#if defined(AMREX_USE_CUDA)
13#elif defined(AMREX_USE_HIP)
14# include <rocsparse/rocsparse.h>
15#elif defined(AMREX_USE_SYCL)
16# include <mkl_version.h>
17# include <oneapi/mkl/spblas.hpp>
23#include <unordered_map>
47 explicit operator bool()
const {
return b; }
53 explicit operator bool()
const {
return b; }
203 return m_csr.
mat.data();
258 template <
typename F>
291 template <
typename U,
template<
typename>
class M,
typename N>
friend
294 template <
typename U,
template<
typename>
class M>
friend
297 template <
typename U>
friend class AMG;
311 template <
typename I>
313 Long nentries,
Long const* row_offset);
331 void set_num_neighbors ();
335 Long m_row_begin = 0;
337 Long m_col_begin = 0;
344 bool m_split =
false;
377 int m_num_neighbors = -1;
424 template <
typename C>
436void transpose (CsrView<T>
const& csrt, CsrView<T const>
const& csr)
438 Long nrows = csr.nrows;
439 Long ncols = csrt.nrows;
442 if (nrows <= 0 || ncols <= 0 || nnz <= 0) {
444 auto* p = csrt.row_offset;
452#if defined(AMREX_USE_CUDA)
454 cusparseHandle_t handle;
458 cudaDataType data_type;
459 if constexpr (std::is_same_v<T,float>) {
460 data_type = CUDA_R_32F;
461 }
else if constexpr (std::is_same_v<T,double>) {
462 data_type = CUDA_R_64F;
463 }
else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
464 data_type = CUDA_C_32F;
465 }
else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
466 data_type = CUDA_C_64F;
468 amrex::Abort(
"SpMatrix transpose: unsupported data type");
472 ncols <
Long(std::numeric_limits<int>::max()) &&
473 nnz <
Long(std::numeric_limits<int>::max()));
475 auto* csr_col_index = (
int*)
The_Arena()->
alloc(csr.nnz*
sizeof(
int));
477 auto* csrt_col_index = (
int*)
The_Arena()->
alloc(csrt.nnz*
sizeof(
int));
483 csr_col_index[i] = int(csr.col_index[i]);
485 if (i < csr.nrows+1) {
486 csr_row_offset[i] = int(csr.row_offset[i]);
490 std::size_t buffer_size;
492 cusparseCsr2cscEx2_bufferSize(handle,
int(nrows),
int(ncols),
int(nnz),
493 csr.mat, csr_row_offset, csr_col_index,
494 csrt.mat, csrt_row_offset, csrt_col_index,
495 data_type, CUSPARSE_ACTION_NUMERIC,
496 CUSPARSE_INDEX_BASE_ZERO,
497 CUSPARSE_CSR2CSC_ALG1,
503 cusparseCsr2cscEx2(handle,
int(nrows),
int(ncols),
int(nnz),
504 csr.mat, csr_row_offset, csr_col_index,
505 csrt.mat, csrt_row_offset, csrt_col_index,
506 data_type, CUSPARSE_ACTION_NUMERIC,
507 CUSPARSE_INDEX_BASE_ZERO,
508 CUSPARSE_CSR2CSC_ALG1,
514 csrt.col_index[i] = csrt_col_index[i];
516 if (i < csrt.nrows+1) {
517 csrt.row_offset[i] = csrt_row_offset[i];
529#elif defined(AMREX_USE_HIP)
531 rocsparse_handle handle;
532 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_create_handle(&handle));
533 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle,
Gpu::gpuStream()));
536 ncols <
Long(std::numeric_limits<rocsparse_int>::max()) &&
537 nnz <
Long(std::numeric_limits<rocsparse_int>::max()));
539 rocsparse_int* csr_col_index;
540 rocsparse_int* csr_row_offset;
541 rocsparse_int* csrt_col_index;
542 rocsparse_int* csrt_row_offset;
543 if (std::is_same_v<rocsparse_int,Long>) {
544 csr_col_index = (rocsparse_int*)csr.col_index;
545 csr_row_offset = (rocsparse_int*)csr.row_offset;
546 csrt_col_index = (rocsparse_int*)csrt.col_index;
547 csrt_row_offset = (rocsparse_int*)csrt.row_offset;
549 csr_col_index = (rocsparse_int*)
The_Arena()->
alloc(csr.nnz*
sizeof(rocsparse_int));
550 csr_row_offset = (rocsparse_int*)
The_Arena()->
alloc((csr.nrows+1)*
sizeof(rocsparse_int));
551 csrt_col_index = (rocsparse_int*)
The_Arena()->
alloc(csrt.nnz*
sizeof(rocsparse_int));
552 csrt_row_offset = (rocsparse_int*)
The_Arena()->
alloc((csrt.nrows+1)*
sizeof(rocsparse_int));
556 csr_col_index[i] = rocsparse_int(csr.col_index[i]);
558 if (i < csr.nrows+1) {
559 csr_row_offset[i] = rocsparse_int(csr.row_offset[i]);
564 std::size_t buffer_size;
565 AMREX_ROCSPARSE_SAFE_CALL(
566 rocsparse_csr2csc_buffer_size(handle, rocsparse_int(nrows),
567 rocsparse_int(ncols), rocsparse_int(nnz),
568 csr_row_offset, csr_col_index,
569 rocsparse_action_numeric,
574 if constexpr (std::is_same_v<T,float>) {
575 AMREX_ROCSPARSE_SAFE_CALL(
576 rocsparse_scsr2csc(handle, rocsparse_int(nrows),
577 rocsparse_int(ncols), rocsparse_int(nnz),
578 csr.mat, csr_row_offset, csr_col_index,
579 csrt.mat, csrt_col_index, csrt_row_offset,
580 rocsparse_action_numeric,
581 rocsparse_index_base_zero,
583 }
else if constexpr (std::is_same_v<T,double>) {
584 AMREX_ROCSPARSE_SAFE_CALL(
585 rocsparse_dcsr2csc(handle, rocsparse_int(nrows),
586 rocsparse_int(ncols), rocsparse_int(nnz),
587 csr.mat, csr_row_offset, csr_col_index,
588 csrt.mat, csrt_col_index, csrt_row_offset,
589 rocsparse_action_numeric,
590 rocsparse_index_base_zero,
592 }
else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
593 AMREX_ROCSPARSE_SAFE_CALL(
594 rocsparse_ccsr2csc(handle, rocsparse_int(nrows),
595 rocsparse_int(ncols), rocsparse_int(nnz),
596 (rocsparse_float_complex*)csr.mat, csr_row_offset, csr_col_index,
597 (rocsparse_float_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
598 rocsparse_action_numeric,
599 rocsparse_index_base_zero,
601 }
else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
602 AMREX_ROCSPARSE_SAFE_CALL(
603 rocsparse_zcsr2csc(handle, rocsparse_int(nrows),
604 rocsparse_int(ncols), rocsparse_int(nnz),
605 (rocsparse_double_complex*)csr.mat, csr_row_offset, csr_col_index,
606 (rocsparse_double_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
607 rocsparse_action_numeric,
608 rocsparse_index_base_zero,
611 amrex::Abort(
"SpMatrix transpose: unsupported data type");
614 if (! std::is_same_v<rocsparse_int,Long>) {
618 csrt.col_index[i] = csrt_col_index[i];
620 if (i < csrt.nrows+1) {
621 csrt.row_offset[i] = csrt_row_offset[i];
627 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_destroy_handle(handle));
629 if (! std::is_same_v<rocsparse_int,Long>) {
636#elif defined(AMREX_USE_SYCL)
638 mkl::sparse::matrix_handle_t handle_in{};
639 mkl::sparse::matrix_handle_t handle_out{};
640 mkl::sparse::init_matrix_handle(&handle_in);
641 mkl::sparse::init_matrix_handle(&handle_out);
643#if defined(INTEL_MKL_VERSION) && (INTEL_MKL_VERSION < 20250300)
645 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols,
646 mkl::index_base::zero, (
Long*)csr.row_offset,
647 (
Long*)csr.col_index, (T*)csr.mat);
648 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows,
649 mkl::index_base::zero, (
Long*)csrt.row_offset,
650 (
Long*)csrt.col_index, (T*)csrt.mat);
652 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols, nnz,
653 mkl::index_base::zero, (
Long*)csr.row_offset,
654 (
Long*)csr.col_index, (T*)csr.mat);
655 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows, nnz,
656 mkl::index_base::zero, (
Long*)csrt.row_offset,
657 (
Long*)csrt.col_index, (T*)csrt.mat);
660 mkl::sparse::omatcopy(Gpu::Device::streamQueue(), mkl::transpose::trans,
661 handle_in, handle_out);
663 mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_in);
664 auto ev = mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_out);
673 auto* p = csrt.row_offset;
679#pragma omp parallel for
681 for (
Long i = 0; i < nnz; ++i) {
682 auto col = csr.col_index[i];
684#pragma omp atomic update
690 Vector<Long> current_pos(ncols+1);
692 for (
Long i = 0; i < ncols; ++i) {
694 current_pos[i+1] = p[i+1];
699 for (
Long i = 0; i < nrows; ++i) {
700 for (
Long idx = csr.row_offset[i]; idx < csr.row_offset[i+1]; ++idx) {
701 auto col = csr.col_index[idx];
702 Long dest = current_pos[col]++;
703 csrt.mat[dest] = csr.mat[idx];
704 csrt.col_index[dest] = i;
712template <
typename T,
template<
typename>
class Allocator>
714 : m_partition(std::move(partition)),
715 m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
716 m_row_end(m_partition[ParallelDescriptor::MyProc()+1])
721template <
typename T,
template<
typename>
class Allocator>
723 : m_partition(std::move(partition)),
724 m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
725 m_row_end(m_partition[ParallelDescriptor::MyProc()+1]),
727 m_csr(std::move(csr))
730template <
typename T,
template<
typename>
class Allocator>
733 m_partition = std::move(partition);
736 define_doit(nnz_per_row);
739template <
typename T,
template<
typename>
class Allocator>
743 m_partition = std::move(partition);
747 m_csr = std::move(csr);
748 if (! is_sorted) { m_csr.sort(); }
751template <
typename T,
template<
typename>
class Allocator>
755 if (nnz_per_row <= 0) {
return; };
759 Long nlocalrows = this->numLocalRows();
760 m_nnz = nlocalrows*nnz_per_row;
761 m_csr.mat.resize(m_nnz);
762 m_csr.col_index.resize(m_nnz);
763 m_csr.row_offset.resize(nlocalrows+1);
766 auto* poffset = m_csr.row_offset.data();
769 poffset[lrow] = lrow*nnz_per_row;
773template <
typename T,
template<
typename>
class Allocator>
776 Long const* col_index,
Long nentries,
782 m_partition = std::move(partition);
790 Long nlocalrows = this->numLocalRows();
791 m_csr.mat.resize(nentries);
792 m_csr.col_index.resize(nentries);
793 m_csr.row_offset.resize(nlocalrows+1);
794 m_csr.nnz = nentries;
797 m_csr.col_index.begin());
799 m_csr.row_offset.begin());
801 if (nentries <
Long(std::numeric_limits<int>::max())) {
802 define_and_filter_doit<int>(mat, col_index, nentries, row_offset);
804 define_and_filter_doit<Long>(mat, col_index, nentries, row_offset);
819template <
typename T,
template<
typename>
class Allocator>
826template <
typename T,
template<
typename>
class Allocator>
830 Long nentries,
Long const* row_offset)
833 auto* ps = psum.
data();
834 m_nnz = Scan::PrefixSum<I>(I(nentries),
836 return col_index[i] >= 0 && mat[i] != 0; },
840 Long nlocalrows = this->numLocalRows();
841 m_csr.mat.resize(m_nnz);
842 m_csr.col_index.resize(m_nnz);
843 m_csr.row_offset.resize(nlocalrows+1);
845 auto* pmat = m_csr.mat.data();
846 auto* pcol = m_csr.col_index.data();
847 auto* prow = m_csr.row_offset.data();
848 auto actual_nnz = m_nnz;
852 if (col_index[i] >= 0 && mat[i] != 0) {
853 pmat[ps[i]] = mat[i];
854 pcol[ps[i]] = col_index[i];
857 if (i < nlocalrows) {
858 prow[i] = ps[row_offset[i]];
859 if (i == nlocalrows - 1) {
860 prow[nlocalrows] = actual_nnz;
867template <
typename T,
template<
typename>
class Allocator>
888 auto const& remote_cols = m_remote_cols_v;
895 auto const& csr = m_csr;
897 auto const& csr_r = m_csr_remote;
898 auto const& ri_ltor = m_ri_ltor;
899 auto const& remote_cols = m_remote_cols_v;
904 Long nnz = m_csr.nnz;
906 nnz += m_csr_remote.nnz;
910 ofs << m_row_begin <<
" " << m_row_end <<
" " << nnz <<
"\n";
911 for (
Long i = 0, nrows = numLocalRows(); i < nrows; ++i) {
915 for (
Long j = 0; j < nnz_row; ++j) {
916 ofs << i+m_row_begin <<
" " << col[j]+m_col_begin <<
" " << mat[j] <<
"\n";
919 if (i <
Long(ri_ltor.
size()) && ri_ltor[i] >= 0) {
920 Long ii = ri_ltor[i];
924 for (
Long j = 0; j < nnz_row; ++j) {
925 ofs << i+m_row_begin <<
" " << remote_cols[col[j]] <<
" " << mat[j] <<
"\n";
932template <
typename T,
template<
typename>
class Allocator>
940 Long nlocalrows = this->numLocalRows();
941 Long rowbegin = this->globalRowBegin();
942 auto* pmat = m_csr.mat.data();
943 auto* pcolindex = m_csr.col_index.data();
944 auto* prowoffset = m_csr.row_offset.data();
947 f(rowbegin+lrow, pcolindex+prowoffset[lrow], pmat+prowoffset[lrow]);
950 if (! is_sorted) { m_csr.sort(); }
953template <
typename T,
template<
typename>
class Allocator>
956 if (m_diagonal.empty()) {
957 m_diagonal.
define(this->partition());
962 auto offset = m_split ?
Long(0) : m_row_begin;
963 Long nrows = this->numLocalRows();
967 for (
Long j = row[i]; j < row[i+1]; ++j) {
968 if (i == col[j] -
offset) {
979template <
typename T,
template<
typename>
class Allocator>
984 auto const& a = this->const_parcsr();
988 for (
auto idx = a.csr0.row_offset[i];
989 idx < a.csr0.row_offset[i+1]; ++idx) {
990 s += a.csr0.mat[idx];
992 if (a.csr1.nnz > 0) {
993 for (
auto idx = a.csr1.row_offset[i];
994 idx < a.csr1.row_offset[i+1]; ++idx) {
995 s += a.csr1.mat[idx];
1003template <
typename T,
template<
typename>
class Allocator>
1008 m_csr_remote.view(),
1016# ifdef AMREX_USE_GPU
1017 m_remote_cols_dv.data()
1019 m_remote_cols_v.data()
1027template <
typename T,
template<
typename>
class Allocator>
1033 m_csr_remote.const_view(),
1041# ifdef AMREX_USE_GPU
1042 m_remote_cols_dv.data()
1044 m_remote_cols_v.data()
1052template <
typename T,
template<
typename>
class Allocator>
1055 return this->const_parcsr();
1058template <
typename T,
template<
typename>
class Allocator>
1061#ifndef AMREX_USE_MPI
1064 if (this->partition().numActiveProcs() <= 1) {
return; }
1066 this->prepare_comm_mv(
x.partition());
1072 auto const nrecvs =
int(m_comm_mv.recv_from.size());
1076 auto* p_recv = m_comm_mv.recv_buffer;
1077 for (
int irecv = 0; irecv < nrecvs; ++irecv) {
1078 BL_MPI_REQUIRE(MPI_Irecv(p_recv,
1079 m_comm_mv.recv_counts[irecv], mpi_t_type,
1080 m_comm_mv.recv_from[irecv], mpi_tag, mpi_comm,
1081 &(m_comm_mv.recv_reqs[irecv])));
1082 p_recv += m_comm_mv.recv_counts[irecv];
1084 AMREX_ASSERT(p_recv == m_comm_mv.recv_buffer + m_comm_mv.total_counts_recv);
1087 auto const nsends =
int(m_comm_mv.send_to.size());
1095 auto* p_send = m_comm_mv.send_buffer;
1096 for (
int isend = 0; isend < nsends; ++isend) {
1097 auto count = m_comm_mv.send_counts[isend];
1098 BL_MPI_REQUIRE(MPI_Isend(p_send, count, mpi_t_type, m_comm_mv.send_to[isend],
1099 mpi_tag, mpi_comm, &(m_comm_mv.send_reqs[isend])));
1102 AMREX_ASSERT(p_send == m_comm_mv.send_buffer + m_comm_mv.total_counts_send);
1107template <
typename T,
template<
typename>
class Allocator>
1110 if (this->numLocalRows() == 0) {
return; }
1112#ifndef AMREX_USE_MPI
1115 if (this->partition().numActiveProcs() <= 1) {
return; }
1117 if ( ! m_comm_mv.recv_reqs.empty()) {
1119 BL_MPI_REQUIRE(MPI_Waitall(
int(m_comm_mv.recv_reqs.size()),
1120 m_comm_mv.recv_reqs.data(),
1121 mpi_statuses.data()));
1124 unpack_buffer_mv(
y);
1126 if ( ! m_comm_mv.send_reqs.empty()) {
1128 BL_MPI_REQUIRE(MPI_Waitall(
int(m_comm_mv.send_reqs.size()),
1129 m_comm_mv.send_reqs.data(),
1130 mpi_statuses.data()));
1136 m_comm_mv.send_reqs.clear();
1137 m_comm_mv.recv_reqs.clear();
1141template <
typename T,
template<
typename>
class Allocator>
1145 if (this->partition().numActiveProcs() <= 1) {
return; }
1147 this->split_csr(col_partition);
1156 if (m_csr_remote.nnz > 0) {
1157 m_comm_tr.csrt.nnz = m_csr_remote.nnz;
1158 m_comm_tr.csrt.nrows = m_remote_cols_v.size();
1160 (
sizeof(T)*m_comm_tr.csrt.nnz);
1162 (
sizeof(
Long)*m_comm_tr.csrt.nnz);
1164 (
sizeof(
Long)*(m_comm_tr.csrt.nrows+1));
1167 csr_comm.
resize(m_comm_tr.csrt.nrows, m_comm_tr.csrt.nnz);
1168 auto const& csrv_comm = csr_comm.
view();
1170 auto const& csrv_comm = m_comm_tr.csrt;
1172 detail::transpose(csrv_comm, m_csr_remote.const_view());
1173 auto row_begin = m_row_begin;
1174 auto ri_rtol = m_ri_rtol.data();
1175 auto* col_index = csrv_comm.col_index;
1178 auto gjt =ri_rtol[col_index[idx]] + row_begin;
1179 col_index[idx] = gjt;
1184 csrv_comm. mat + csrv_comm.nnz,
1185 m_comm_tr.csrt.mat);
1187 csrv_comm. col_index,
1188 csrv_comm. col_index + csrv_comm.nnz,
1189 m_comm_tr.csrt.col_index);
1191 csrv_comm. row_offset,
1192 csrv_comm. row_offset + csrv_comm.nrows+1,
1193 m_comm_tr.csrt.row_offset);
1198 if (m_num_neighbors < 0) { set_num_neighbors(); }
1204 mpi_requests.reserve(nprocs);
1205 if (m_csr_remote.nnz > 0) {
1207 for (
int iproc = 0; iproc < nprocs; ++iproc) {
1209 for (
Long i = 0; i <
Long(m_remote_cols_vv[iproc].size()); ++i) {
1210 n += m_comm_tr.csrt.row_offset[it+1] - m_comm_tr.csrt.row_offset[it];
1216 std::array<int,2> nn{
int(n),
int(m_remote_cols_vv[iproc].size())};
1217 BL_MPI_REQUIRE(MPI_Isend(nn.data(), 2, MPI_INT, iproc, mpi_tag,
1218 mpi_comm, &(mpi_requests.back())));
1219 m_comm_tr.send_to.push_back(iproc);
1220 m_comm_tr.send_counts.push_back(nn);
1228 for (
int irecv = 0; irecv < m_num_neighbors; ++irecv) {
1230 BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
1231 int sender = mpi_status.MPI_SOURCE;
1232 std::array<int,2> nn;
1233 BL_MPI_REQUIRE(MPI_Recv(nn.data(), 2, MPI_INT, sender, mpi_tag,
1234 mpi_comm, &mpi_status));
1235 m_comm_tr.recv_from.push_back(sender);
1236 m_comm_tr.recv_counts.push_back(nn);
1237 m_comm_tr.total_counts_recv[0] += nn[0];
1238 m_comm_tr.total_counts_recv[1] += nn[1];
1241 if (! mpi_requests.empty()) {
1243 BL_MPI_REQUIRE(MPI_Waitall(
int(mpi_requests.
size()), mpi_requests.data(),
1244 mpi_statuses.data()));
1256 auto const nrecvs =
int(m_comm_tr.recv_from.size());
1259 (
sizeof(T) * m_comm_tr.total_counts_recv[0]);
1261 (
sizeof(
Long) * m_comm_tr.total_counts_recv[0]);
1263 (
sizeof(
Long) * (m_comm_tr.total_counts_recv[1]+nrecvs));
1265 (
sizeof(
Long) * m_comm_tr.total_counts_recv[1]);
1266 m_comm_tr.recv_buffer_offset.push_back({0,0,0,0});
1268 for (
int irecv = 0; irecv < nrecvs; ++irecv) {
1269 auto [os0, os1, os2, os3] = m_comm_tr.recv_buffer_offset.back();
1270 auto [n0, n1] = m_comm_tr.recv_counts[irecv];
1271 auto recv_from_rank = m_comm_tr.recv_from[irecv];
1272 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_mat + os0,
1278 &(m_comm_tr.recv_reqs[irecv*4])));
1279 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_col_index + os1,
1285 &(m_comm_tr.recv_reqs[irecv*4+1])));
1286 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_row_offset + os2,
1292 &(m_comm_tr.recv_reqs[irecv*4+2])));
1293 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_idx_map + os3,
1299 &(m_comm_tr.recv_reqs[irecv*4+3])));
1300 m_comm_tr.recv_buffer_offset.push_back({os0 + n0,
1307 auto const nsends =
int(m_comm_tr.send_to.size());
1310 Long os0 = 0, os1 = 0;
1311 for (
int isend = 0; isend < nsends; ++isend) {
1312 auto [n0, n1] = m_comm_tr.send_counts[isend];
1313 auto send_to_rank = m_comm_tr.send_to[isend];
1314 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.mat + os0,
1320 &(m_comm_tr.send_reqs[isend*4])));
1321 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.col_index + os0,
1327 &(m_comm_tr.send_reqs[isend*4+1])));
1328 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.row_offset + os1,
1334 &(m_comm_tr.send_reqs[isend*4+2])));
1335 BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[send_to_rank].data(),
1341 &(m_comm_tr.send_reqs[isend*4+3])));
1351template <
typename T,
template<
typename>
class Allocator>
1355 if (this->partition().numActiveProcs() <= 1) {
return; }
1357 if (! m_comm_tr.recv_reqs.empty()) {
1359 BL_MPI_REQUIRE(MPI_Waitall(
int(m_comm_tr.recv_reqs.size()),
1360 m_comm_tr.recv_reqs.data(),
1361 mpi_statuses.data()));
1366 if (! m_comm_tr.send_reqs.empty()) {
1368 BL_MPI_REQUIRE(MPI_Waitall(
int(m_comm_tr.send_reqs.size()),
1369 m_comm_tr.send_reqs.data(),
1370 mpi_statuses.data()));
1373 if (m_comm_tr.csrt.nnz > 0) {
1378 if (m_comm_tr.recv_buffer_mat) {
1392template <
typename T,
template<
typename>
class Allocator>
1404 m_col_partition = col_partition;
1416 auto* p_pfsum = pfsum.
data();
1417 auto col_begin = m_col_begin;
1418 auto col_end = m_col_end;
1419 if (m_csr.nnz <
Long(std::numeric_limits<int>::max())) {
1420 auto const* pcol = m_csr.col_index.data();
1421 local_nnz = Scan::PrefixSum<int>(
int(m_nnz),
1423 return (pcol[i] >= col_begin &&
1424 pcol[i] < col_end); },
1429 auto const* pcol = m_csr.col_index.data();
1430 local_nnz = Scan::PrefixSum<Long>(m_nnz,
1432 return (pcol[i] >= col_begin &&
1433 pcol[i] < col_end); },
1439 m_csr.nnz = local_nnz;
1440 Long remote_nnz = m_nnz - local_nnz;
1441 m_csr_remote.nnz = remote_nnz;
1443 if (local_nnz != m_nnz) {
1444 m_csr_remote.mat.resize(remote_nnz);
1445 m_csr_remote.col_index.resize(remote_nnz);
1448 auto const* pmat = m_csr.mat.data();
1449 auto const* pcol = m_csr.col_index.data();
1450 auto* pmat_l = new_mat.
data();
1451 auto* pcol_l = new_col.
data();
1452 auto* pmat_r = m_csr_remote.mat.data();
1453 auto* pcol_r = m_csr_remote.col_index.data();
1456 auto ps = p_pfsum[i];
1457 auto local = (pcol[i] >= col_begin &&
1460 pmat_l[ps] = pmat[i];
1461 pcol_l[ps] = pcol[i] - col_begin;
1463 pmat_r[i-ps] = pmat[i];
1464 pcol_r[i-ps] = pcol[i];
1467 auto noffset =
Long(m_csr.row_offset.size());
1468 auto* pro = m_csr.row_offset.data();
1469 m_csr_remote.row_offset.resize(noffset);
1470 auto* pro_r = m_csr_remote.row_offset.data();
1473 if (i < noffset-1) {
1474 auto ro_l = p_pfsum[pro[i]];
1475 pro_r[i] = pro[i] - ro_l;
1479 pro_r[i] = remote_nnz;
1483 m_csr.mat.swap(new_mat);
1484 m_csr.col_index.swap(new_col);
1489 Long old_size = m_csr_remote.row_offset.size();
1490 m_ri_ltor.resize(old_size-1);
1491 m_ri_rtol.resize(old_size-1);
1492 auto* p_ltor = m_ri_ltor.data();
1493 auto* p_rtol = m_ri_rtol.data();
1495 auto const* p_ro = m_csr_remote.row_offset.data();
1496 auto* p_tro = trimmed_row_offset.
data();
1498 if (old_size <
Long(std::numeric_limits<int>::max())) {
1500 new_size = Scan::PrefixSum<int>(
int(old_size),
1502 if (i+1 < old_size) {
1503 return (p_ro[i+1] > p_ro[i]);
1511 }
else if (p_ro[i] > p_ro[i-1]) {
1514 if (i+1 < old_size) {
1515 if (p_ro[i+1] > p_ro[i]) {
1526 new_size = Scan::PrefixSum<Long>(old_size,
1528 if (i+1 < old_size) {
1529 return (p_ro[i+1] > p_ro[i]);
1537 }
else if (p_ro[i] > p_ro[i-1]) {
1540 if (i+1 < old_size) {
1541 if (p_ro[i+1] > p_ro[i]) {
1552 m_ri_rtol.resize(new_size-1);
1553 trimmed_row_offset.
resize(new_size);
1555 m_ri_rtol.shrink_to_fit();
1558 m_csr_remote.row_offset.swap(trimmed_row_offset);
1561 }
else if (col_begin > 0) {
1562 auto* pcol = m_csr.col_index.data();
1566 update_remote_col_index(m_csr_remote,
true);
1571template <
typename T,
template<
typename>
class Allocator>
1572template <
typename C>
1579 m_remote_cols_v.clear();
1580 m_remote_cols_vv.clear();
1581 m_remote_cols_vv.resize(nprocs);
1583 m_remote_cols_dv.clear();
1586 if (csrr.nnz == 0) {
return; }
1591 if (in_device_memory) {
1592 m_remote_cols_v.resize(csrr.nnz);
1594 csrr.col_index.begin(),
1595 csrr.col_index.end(),
1596 m_remote_cols_v.begin());
1601 m_remote_cols_v.assign(csrr.col_index.begin(),
1602 csrr.col_index.end());
1608 m_remote_cols_dv.resize(m_remote_cols_v.size());
1610 m_remote_cols_v.begin(),
1611 m_remote_cols_v.end(),
1612 m_remote_cols_dv.data());
1616 auto const& cp = this->m_col_partition.dataVector();
1618 m_remote_cols_v.back() < cp.back());
1619 auto it = cp.cbegin();
1620 for (
auto c : m_remote_cols_v) {
1621 it = std::find_if(it, cp.cend(), [&] (
auto x) { return x > c; });
1622 if (it != cp.cend()) {
1623 int iproc =
int(std::distance(cp.cbegin(),it)) - 1;
1624 m_remote_cols_vv[iproc].push_back(c);
1626 amrex::Abort(
"SpMatrix::update_remote_col_index: how did this happen?");
1631 std::map<Long,Long> gtol;
1632 for (
Long i = 0, N =
Long(m_remote_cols_v.size()); i < N; ++i) {
1633 gtol[m_remote_cols_v[i]] = i;
1637 if (in_device_memory) {
1640 csrr.col_index.begin(),
1641 csrr.col_index.end(),
1642 host_col_index.
begin());
1644 for (
auto& c : host_col_index) {
1648 host_col_index.
begin(),
1649 host_col_index.
end(),
1650 csrr.col_index.begin());
1655 for (
auto& c : csrr.col_index) {
1661template <
typename T,
template<
typename>
class Allocator>
1664 if (m_num_neighbors >= 0) {
return; }
1671 for (
int iproc = 0; iproc < nprocs; ++iproc) {
1672 connection[iproc] = m_remote_cols_vv[iproc].empty() ? 0 : 1;
1675 m_num_neighbors = 0;
1676 BL_MPI_REQUIRE(MPI_Reduce_scatter
1677 (connection.data(), &m_num_neighbors, reduce_scatter_counts.data(),
1678 mpi_int, MPI_SUM, mpi_comm));
1681template <
typename T,
template<
typename>
class Allocator>
1684 if (m_comm_mv.prepared) {
return; }
1688 this->split_csr(col_partition);
1695 if (m_num_neighbors < 0) { set_num_neighbors(); }
1698 mpi_requests.reserve(nprocs);
1699 for (
int iproc = 0; iproc < nprocs; ++iproc) {
1700 if ( ! m_remote_cols_vv[iproc].empty()) {
1702 auto const sz = m_remote_cols_vv[iproc].
size();
1703 if (sz >
static_cast<Long>(std::numeric_limits<int>::max())) {
1704 amrex::Abort(
"SpMatrix::prepare_comm_mv: remote column payload exceeds MPI int count range.");
1706 auto const msg_count =
static_cast<int>(sz);
1708 BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[iproc].data(),
1710 mpi_long, iproc, mpi_tag, mpi_comm,
1711 &(mpi_requests.back())));
1712 m_comm_mv.recv_from.push_back(iproc);
1713 m_comm_mv.recv_counts.push_back(msg_count);
1717 m_comm_mv.total_counts_recv =
Long(m_remote_cols_v.size());
1720 m_comm_mv.total_counts_send = 0;
1721 for (
int isend = 0; isend < m_num_neighbors; ++isend) {
1723 BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
1724 int receiver = mpi_status.MPI_SOURCE;
1726 BL_MPI_REQUIRE(MPI_Get_count(&mpi_status, mpi_long, &count));
1727 m_comm_mv.send_to.push_back(receiver);
1728 m_comm_mv.send_counts.push_back(count);
1729 send_indices[isend].resize(count);
1730 BL_MPI_REQUIRE(MPI_Recv(send_indices[isend].data(), count, mpi_long,
1731 receiver, mpi_tag, mpi_comm, &mpi_status));
1732 m_comm_mv.total_counts_send += count;
1735 m_comm_mv.send_indices.resize(m_comm_mv.total_counts_send);
1737 send_indices_all.
reserve(m_comm_mv.total_counts_send);
1738 for (
auto const& vl : send_indices) {
1744 m_comm_mv.send_indices.begin());
1747 if (! mpi_requests.empty()) {
1749 BL_MPI_REQUIRE(MPI_Waitall(
int(mpi_requests.
size()), mpi_requests.data(),
1750 mpi_statuses.data()));
1753 m_comm_mv.prepared =
true;
1756template <
typename T,
template<
typename>
class Allocator>
1759 auto*
pdst = m_comm_mv.send_buffer;
1760 auto* pidx = m_comm_mv.send_indices.data();
1761 auto const& vv = v.
view();
1762 auto const nsends =
Long(m_comm_mv.send_indices.size());
1765 pdst[i] = vv(pidx[i]);
1769template <
typename T,
template<
typename>
class Allocator>
1772 auto const& csr = m_csr_remote;
1778 auto const* rtol = m_ri_rtol.data();
1783 auto const nrr =
Long(csr.row_offset.size())-1;
1787 for (
Long j = row[i]; j < row[i+1]; ++j) {
1788 r += mat[j] * px[col[j]];
1795template <
typename T,
template<
typename>
class Allocator>
1800 m_col_partition = col_partition;
1804 m_ri_ltor.resize(m_csr.nrows(), -1);
1808 if (nnz == 0) {
return; }
1823 auto& csrr = m_csr_remote;
1825 csrr.
mat.resize(nnz);
1832 for (
int i = 0; i < nb; ++i) {
1841 for (
int lr = 0; lr < nrow_i; ++lr) {
1842 Long const gr = idx_map[lr];
1843 while (p < nrows && ri_map[p] < gr) { ++p; }
1846 row_nnz[p] +=
int(row_offset[lr+1] - row_offset[lr]);
1851 std::partial_sum(row_nnz.begin(), row_nnz.end(), csrr.
row_offset.begin()+1);
1856 for (
int i = 0; i < nb; ++i) {
1868 for (
int lr = 0; lr < nrow_i; ++lr) {
1869 Long const gr = idx_map[lr];
1870 while (p < nrows && ri_map[p] < gr) { ++p; }
1873 auto os_src = row_offset[lr] - row_offset[0];
1874 auto nvals = row_offset[lr+1] - row_offset[lr];
1875 auto os_dst = rowpos[p];
1876 std::memcpy(csrr. mat.data()+os_dst, mat+os_src,
1878 std::memcpy(csrr.
col_index.data()+os_dst, col_index+os_src,
1879 sizeof(
Long)*nvals);
1885 m_ri_rtol.resize(nrows);
1888 auto row_begin = m_row_begin;
1892 rtol[i] -= row_begin;
1898 update_remote_col_index(csrr,
false);
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_RESTRICT
Definition AMReX_Extension.H:32
#define AMREX_CUSPARSE_SAFE_CALL(call)
Definition AMReX_GpuError.H:101
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
Definition AMReX_AlgPartition.H:21
Long numGlobalRows() const
Total number of rows covered by the partition.
Definition AMReX_AlgPartition.H:47
Distributed dense vector that mirrors the layout of an AlgPartition.
Definition AMReX_AlgVector.H:29
Long numLocalRows() const
Number of entries stored on this rank.
Definition AMReX_AlgVector.H:74
T const * data() const
Definition AMReX_AlgVector.H:85
void define(Long global_size)
Resize/repartition the vector to span global_size rows.
Definition AMReX_AlgVector.H:257
Table1D< T const, Long > view() const
Definition AMReX_AlgVector.H:94
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
void reserve(size_type a_capacity, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:811
size_type size() const noexcept
Definition AMReX_PODVector.H:648
void shrink_to_fit()
Definition AMReX_PODVector.H:818
iterator begin() noexcept
Definition AMReX_PODVector.H:674
void resize(size_type a_new_size, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:728
iterator end() noexcept
Definition AMReX_PODVector.H:678
T * data() noexcept
Definition AMReX_PODVector.H:666
void push_back(const T &a_value)
Definition AMReX_PODVector.H:629
Distributed CSR matrix that manages storage and GPU-friendly partitions.
Definition AMReX_SpMatrix.H:61
void finishComm_tr(SpMatrix< T, Allocator > &AT)
Complete transpose communication, writing the assembled matrix into AT.
Definition AMReX_SpMatrix.H:1352
void split_csr(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1393
Long globalRowBegin() const
Inclusive global index begin.
Definition AMReX_SpMatrix.H:196
void define_and_filter_doit(T const *mat, Long const *col_index, Long nentries, Long const *row_offset)
Private helper (exposed for CUDA) that copies/filters CSR arrays into device storage.
Definition AMReX_SpMatrix.H:829
Long * rowOffset()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:213
void sortCSR()
Definition AMReX_SpMatrix.H:821
void pack_buffer_mv(AlgVector< T, AllocT > const &v)
Definition AMReX_SpMatrix.H:1757
void unpack_buffer_mv(AlgVector< T, AllocT > &v)
Definition AMReX_SpMatrix.H:1770
void update_remote_col_index(C &csrr, bool in_device_memory)
Definition AMReX_SpMatrix.H:1573
void startComm_tr(AlgPartition const &col_partition)
Initiate communication required to build the transpose with column partition col_partition.
Definition AMReX_SpMatrix.H:1142
void define_doit(int nnz_per_row)
Private helper (exposed for CUDA) that allocates fixed-connectivity matrices with nnz_per_row entries...
Definition AMReX_SpMatrix.H:753
T * data()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:201
friend class AMG
Definition AMReX_SpMatrix.H:297
Long numGlobalRows() const
Global row count.
Definition AMReX_SpMatrix.H:191
SpMatrix & operator=(SpMatrix const &)=delete
friend SpMatrix< U, M > transpose(SpMatrix< U, M > const &A, AlgPartition col_partition)
T value_type
Definition AMReX_SpMatrix.H:63
Allocator< U > allocator_type
Definition AMReX_SpMatrix.H:64
Long globalRowEnd() const
Exclusive global index end.
Definition AMReX_SpMatrix.H:198
Long numLocalNonZeros() const
Number of nonzeros stored locally.
Definition AMReX_SpMatrix.H:193
struct amrex::SpMatrix::CommMV m_comm_mv
AlgVector< T, AllocT > rowSum() const
Sum the values in each local row and return the result as an AlgVector.
Definition AMReX_SpMatrix.H:980
AlgPartition const & columnPartition() const
Return the column partition used for matrix-vector and matrix-matrix multiplications.
Definition AMReX_SpMatrix.H:186
void printToFile(std::string const &file) const
Definition AMReX_SpMatrix.H:869
ParCsr< T const > const_parcsr() const
Const-qualified alias of parcsr() for convenience.
Definition AMReX_SpMatrix.H:1028
SpMatrix(SpMatrix const &)=delete
ParCsr< T > parcsr()
Build GPU-friendly CSR views split into diagonal/off-diagonal blocks.
Definition AMReX_SpMatrix.H:1004
SpMatrix(SpMatrix &&)=default
Long numLocalRows() const
Number of rows owned by this rank.
Definition AMReX_SpMatrix.H:189
AlgPartition const & partition() const
Row partition describing how matrix rows are distributed across ranks.
Definition AMReX_SpMatrix.H:177
AlgVector< T, AllocT > const & diagonalVector() const
Return (and cache) the diagonal entries of a square matrix.
Definition AMReX_SpMatrix.H:954
Long * columnIndex()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:207
void finishComm_mv(AlgVector< T, AllocT > &y)
Finish halo exchanges and accumulate contributions into y.
Definition AMReX_SpMatrix.H:1108
Allocator< T > AllocT
Definition AMReX_SpMatrix.H:67
struct amrex::SpMatrix::CommTR m_comm_tr
void prepare_comm_mv(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1682
void startComm_mv(AlgVector< T, AllocT > const &x)
Prepare halo exchanges for a subsequent SpMV using x as the source vector.
Definition AMReX_SpMatrix.H:1059
void unpack_buffer_tr(CommTR const &ctr, AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1796
friend void SpMV(AlgVector< U, N > &y, SpMatrix< U, M > const &A, AlgVector< U, N > const &x)
void setVal(F const &f, CsrSorted is_sorted)
Initialize matrix entries using a row-wise functor.
Definition AMReX_SpMatrix.H:934
void define(AlgPartition partition, int nnz_per_row)
Allocate storage for a default-constructed matrix with a fixed number of nonzeros per row.
Definition AMReX_SpMatrix.H:731
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Long size() const noexcept
Definition AMReX_Vector.H:53
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:319
Arena * The_Comms_Arena()
Definition AMReX_Arena.cpp:865
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:845
Arena * The_Arena()
Definition AMReX_Arena.cpp:805
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
int NProcs() noexcept
Definition AMReX_ParallelDescriptor.H:255
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int NProcsSub() noexcept
number of ranks in current frame
Definition AMReX_ParallelContext.H:74
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition AMReX_ParallelDescriptor.H:696
static constexpr struct amrex::Scan::Type::Exclusive exclusive
static constexpr RetSum retSum
Definition AMReX_Scan.H:32
static constexpr int MPI_REQUEST_NULL
Definition AMReX_ccse-mpi.H:57
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
amrex::ArenaAllocator< T > DefaultAllocator
Definition AMReX_GpuAllocators.H:205
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
void duplicateCSR(C c, CSR< T, AD > &dst, CSR< T, AS > const &src)
Copy CSR buffers between memory spaces asynchronously.
Definition AMReX_CSR.H:116
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
void RemoveDuplicates(Vector< T > &vec)
Definition AMReX_Vector.H:211
V< Long > row_offset
Definition AMReX_CSR.H:52
Long nnz
Definition AMReX_CSR.H:53
CsrView< T > view()
Mutable view of the underlying buffers.
Definition AMReX_CSR.H:77
void resize(Long num_rows, Long num_non_zeros)
Resize the storage to accommodate num_rows and num_non_zeros entries.
Definition AMReX_CSR.H:69
V< Long > col_index
Definition AMReX_CSR.H:51
V< T > mat
Definition AMReX_CSR.H:50
Sorted CSR means for each row the column indices are sorted.
Definition AMReX_SpMatrix.H:45
bool b
Definition AMReX_SpMatrix.H:46
Valid CSR means all entries are valid. It may be sorted or unsorted.
Definition AMReX_SpMatrix.H:51
bool b
Definition AMReX_SpMatrix.H:52
Lightweight non-owning CSR view that can point to host or device buffers.
Definition AMReX_CSR.H:33
GPU-ready non-owning CSR data container.
Definition AMReX_SpMatrix.H:35
Long const *__restrict__ col_map
Definition AMReX_SpMatrix.H:41
Long const *__restrict__ row_map
Definition AMReX_SpMatrix.H:40
CsrView< T > csr1
Definition AMReX_SpMatrix.H:37
Long col_begin
Definition AMReX_SpMatrix.H:39
Long row_begin
Definition AMReX_SpMatrix.H:38
CsrView< T > csr0
Definition AMReX_SpMatrix.H:36
static MPI_Datatype type()
Definition AMReX_SpMatrix.H:381
T * send_buffer
Definition AMReX_SpMatrix.H:390
bool prepared
Definition AMReX_SpMatrix.H:397
Vector< int > recv_counts
Definition AMReX_SpMatrix.H:387
Long total_counts_recv
Definition AMReX_SpMatrix.H:395
Vector< int > recv_from
Definition AMReX_SpMatrix.H:386
T * recv_buffer
Definition AMReX_SpMatrix.H:394
Vector< int > send_counts
Definition AMReX_SpMatrix.H:383
Long total_counts_send
Definition AMReX_SpMatrix.H:391
Gpu::DeviceVector< Long > send_indices
Definition AMReX_SpMatrix.H:384
Vector< MPI_Request > recv_reqs
Definition AMReX_SpMatrix.H:393
Vector< int > send_to
Definition AMReX_SpMatrix.H:382
Vector< MPI_Request > send_reqs
Definition AMReX_SpMatrix.H:389
Definition AMReX_SpMatrix.H:400
Vector< std::array< int, 2 > > send_counts
Definition AMReX_SpMatrix.H:404
Long * recv_buffer_col_index
Definition AMReX_SpMatrix.H:417
Vector< MPI_Request > send_reqs
Definition AMReX_SpMatrix.H:405
Vector< MPI_Request > recv_reqs
Definition AMReX_SpMatrix.H:409
Vector< int > send_to
Definition AMReX_SpMatrix.H:403
std::array< Long, 2 > total_counts_recv
Definition AMReX_SpMatrix.H:411
Long * recv_buffer_row_offset
Definition AMReX_SpMatrix.H:418
Vector< std::array< int, 2 > > recv_counts
Definition AMReX_SpMatrix.H:408
CsrView< T > csrt
Definition AMReX_SpMatrix.H:401
T * recv_buffer_mat
Definition AMReX_SpMatrix.H:416
Vector< std::array< Long, 4 > > recv_buffer_offset
Definition AMReX_SpMatrix.H:412
Vector< int > recv_from
Definition AMReX_SpMatrix.H:407
Long * recv_buffer_idx_map
Definition AMReX_SpMatrix.H:419
Definition AMReX_ccse-mpi.H:55