#ifndef AMREX_SP_MATRIX_H_
#define AMREX_SP_MATRIX_H_
#include <AMReX_Config.H>
#if defined(AMREX_USE_CUDA)
# include <cusparse.h>
#elif defined(AMREX_USE_HIP)
# include <rocsparse/rocsparse.h>
#elif defined(AMREX_USE_SYCL)
# include <mkl_version.h>
# include <oneapi/mkl/spblas.hpp>
#endif

#include <unordered_map>
//! Sorted CSR means for each row the column indices are sorted.
struct CsrSorted {
    bool b;
    explicit operator bool() const { return b; }
};

//! Valid CSR means all entries are valid. It may be sorted or unsorted.
struct CsrValid {
    bool b;
    explicit operator bool() const { return b; }
};
    //! Don't use this beyond initial setup.
    T* data () { return m_csr.mat.data(); }
    //! Initialize matrix entries using a row-wise functor.
    template <typename F>
    void setVal (F const& f, CsrSorted is_sorted);
    template <typename U, template<typename> class M, typename N>
    friend void SpMV (AlgVector<U,N>& y, SpMatrix<U,M> const& A, AlgVector<U,N> const& x);
    template <typename U, template<typename> class M>
    friend SpMatrix<U,M> transpose (SpMatrix<U,M> const& A, AlgPartition col_partition);
    template <typename U> friend class AMG;
    //! Private function, but public for cuda.
    template <typename I>
    void define_and_filter_doit (T const* mat, Long const* col_index,
                                 Long nentries, Long const* row_offset);
    void set_num_neighbors ();

    Long m_row_begin = 0;
    Long m_col_begin = 0;

    bool m_split = false;

    int m_num_neighbors = -1;
    template <typename C>
    void update_remote_col_index (C& csrr, bool in_device_memory);
};

namespace detail {

template <typename T>
void transpose (CsrView<T> const& csrt, CsrView<T const> const& csr)
{
    Long nrows = csr.nrows;
    Long ncols = csrt.nrows;
    Long nnz = csr.nnz;
    if (nrows <= 0 || ncols <= 0 || nnz <= 0) {
        auto* p = csrt.row_offset;
        ParallelForOMP(ncols+1, [=] AMREX_GPU_DEVICE (Long i) { p[i] = 0; });
        return;
    }
#if defined(AMREX_USE_CUDA)

    cusparseHandle_t handle;
    AMREX_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));
    AMREX_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, Gpu::gpuStream()));

    cudaDataType data_type;
    if constexpr (std::is_same_v<T,float>) {
        data_type = CUDA_R_32F;
    } else if constexpr (std::is_same_v<T,double>) {
        data_type = CUDA_R_64F;
    } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
        data_type = CUDA_C_32F;
    } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
        data_type = CUDA_C_64F;
    } else {
        amrex::Abort("SpMatrix transpose: unsupported data type");
    }

    // cusparseCsr2cscEx2 takes 32-bit indices, so everything must fit in int.
    AMREX_ALWAYS_ASSERT(nrows < Long(std::numeric_limits<int>::max()) &&
                        ncols < Long(std::numeric_limits<int>::max()) &&
                        nnz < Long(std::numeric_limits<int>::max()));
    auto* csr_col_index = (int*)The_Arena()->alloc(csr.nnz*sizeof(int));
    auto* csr_row_offset = (int*)The_Arena()->alloc((csr.nrows+1)*sizeof(int));
    auto* csrt_col_index = (int*)The_Arena()->alloc(csrt.nnz*sizeof(int));
    auto* csrt_row_offset = (int*)The_Arena()->alloc((csrt.nrows+1)*sizeof(int));

    // Convert the Long indices to the 32-bit indices cusparse expects.
    ParallelFor(std::max(csr.nnz, csr.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
    {
        if (i < csr.nnz) {
            csr_col_index[i] = int(csr.col_index[i]);
        }
        if (i < csr.nrows+1) {
            csr_row_offset[i] = int(csr.row_offset[i]);
        }
    });
    std::size_t buffer_size;
    AMREX_CUSPARSE_SAFE_CALL(
        cusparseCsr2cscEx2_bufferSize(handle, int(nrows), int(ncols), int(nnz),
                                      csr.mat, csr_row_offset, csr_col_index,
                                      csrt.mat, csrt_row_offset, csrt_col_index,
                                      data_type, CUSPARSE_ACTION_NUMERIC,
                                      CUSPARSE_INDEX_BASE_ZERO,
                                      CUSPARSE_CSR2CSC_ALG1,
                                      &buffer_size));
    auto* buffer = The_Arena()->alloc(buffer_size);
    AMREX_CUSPARSE_SAFE_CALL(
        cusparseCsr2cscEx2(handle, int(nrows), int(ncols), int(nnz),
                           csr.mat, csr_row_offset, csr_col_index,
                           csrt.mat, csrt_row_offset, csrt_col_index,
                           data_type, CUSPARSE_ACTION_NUMERIC,
                           CUSPARSE_INDEX_BASE_ZERO,
                           CUSPARSE_CSR2CSC_ALG1,
                           buffer));

    // Convert the 32-bit results back to Long.
    ParallelFor(std::max(csrt.nnz, csrt.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
    {
        if (i < csrt.nnz) {
            csrt.col_index[i] = csrt_col_index[i];
        }
        if (i < csrt.nrows+1) {
            csrt.row_offset[i] = csrt_row_offset[i];
        }
    });
    Gpu::streamSynchronize();

    The_Arena()->free(csr_col_index);
    The_Arena()->free(csr_row_offset);
    The_Arena()->free(csrt_col_index);
    The_Arena()->free(csrt_row_offset);
    The_Arena()->free(buffer);
    AMREX_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
#elif defined(AMREX_USE_HIP)

    rocsparse_handle handle;
    AMREX_ROCSPARSE_SAFE_CALL(rocsparse_create_handle(&handle));
    AMREX_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, Gpu::gpuStream()));

    AMREX_ALWAYS_ASSERT(nrows < Long(std::numeric_limits<rocsparse_int>::max()) &&
                        ncols < Long(std::numeric_limits<rocsparse_int>::max()) &&
                        nnz < Long(std::numeric_limits<rocsparse_int>::max()));
    rocsparse_int* csr_col_index;
    rocsparse_int* csr_row_offset;
    rocsparse_int* csrt_col_index;
    rocsparse_int* csrt_row_offset;
    if (std::is_same_v<rocsparse_int,Long>) {
        csr_col_index = (rocsparse_int*)csr.col_index;
        csr_row_offset = (rocsparse_int*)csr.row_offset;
        csrt_col_index = (rocsparse_int*)csrt.col_index;
        csrt_row_offset = (rocsparse_int*)csrt.row_offset;
    } else {
        csr_col_index = (rocsparse_int*)The_Arena()->alloc(csr.nnz*sizeof(rocsparse_int));
        csr_row_offset = (rocsparse_int*)The_Arena()->alloc((csr.nrows+1)*sizeof(rocsparse_int));
        csrt_col_index = (rocsparse_int*)The_Arena()->alloc(csrt.nnz*sizeof(rocsparse_int));
        csrt_row_offset = (rocsparse_int*)The_Arena()->alloc((csrt.nrows+1)*sizeof(rocsparse_int));

        // Convert the Long indices to rocsparse_int.
        ParallelFor(std::max(csr.nnz, csr.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
        {
            if (i < csr.nnz) {
                csr_col_index[i] = rocsparse_int(csr.col_index[i]);
            }
            if (i < csr.nrows+1) {
                csr_row_offset[i] = rocsparse_int(csr.row_offset[i]);
            }
        });
    }
    std::size_t buffer_size;
    AMREX_ROCSPARSE_SAFE_CALL(
        rocsparse_csr2csc_buffer_size(handle, rocsparse_int(nrows),
                                      rocsparse_int(ncols), rocsparse_int(nnz),
                                      csr_row_offset, csr_col_index,
                                      rocsparse_action_numeric,
                                      &buffer_size));
    auto* buffer = The_Arena()->alloc(buffer_size);
    if constexpr (std::is_same_v<T,float>) {
        AMREX_ROCSPARSE_SAFE_CALL(
            rocsparse_scsr2csc(handle, rocsparse_int(nrows),
                               rocsparse_int(ncols), rocsparse_int(nnz),
                               csr.mat, csr_row_offset, csr_col_index,
                               csrt.mat, csrt_col_index, csrt_row_offset,
                               rocsparse_action_numeric,
                               rocsparse_index_base_zero,
                               buffer));
    } else if constexpr (std::is_same_v<T,double>) {
        AMREX_ROCSPARSE_SAFE_CALL(
            rocsparse_dcsr2csc(handle, rocsparse_int(nrows),
                               rocsparse_int(ncols), rocsparse_int(nnz),
                               csr.mat, csr_row_offset, csr_col_index,
                               csrt.mat, csrt_col_index, csrt_row_offset,
                               rocsparse_action_numeric,
                               rocsparse_index_base_zero,
                               buffer));
    } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
        AMREX_ROCSPARSE_SAFE_CALL(
            rocsparse_ccsr2csc(handle, rocsparse_int(nrows),
                               rocsparse_int(ncols), rocsparse_int(nnz),
                               (rocsparse_float_complex*)csr.mat, csr_row_offset, csr_col_index,
                               (rocsparse_float_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
                               rocsparse_action_numeric,
                               rocsparse_index_base_zero,
                               buffer));
    } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
        AMREX_ROCSPARSE_SAFE_CALL(
            rocsparse_zcsr2csc(handle, rocsparse_int(nrows),
                               rocsparse_int(ncols), rocsparse_int(nnz),
                               (rocsparse_double_complex*)csr.mat, csr_row_offset, csr_col_index,
                               (rocsparse_double_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
                               rocsparse_action_numeric,
                               rocsparse_index_base_zero,
                               buffer));
    } else {
        amrex::Abort("SpMatrix transpose: unsupported data type");
    }

    if (! std::is_same_v<rocsparse_int,Long>) {
        // Convert the rocsparse_int results back to Long.
        ParallelFor(std::max(csrt.nnz, csrt.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
        {
            if (i < csrt.nnz) {
                csrt.col_index[i] = csrt_col_index[i];
            }
            if (i < csrt.nrows+1) {
                csrt.row_offset[i] = csrt_row_offset[i];
            }
        });
    }
    Gpu::streamSynchronize();
    AMREX_ROCSPARSE_SAFE_CALL(rocsparse_destroy_handle(handle));

    if (! std::is_same_v<rocsparse_int,Long>) {
        The_Arena()->free(csr_col_index);
        The_Arena()->free(csr_row_offset);
        The_Arena()->free(csrt_col_index);
        The_Arena()->free(csrt_row_offset);
    }
    The_Arena()->free(buffer);
#elif defined(AMREX_USE_SYCL)

    mkl::sparse::matrix_handle_t handle_in{};
    mkl::sparse::matrix_handle_t handle_out{};
    mkl::sparse::init_matrix_handle(&handle_in);
    mkl::sparse::init_matrix_handle(&handle_out);

#if defined(INTEL_MKL_VERSION) && (INTEL_MKL_VERSION < 20250300)
    mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols,
                              mkl::index_base::zero, (Long*)csr.row_offset,
                              (Long*)csr.col_index, (T*)csr.mat);
    mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows,
                              mkl::index_base::zero, (Long*)csrt.row_offset,
                              (Long*)csrt.col_index, (T*)csrt.mat);
#else
    mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols, nnz,
                              mkl::index_base::zero, (Long*)csr.row_offset,
                              (Long*)csr.col_index, (T*)csr.mat);
    mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows, nnz,
                              mkl::index_base::zero, (Long*)csrt.row_offset,
                              (Long*)csrt.col_index, (T*)csrt.mat);
#endif

    mkl::sparse::omatcopy(Gpu::Device::streamQueue(), mkl::transpose::trans,
                          handle_in, handle_out);

    mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_in);
    auto ev = mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_out);
    ev.wait();
#else

    auto* p = csrt.row_offset;
    for (Long i = 0; i < ncols+1; ++i) { p[i] = 0; }

    // Count the entries in each column of csr; p[col+1] holds the count
    // for column col.
#ifdef AMREX_USE_OMP
#pragma omp parallel for
#endif
    for (Long i = 0; i < nnz; ++i) {
        auto col = csr.col_index[i];
#ifdef AMREX_USE_OMP
#pragma omp atomic update
#endif
        ++p[col+1];
    }

    // Prefix-sum the counts into offsets; current_pos[c] tracks the next
    // free slot of column c during the scatter below.
    Vector<Long> current_pos(ncols+1);
    for (Long i = 0; i < ncols; ++i) {
        p[i+1] += p[i];
        current_pos[i+1] = p[i+1];
    }

    // Scatter: walking the rows in order keeps the indices within each
    // column of csrt sorted.
    for (Long i = 0; i < nrows; ++i) {
        for (Long idx = csr.row_offset[i]; idx < csr.row_offset[i+1]; ++idx) {
            auto col = csr.col_index[idx];
            Long dest = current_pos[col]++;
            csrt.mat[dest] = csr.mat[idx];
            csrt.col_index[dest] = i;
        }
    }
#endif
}

} // namespace detail
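// A small worked example of the fallback path above (illustrative only):
// for the 2x3 CSR matrix with row_offset = {0, 2, 3}, col_index = {0, 2, 1},
// mat = {a, b, c}, the counting pass finds one entry in each of the three
// columns, the prefix sum gives csrt.row_offset = {0, 1, 2, 3}, and the
// scatter produces csrt.col_index = {0, 1, 0} and csrt.mat = {a, c, b},
// i.e. the 3x2 transpose in CSR form.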
template <typename T, template<typename> class Allocator>
SpMatrix<T,Allocator>::SpMatrix (AlgPartition partition, int nnz_per_row)
    : m_partition(std::move(partition)),
      m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
      m_row_end(m_partition[ParallelDescriptor::MyProc()+1])
{
    define_doit(nnz_per_row);
}
template <typename T, template<typename> class Allocator>
SpMatrix<T,Allocator>::SpMatrix (AlgPartition partition, CSR<T,Allocator> csr)
    : m_partition(std::move(partition)),
      m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
      m_row_end(m_partition[ParallelDescriptor::MyProc()+1]),
      m_csr(std::move(csr))
{}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::define (AlgPartition partition, int nnz_per_row)
{
    m_partition = std::move(partition);
    m_row_begin = m_partition[ParallelDescriptor::MyProc()];
    m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
    define_doit(nnz_per_row);
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::define (AlgPartition partition, CSR<T,Allocator> csr,
                                    CsrSorted is_sorted)
{
    m_partition = std::move(partition);
    m_row_begin = m_partition[ParallelDescriptor::MyProc()];
    m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
    m_csr = std::move(csr);
    if (! is_sorted) { m_csr.sort(); }
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::define_doit (int nnz_per_row)
{
    if (nnz_per_row <= 0) { return; }

    Long nlocalrows = this->numLocalRows();
    m_nnz = nlocalrows*nnz_per_row;
    m_csr.mat.resize(m_nnz);
    m_csr.col_index.resize(m_nnz);
    m_csr.row_offset.resize(nlocalrows+1);

    auto* poffset = m_csr.row_offset.data();
    ParallelForOMP(nlocalrows+1, [=] AMREX_GPU_DEVICE (Long lrow)
    {
        poffset[lrow] = lrow*nnz_per_row;
    });
}
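// With nnz_per_row = 3 and 4 local rows, for instance, the row_offset array
// above becomes {0, 3, 6, 9, 12}: every row gets a fixed slot of 3 entries
// in mat and col_index, to be filled in later (e.g. by setVal).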
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::define_and_filter (AlgPartition partition, T const* mat,
                                               Long const* col_index, Long nentries,
                                               Long const* row_offset)
{
    m_partition = std::move(partition);
    m_row_begin = m_partition[ParallelDescriptor::MyProc()];
    m_row_end = m_partition[ParallelDescriptor::MyProc()+1];

    Long nlocalrows = this->numLocalRows();
    m_csr.mat.resize(nentries);
    m_csr.col_index.resize(nentries);
    m_csr.row_offset.resize(nlocalrows+1);
    m_csr.nnz = nentries;
    Gpu::copyAsync(Gpu::hostToDevice, col_index, col_index + nentries,
                   m_csr.col_index.begin());
    Gpu::copyAsync(Gpu::hostToDevice, row_offset, row_offset + nlocalrows+1,
                   m_csr.row_offset.begin());

    // Use 32-bit prefix sums when the entry count allows it.
    if (nentries < Long(std::numeric_limits<int>::max())) {
        define_and_filter_doit<int>(mat, col_index, nentries, row_offset);
    } else {
        define_and_filter_doit<Long>(mat, col_index, nentries, row_offset);
    }
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::sortCSR ()
{
    m_csr.sort();
}
template <typename T, template<typename> class Allocator>
template <typename I>
void SpMatrix<T,Allocator>::define_and_filter_doit (T const* mat, Long const* col_index,
                                                    Long nentries, Long const* row_offset)
{
    // Exclusive prefix sum over the keep-mask (valid column and nonzero
    // value) gives each kept entry its position in the compacted arrays.
    Gpu::DeviceVector<I> psum(nentries);
    auto* ps = psum.data();
    m_nnz = Scan::PrefixSum<I>(I(nentries),
        [=] AMREX_GPU_DEVICE (I i) -> I {
            return col_index[i] >= 0 && mat[i] != 0; },
        [=] AMREX_GPU_DEVICE (I i, I x) { ps[i] = x; },
        Scan::Type::exclusive, Scan::retSum);

    Long nlocalrows = this->numLocalRows();
    m_csr.mat.resize(m_nnz);
    m_csr.col_index.resize(m_nnz);
    m_csr.row_offset.resize(nlocalrows+1);

    auto* pmat = m_csr.mat.data();
    auto* pcol = m_csr.col_index.data();
    auto* prow = m_csr.row_offset.data();
    auto actual_nnz = m_nnz;
    ParallelFor(nentries, [=] AMREX_GPU_DEVICE (Long i)
    {
        if (col_index[i] >= 0 && mat[i] != 0) {
            pmat[ps[i]] = mat[i];
            pcol[ps[i]] = col_index[i];
        }
        if (i < nlocalrows) {
            prow[i] = ps[row_offset[i]];
            if (i == nlocalrows - 1) {
                prow[nlocalrows] = actual_nnz;
            }
        }
    });
}
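// Illustration of the filtering above: with mat = {3, 7, 0, 4} and
// col_index = {2, -1, 0, 5}, entries 1 (negative column) and 2 (zero value)
// are dropped. The exclusive prefix sum gives ps = {0, 1, 1, 1} and
// m_nnz = 2, so the kept entries land at positions ps[0] = 0 and ps[3] = 1
// of the compacted mat/col_index arrays.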
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::printToFile (std::string const& file) const
{
#ifdef AMREX_USE_GPU
    // Copy the device data to host before writing.
    CSR<T,Vector> csr, csr_r;
    duplicateCSR(Gpu::deviceToHost, csr, m_csr);
    duplicateCSR(Gpu::deviceToHost, csr_r, m_csr_remote);
    Vector<Long> ri_ltor(m_ri_ltor.size());
    Gpu::copyAsync(Gpu::deviceToHost, m_ri_ltor.begin(), m_ri_ltor.end(),
                   ri_ltor.begin());
    Gpu::streamSynchronize();
    auto const& remote_cols = m_remote_cols_v;
#else
    auto const& csr = m_csr;
    auto const& csr_r = m_csr_remote;
    auto const& ri_ltor = m_ri_ltor;
    auto const& remote_cols = m_remote_cols_v;
#endif
    Long nnz = m_csr.nnz;
    if (m_split) {
        nnz += m_csr_remote.nnz;
    }

    std::ofstream ofs(file + "." + std::to_string(ParallelDescriptor::MyProc()));
    ofs << m_row_begin << " " << m_row_end << " " << nnz << "\n";
    for (Long i = 0, nrows = numLocalRows(); i < nrows; ++i) {
        auto nnz_row = csr.row_offset[i+1] - csr.row_offset[i];
        auto const* col = csr.col_index.data() + csr.row_offset[i];
        auto const* mat = csr.mat.data() + csr.row_offset[i];
        for (Long j = 0; j < nnz_row; ++j) {
            ofs << i+m_row_begin << " " << col[j]+m_col_begin << " " << mat[j] << "\n";
        }
        // The remote (off-processor) columns, if the matrix has been split.
        if (i < Long(ri_ltor.size()) && ri_ltor[i] >= 0) {
            Long ii = ri_ltor[i];
            auto nnz_row_r = csr_r.row_offset[ii+1] - csr_r.row_offset[ii];
            auto const* col_r = csr_r.col_index.data() + csr_r.row_offset[ii];
            auto const* mat_r = csr_r.mat.data() + csr_r.row_offset[ii];
            for (Long j = 0; j < nnz_row_r; ++j) {
                ofs << i+m_row_begin << " " << remote_cols[col_r[j]] << " " << mat_r[j] << "\n";
            }
        }
    }
}
template <typename T, template<typename> class Allocator>
template <typename F>
void SpMatrix<T,Allocator>::setVal (F const& f, CsrSorted is_sorted)
{
    Long nlocalrows = this->numLocalRows();
    Long rowbegin = this->globalRowBegin();
    auto* pmat = m_csr.mat.data();
    auto* pcolindex = m_csr.col_index.data();
    auto* prowoffset = m_csr.row_offset.data();
    ParallelForOMP(nlocalrows, [=] AMREX_GPU_DEVICE (Long lrow)
    {
        f(rowbegin+lrow, pcolindex+prowoffset[lrow], pmat+prowoffset[lrow]);
    });
    if (! is_sorted) { m_csr.sort(); }
}
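// Usage sketch (illustrative only; assumes a SpMatrix<Real> A defined with
// define(partition, 3) so that every row owns three slots): the functor
// receives the global row index and pointers into that row's column-index
// and value slots.
//
//     A.setVal([=] AMREX_GPU_DEVICE (Long row, Long* col, Real* val)
//     {
//         col[0] = row-1; val[0] = Real( 1.0);
//         col[1] = row  ; val[1] = Real(-2.0);
//         col[2] = row+1; val[2] = Real( 1.0);
//     }, CsrSorted{true});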
template <typename T, template<typename> class Allocator>
auto SpMatrix<T,Allocator>::diagonalVector () const -> AlgVector<T,AllocT> const&
{
    if (m_diagonal.empty()) {
        m_diagonal.define(this->partition());
        auto* pdiag = m_diagonal.data();
        auto const* mat = m_csr.mat.data();
        auto const* col = m_csr.col_index.data();
        auto const* row = m_csr.row_offset.data();
        // In a split matrix the local block's columns already use local
        // numbering, so no offset is needed in that case.
        auto offset = m_split ? Long(0) : m_row_begin;
        Long nrows = this->numLocalRows();
        ParallelForOMP(nrows, [=] AMREX_GPU_DEVICE (Long i)
        {
            for (Long j = row[i]; j < row[i+1]; ++j) {
                if (i == col[j] - offset) {
                    pdiag[i] = mat[j];
                    break;
                }
            }
        });
    }
    return m_diagonal;
}
template <typename T, template<typename> class Allocator>
auto SpMatrix<T,Allocator>::rowSum () const -> AlgVector<T,AllocT>
{
    AlgVector<T,AllocT> r(this->partition());
    auto* pr = r.data();
    auto const& a = this->const_parcsr();
    ParallelForOMP(this->numLocalRows(), [=] AMREX_GPU_DEVICE (Long i)
    {
        T s = 0;
        for (auto idx = a.csr0.row_offset[i];
             idx < a.csr0.row_offset[i+1]; ++idx) {
            s += a.csr0.mat[idx];
        }
        if (a.csr1.nnz > 0) {
            for (auto idx = a.csr1.row_offset[i];
                 idx < a.csr1.row_offset[i+1]; ++idx) {
                s += a.csr1.mat[idx];
            }
        }
        pr[i] = s;
    });
    return r;
}
template <typename T, template<typename> class Allocator>
ParCsr<T> SpMatrix<T,Allocator>::parcsr ()
{
    return {m_csr.view(),
            m_csr_remote.view(),
            m_row_begin,
            m_col_begin,
            m_ri_rtol.data(),
#ifdef AMREX_USE_GPU
            m_remote_cols_dv.data()
#else
            m_remote_cols_v.data()
#endif
           };
}
template <typename T, template<typename> class Allocator>
ParCsr<T const> SpMatrix<T,Allocator>::const_parcsr () const
{
    return {m_csr.const_view(),
            m_csr_remote.const_view(),
            m_row_begin,
            m_col_begin,
            m_ri_rtol.data(),
#ifdef AMREX_USE_GPU
            m_remote_cols_dv.data()
#else
            m_remote_cols_v.data()
#endif
           };
}
template <typename T, template<typename> class Allocator>
ParCsr<T const> SpMatrix<T,Allocator>::parcsr () const
{
    return this->const_parcsr();
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::startComm_mv (AlgVector<T,AllocT> const& x)
{
#ifndef AMREX_USE_MPI
    amrex::ignore_unused(x);
#else
    if (this->partition().numActiveProcs() <= 1) { return; }

    this->prepare_comm_mv(x.partition());

    MPI_Comm mpi_comm = ParallelContext::CommunicatorSub();
    int mpi_tag = ParallelDescriptor::SeqNum();
    auto mpi_t_type = ParallelDescriptor::Mpi_typemap<T>::type();
    auto const nrecvs = int(m_comm_mv.recv_from.size());
    if (nrecvs > 0) {
        m_comm_mv.recv_reqs.resize(nrecvs, MPI_REQUEST_NULL);
        m_comm_mv.recv_buffer = (T*)The_Comms_Arena()->alloc
            (sizeof(T)*m_comm_mv.total_counts_recv);
        auto* p_recv = m_comm_mv.recv_buffer;
        for (int irecv = 0; irecv < nrecvs; ++irecv) {
            BL_MPI_REQUIRE(MPI_Irecv(p_recv,
                                     m_comm_mv.recv_counts[irecv], mpi_t_type,
                                     m_comm_mv.recv_from[irecv], mpi_tag, mpi_comm,
                                     &(m_comm_mv.recv_reqs[irecv])));
            p_recv += m_comm_mv.recv_counts[irecv];
        }
        AMREX_ASSERT(p_recv == m_comm_mv.recv_buffer + m_comm_mv.total_counts_recv);
    }
    auto const nsends = int(m_comm_mv.send_to.size());
    if (nsends > 0) {
        m_comm_mv.send_reqs.resize(nsends, MPI_REQUEST_NULL);
        m_comm_mv.send_buffer = (T*)The_Comms_Arena()->alloc
            (sizeof(T)*m_comm_mv.total_counts_send);
        pack_buffer_mv(x);
        Gpu::streamSynchronize();
        auto* p_send = m_comm_mv.send_buffer;
        for (int isend = 0; isend < nsends; ++isend) {
            auto count = m_comm_mv.send_counts[isend];
            BL_MPI_REQUIRE(MPI_Isend(p_send, count, mpi_t_type, m_comm_mv.send_to[isend],
                                     mpi_tag, mpi_comm, &(m_comm_mv.send_reqs[isend])));
            p_send += count;
        }
        AMREX_ASSERT(p_send == m_comm_mv.send_buffer + m_comm_mv.total_counts_send);
    }
#endif
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::finishComm_mv (AlgVector<T,AllocT>& y)
{
    if (this->numLocalRows() == 0) { return; }

#ifndef AMREX_USE_MPI
    amrex::ignore_unused(y);
#else
    if (this->partition().numActiveProcs() <= 1) { return; }

    if ( ! m_comm_mv.recv_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_comm_mv.recv_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_mv.recv_reqs.size()),
                                   m_comm_mv.recv_reqs.data(),
                                   mpi_statuses.data()));
    }

    unpack_buffer_mv(y);

    if ( ! m_comm_mv.send_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_comm_mv.send_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_mv.send_reqs.size()),
                                   m_comm_mv.send_reqs.data(),
                                   mpi_statuses.data()));
    }

    Gpu::streamSynchronize();
    if (m_comm_mv.recv_buffer) { The_Comms_Arena()->free(m_comm_mv.recv_buffer); m_comm_mv.recv_buffer = nullptr; }
    if (m_comm_mv.send_buffer) { The_Comms_Arena()->free(m_comm_mv.send_buffer); m_comm_mv.send_buffer = nullptr; }
    m_comm_mv.send_reqs.clear();
    m_comm_mv.recv_reqs.clear();
#endif
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::startComm_tr (AlgPartition const& col_partition)
{
    if (this->partition().numActiveProcs() <= 1) { return; }

    this->split_csr(col_partition);

    // Build the transpose of the remote block; its rows (our remote
    // columns) will be sent back to the ranks that own them.
    if (m_csr_remote.nnz > 0) {
        m_comm_tr.csrt.nnz = m_csr_remote.nnz;
        m_comm_tr.csrt.nrows = m_remote_cols_v.size();
        m_comm_tr.csrt.mat = (T*)The_Pinned_Arena()->alloc
            (sizeof(T)*m_comm_tr.csrt.nnz);
        m_comm_tr.csrt.col_index = (Long*)The_Pinned_Arena()->alloc
            (sizeof(Long)*m_comm_tr.csrt.nnz);
        m_comm_tr.csrt.row_offset = (Long*)The_Pinned_Arena()->alloc
            (sizeof(Long)*(m_comm_tr.csrt.nrows+1));
#ifdef AMREX_USE_GPU
        CSR<T,Gpu::DeviceVector> csr_comm;
        csr_comm.resize(m_comm_tr.csrt.nrows, m_comm_tr.csrt.nnz);
        auto const& csrv_comm = csr_comm.view();
#else
        auto const& csrv_comm = m_comm_tr.csrt;
#endif
        detail::transpose(csrv_comm, m_csr_remote.const_view());
        // Convert the transposed rows' column indices (trimmed remote row
        // numbers of this rank) back to global row indices for the receiver.
        auto row_begin = m_row_begin;
        auto ri_rtol = m_ri_rtol.data();
        auto* col_index = csrv_comm.col_index;
        ParallelForOMP(csrv_comm.nnz, [=] AMREX_GPU_DEVICE (Long idx)
        {
            auto gjt = ri_rtol[col_index[idx]] + row_begin;
            col_index[idx] = gjt;
        });
#ifdef AMREX_USE_GPU
        Gpu::copyAsync(Gpu::deviceToHost, csrv_comm.mat,
                       csrv_comm.mat + csrv_comm.nnz,
                       m_comm_tr.csrt.mat);
        Gpu::copyAsync(Gpu::deviceToHost, csrv_comm.col_index,
                       csrv_comm.col_index + csrv_comm.nnz,
                       m_comm_tr.csrt.col_index);
        Gpu::copyAsync(Gpu::deviceToHost, csrv_comm.row_offset,
                       csrv_comm.row_offset + csrv_comm.nrows+1,
                       m_comm_tr.csrt.row_offset);
#endif
        Gpu::streamSynchronize();
    }
    if (m_num_neighbors < 0) { set_num_neighbors(); }

    MPI_Comm mpi_comm = ParallelContext::CommunicatorSub();
    int mpi_tag = ParallelDescriptor::SeqNum();
    int nprocs = ParallelContext::NProcsSub();
    auto mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type();
    auto mpi_t_type = ParallelDescriptor::Mpi_typemap<T>::type();

    // Tell each neighbor how many matrix entries and rows it will receive.
    Vector<MPI_Request> mpi_requests;
    mpi_requests.reserve(nprocs);
    if (m_csr_remote.nnz > 0) {
        Long it = 0;
        for (int iproc = 0; iproc < nprocs; ++iproc) {
            Long n = 0;
            for (Long i = 0; i < Long(m_remote_cols_vv[iproc].size()); ++i) {
                n += m_comm_tr.csrt.row_offset[it+1] - m_comm_tr.csrt.row_offset[it];
                ++it;
            }
            if (n > 0) {
                mpi_requests.push_back(MPI_REQUEST_NULL);
                std::array<int,2> nn{int(n), int(m_remote_cols_vv[iproc].size())};
                BL_MPI_REQUIRE(MPI_Isend(nn.data(), 2, MPI_INT, iproc, mpi_tag,
                                         mpi_comm, &(mpi_requests.back())));
                m_comm_tr.send_to.push_back(iproc);
                m_comm_tr.send_counts.push_back(nn);
            }
        }
    }
    // Receive the (nnz, nrows) counts from every rank that will send to us.
    m_comm_tr.total_counts_recv = {0, 0};
    for (int irecv = 0; irecv < m_num_neighbors; ++irecv) {
        MPI_Status mpi_status;
        BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
        int sender = mpi_status.MPI_SOURCE;
        std::array<int,2> nn;
        BL_MPI_REQUIRE(MPI_Recv(nn.data(), 2, MPI_INT, sender, mpi_tag,
                                mpi_comm, &mpi_status));
        m_comm_tr.recv_from.push_back(sender);
        m_comm_tr.recv_counts.push_back(nn);
        m_comm_tr.total_counts_recv[0] += nn[0];
        m_comm_tr.total_counts_recv[1] += nn[1];
    }
    if (! mpi_requests.empty()) {
        Vector<MPI_Status> mpi_statuses(mpi_requests.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
                                   mpi_statuses.data()));
    }
    // Post receives for the four pieces of each incoming block: matrix
    // values, column indices, row offsets, and the global row indices
    // (idx_map) the rows belong to.
    auto const nrecvs = int(m_comm_tr.recv_from.size());
    m_comm_tr.recv_reqs.resize(nrecvs*4, MPI_REQUEST_NULL);
    m_comm_tr.recv_buffer_mat = (T*)The_Pinned_Arena()->alloc
        (sizeof(T) * m_comm_tr.total_counts_recv[0]);
    m_comm_tr.recv_buffer_col_index = (Long*)The_Pinned_Arena()->alloc
        (sizeof(Long) * m_comm_tr.total_counts_recv[0]);
    m_comm_tr.recv_buffer_row_offset = (Long*)The_Pinned_Arena()->alloc
        (sizeof(Long) * (m_comm_tr.total_counts_recv[1]+nrecvs));
    m_comm_tr.recv_buffer_idx_map = (Long*)The_Pinned_Arena()->alloc
        (sizeof(Long) * m_comm_tr.total_counts_recv[1]);
    m_comm_tr.recv_buffer_offset.push_back({0,0,0,0});
    for (int irecv = 0; irecv < nrecvs; ++irecv) {
        auto [os0, os1, os2, os3] = m_comm_tr.recv_buffer_offset.back();
        auto [n0, n1] = m_comm_tr.recv_counts[irecv];
        auto recv_from_rank = m_comm_tr.recv_from[irecv];
        BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_mat + os0,
                                 n0, mpi_t_type, recv_from_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.recv_reqs[irecv*4])));
        BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_col_index + os1,
                                 n0, mpi_long, recv_from_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.recv_reqs[irecv*4+1])));
        BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_row_offset + os2,
                                 n1+1, mpi_long, recv_from_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.recv_reqs[irecv*4+2])));
        BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_idx_map + os3,
                                 n1, mpi_long, recv_from_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.recv_reqs[irecv*4+3])));
        m_comm_tr.recv_buffer_offset.push_back({os0 + n0,
                                                os1 + n0,
                                                os2 + n1 + 1,
                                                os3 + n1});
    }
    auto const nsends = int(m_comm_tr.send_to.size());
    m_comm_tr.send_reqs.resize(nsends*4, MPI_REQUEST_NULL);
    Long os0 = 0, os1 = 0;
    for (int isend = 0; isend < nsends; ++isend) {
        auto [n0, n1] = m_comm_tr.send_counts[isend];
        auto send_to_rank = m_comm_tr.send_to[isend];
        BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.mat + os0,
                                 n0, mpi_t_type, send_to_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.send_reqs[isend*4])));
        BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.col_index + os0,
                                 n0, mpi_long, send_to_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.send_reqs[isend*4+1])));
        BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.row_offset + os1,
                                 n1+1, mpi_long, send_to_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.send_reqs[isend*4+2])));
        BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[send_to_rank].data(),
                                 n1, mpi_long, send_to_rank,
                                 mpi_tag, mpi_comm,
                                 &(m_comm_tr.send_reqs[isend*4+3])));
        os0 += n0;
        os1 += n1;
    }
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::finishComm_tr (SpMatrix<T,Allocator>& AT)
{
    if (this->partition().numActiveProcs() <= 1) { return; }

    if (! m_comm_tr.recv_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_comm_tr.recv_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_tr.recv_reqs.size()),
                                   m_comm_tr.recv_reqs.data(),
                                   mpi_statuses.data()));
    }

    AT.unpack_buffer_tr(m_comm_tr, this->partition());

    if (! m_comm_tr.send_reqs.empty()) {
        Vector<MPI_Status> mpi_statuses(m_comm_tr.send_reqs.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_tr.send_reqs.size()),
                                   m_comm_tr.send_reqs.data(),
                                   mpi_statuses.data()));
    }

    if (m_comm_tr.csrt.nnz > 0) {
        The_Pinned_Arena()->free(m_comm_tr.csrt.mat);
        The_Pinned_Arena()->free(m_comm_tr.csrt.col_index);
        The_Pinned_Arena()->free(m_comm_tr.csrt.row_offset);
    }
    if (m_comm_tr.recv_buffer_mat) {
        The_Pinned_Arena()->free(m_comm_tr.recv_buffer_mat);
        The_Pinned_Arena()->free(m_comm_tr.recv_buffer_col_index);
        The_Pinned_Arena()->free(m_comm_tr.recv_buffer_row_offset);
        The_Pinned_Arena()->free(m_comm_tr.recv_buffer_idx_map);
    }
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::split_csr (AlgPartition const& col_partition)
{
    if (m_split) { return; }
    m_split = true;

    m_col_partition = col_partition;
    m_col_begin = m_col_partition[ParallelDescriptor::MyProc()];
    m_col_end = m_col_partition[ParallelDescriptor::MyProc()+1];

    // Exclusive prefix sum over "is this entry's column local?". The sum
    // gives each local entry its position in the local block, and i-ps
    // gives each remote entry its position in the remote block.
    Long local_nnz = 0;
    Gpu::DeviceVector<Long> pfsum(m_nnz);
    auto* p_pfsum = pfsum.data();
    auto col_begin = m_col_begin;
    auto col_end = m_col_end;
    if (m_csr.nnz < Long(std::numeric_limits<int>::max())) {
        auto const* pcol = m_csr.col_index.data();
        local_nnz = Scan::PrefixSum<int>(int(m_nnz),
            [=] AMREX_GPU_DEVICE (int i) -> int {
                return (pcol[i] >= col_begin &&
                        pcol[i] < col_end); },
            [=] AMREX_GPU_DEVICE (int i, int ps) { p_pfsum[i] = ps; },
            Scan::Type::exclusive, Scan::retSum);
    } else {
        auto const* pcol = m_csr.col_index.data();
        local_nnz = Scan::PrefixSum<Long>(m_nnz,
            [=] AMREX_GPU_DEVICE (Long i) -> Long {
                return (pcol[i] >= col_begin &&
                        pcol[i] < col_end); },
            [=] AMREX_GPU_DEVICE (Long i, Long ps) { p_pfsum[i] = ps; },
            Scan::Type::exclusive, Scan::retSum);
    }
    m_csr.nnz = local_nnz;
    Long remote_nnz = m_nnz - local_nnz;
    m_csr_remote.nnz = remote_nnz;

    if (local_nnz != m_nnz) {
        m_csr_remote.mat.resize(remote_nnz);
        m_csr_remote.col_index.resize(remote_nnz);
        decltype(m_csr.mat) new_mat(local_nnz);
        decltype(m_csr.col_index) new_col(local_nnz);
        auto const* pmat = m_csr.mat.data();
        auto const* pcol = m_csr.col_index.data();
        auto* pmat_l = new_mat.data();
        auto* pcol_l = new_col.data();
        auto* pmat_r = m_csr_remote.mat.data();
        auto* pcol_r = m_csr_remote.col_index.data();
        ParallelForOMP(m_nnz, [=] AMREX_GPU_DEVICE (Long i)
        {
            auto ps = p_pfsum[i];
            auto local = (pcol[i] >= col_begin &&
                          pcol[i] < col_end);
            if (local) {
                pmat_l[ps] = pmat[i];
                pcol_l[ps] = pcol[i] - col_begin;
            } else {
                pmat_r[i-ps] = pmat[i];
                pcol_r[i-ps] = pcol[i];
            }
        });

        auto noffset = Long(m_csr.row_offset.size());
        auto* pro = m_csr.row_offset.data();
        m_csr_remote.row_offset.resize(noffset);
        auto* pro_r = m_csr_remote.row_offset.data();
        ParallelForOMP(noffset, [=] AMREX_GPU_DEVICE (Long i)
        {
            if (i < noffset-1) {
                auto ro_l = p_pfsum[pro[i]];
                pro_r[i] = pro[i] - ro_l;
            } else {
                pro_r[i] = remote_nnz;
            }
            pro[i] -= pro_r[i];
        });

        m_csr.mat.swap(new_mat);
        m_csr.col_index.swap(new_col);
        // Trim empty rows from the remote block and build the local<->remote
        // row index maps.
        Long old_size = m_csr_remote.row_offset.size();
        m_ri_ltor.resize(old_size-1);
        m_ri_rtol.resize(old_size-1);
        auto* p_ltor = m_ri_ltor.data();
        auto* p_rtol = m_ri_rtol.data();
        decltype(m_csr_remote.row_offset) trimmed_row_offset(old_size);
        auto const* p_ro = m_csr_remote.row_offset.data();
        auto* p_tro = trimmed_row_offset.data();
        Long new_size;
        if (old_size < Long(std::numeric_limits<int>::max())) {
            new_size = Scan::PrefixSum<int>(int(old_size),
                [=] AMREX_GPU_DEVICE (int i) -> int {
                    if (i+1 < old_size) {
                        return (p_ro[i+1] > p_ro[i]);
                    } else {
                        return 1;
                    }
                },
                [=] AMREX_GPU_DEVICE (int i, int ps)
                {
                    if (i == 0) {
                        p_tro[0] = 0;
                    } else if (p_ro[i] > p_ro[i-1]) {
                        p_tro[ps] = p_ro[i];
                    }
                    if (i+1 < old_size) {
                        if (p_ro[i+1] > p_ro[i]) {
                            p_ltor[i] = ps;
                            p_rtol[ps] = i;
                        } else {
                            p_ltor[i] = -1;
                        }
                    }
                },
                Scan::Type::exclusive, Scan::retSum);
        } else {
            new_size = Scan::PrefixSum<Long>(old_size,
                [=] AMREX_GPU_DEVICE (Long i) -> Long {
                    if (i+1 < old_size) {
                        return (p_ro[i+1] > p_ro[i]);
                    } else {
                        return 1;
                    }
                },
                [=] AMREX_GPU_DEVICE (Long i, Long ps)
                {
                    if (i == 0) {
                        p_tro[0] = 0;
                    } else if (p_ro[i] > p_ro[i-1]) {
                        p_tro[ps] = p_ro[i];
                    }
                    if (i+1 < old_size) {
                        if (p_ro[i+1] > p_ro[i]) {
                            p_ltor[i] = ps;
                            p_rtol[ps] = i;
                        } else {
                            p_ltor[i] = -1;
                        }
                    }
                },
                Scan::Type::exclusive, Scan::retSum);
        }
        m_ri_rtol.resize(new_size-1);
        trimmed_row_offset.resize(new_size);
        m_ri_rtol.shrink_to_fit();
        trimmed_row_offset.shrink_to_fit();
        m_csr_remote.row_offset.swap(trimmed_row_offset);
    } else if (col_begin > 0) {
        // Everything is local; just shift the column indices to local numbering.
        auto* pcol = m_csr.col_index.data();
        ParallelForOMP(m_nnz, [=] AMREX_GPU_DEVICE (Long i)
        {
            pcol[i] -= col_begin;
        });
    }

    update_remote_col_index(m_csr_remote, true);
}
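// Example of the split (illustrative): a rank owning columns [4,8) with
// col_index = {5, 2, 6, 9} keeps entries 0 and 2 locally (local_nnz = 2),
// stored with local column numbers {1, 2}; entries {2, 9} go to
// m_csr_remote with their global column numbers. Remote rows that end up
// empty are trimmed away, and m_ri_ltor / m_ri_rtol translate between local
// row numbers and the trimmed remote row numbers.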
template <typename T, template<typename> class Allocator>
template <typename C>
void SpMatrix<T,Allocator>::update_remote_col_index (C& csrr, bool in_device_memory)
{
    int nprocs = ParallelContext::NProcsSub();
    m_remote_cols_v.clear();
    m_remote_cols_vv.clear();
    m_remote_cols_vv.resize(nprocs);
#ifdef AMREX_USE_GPU
    m_remote_cols_dv.clear();
#endif

    if (csrr.nnz == 0) { return; }

    // Collect the remote global column indices on the host.
    if (in_device_memory) {
        m_remote_cols_v.resize(csrr.nnz);
        Gpu::copyAsync(Gpu::deviceToHost,
                       csrr.col_index.begin(),
                       csrr.col_index.end(),
                       m_remote_cols_v.begin());
        Gpu::streamSynchronize();
    } else {
        m_remote_cols_v.assign(csrr.col_index.begin(),
                               csrr.col_index.end());
    }
    RemoveDuplicates(m_remote_cols_v); // sorted unique column indices

#ifdef AMREX_USE_GPU
    m_remote_cols_dv.resize(m_remote_cols_v.size());
    Gpu::copyAsync(Gpu::hostToDevice,
                   m_remote_cols_v.begin(),
                   m_remote_cols_v.end(),
                   m_remote_cols_dv.data());
#endif

    // Bin the remote columns by owner rank using the column partition.
    auto const& cp = this->m_col_partition.dataVector();
    AMREX_ASSERT(m_remote_cols_v.front() >= cp.front() &&
                 m_remote_cols_v.back() < cp.back());
    auto it = cp.cbegin();
    for (auto c : m_remote_cols_v) {
        it = std::find_if(it, cp.cend(), [&] (auto x) { return x > c; });
        if (it != cp.cend()) {
            int iproc = int(std::distance(cp.cbegin(),it)) - 1;
            m_remote_cols_vv[iproc].push_back(c);
        } else {
            amrex::Abort("SpMatrix::update_remote_col_index: how did this happen?");
        }
    }

    // Map the global remote column indices to their local positions.
    std::map<Long,Long> gtol;
    for (Long i = 0, N = Long(m_remote_cols_v.size()); i < N; ++i) {
        gtol[m_remote_cols_v[i]] = i;
    }
    if (in_device_memory) {
        Vector<Long> host_col_index(csrr.nnz);
        Gpu::copyAsync(Gpu::deviceToHost,
                       csrr.col_index.begin(),
                       csrr.col_index.end(),
                       host_col_index.begin());
        Gpu::streamSynchronize();
        for (auto& c : host_col_index) {
            c = gtol[c];
        }
        Gpu::copyAsync(Gpu::hostToDevice,
                       host_col_index.begin(),
                       host_col_index.end(),
                       csrr.col_index.begin());
        Gpu::streamSynchronize();
    } else {
        for (auto& c : csrr.col_index) {
            c = gtol[c];
        }
    }
}
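// For instance, remote global columns {2, 9, 17} get gtol entries
// 2 -> 0, 9 -> 1, 17 -> 2, so after the remap the remote CSR block indexes
// the compact receive buffer rather than the global column space.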
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::set_num_neighbors ()
{
    if (m_num_neighbors >= 0) { return; }

    MPI_Comm mpi_comm = ParallelContext::CommunicatorSub();
    int nprocs = ParallelContext::NProcsSub();
    auto mpi_int = ParallelDescriptor::Mpi_typemap<int>::type();
    // connection[iproc] == 1 if this rank needs data from iproc.
    Vector<int> connection(nprocs, 0);
    Vector<int> reduce_scatter_counts(nprocs, 1);
    for (int iproc = 0; iproc < nprocs; ++iproc) {
        connection[iproc] = m_remote_cols_vv[iproc].empty() ? 0 : 1;
    }
    // Each rank learns how many ranks need data from it.
    m_num_neighbors = 0;
    BL_MPI_REQUIRE(MPI_Reduce_scatter
                   (connection.data(), &m_num_neighbors, reduce_scatter_counts.data(),
                    mpi_int, MPI_SUM, mpi_comm));
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::prepare_comm_mv (AlgPartition const& col_partition)
{
    if (m_comm_mv.prepared) { return; }

    this->split_csr(col_partition);

    MPI_Comm mpi_comm = ParallelContext::CommunicatorSub();
    int mpi_tag = ParallelDescriptor::SeqNum();
    int nprocs = ParallelContext::NProcsSub();
    auto mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type();

    if (m_num_neighbors < 0) { set_num_neighbors(); }

    // Tell each owner rank which of its rows this rank needs.
    Vector<MPI_Request> mpi_requests;
    mpi_requests.reserve(nprocs);
    for (int iproc = 0; iproc < nprocs; ++iproc) {
        if ( ! m_remote_cols_vv[iproc].empty()) {
            mpi_requests.push_back(MPI_REQUEST_NULL);
            BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[iproc].data(),
                                     int(m_remote_cols_vv[iproc].size()),
                                     mpi_long, iproc, mpi_tag, mpi_comm,
                                     &(mpi_requests.back())));
            m_comm_mv.recv_from.push_back(iproc);
            m_comm_mv.recv_counts.push_back(int(m_remote_cols_vv[iproc].size()));
        }
    }
    m_comm_mv.total_counts_recv = Long(m_remote_cols_v.size());

    // Receive the requested row indices from every rank that needs our data.
    Vector<Vector<Long>> send_indices(m_num_neighbors);
    m_comm_mv.total_counts_send = 0;
    for (int isend = 0; isend < m_num_neighbors; ++isend) {
        MPI_Status mpi_status;
        BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
        int receiver = mpi_status.MPI_SOURCE;
        int count;
        BL_MPI_REQUIRE(MPI_Get_count(&mpi_status, mpi_long, &count));
        m_comm_mv.send_to.push_back(receiver);
        m_comm_mv.send_counts.push_back(count);
        send_indices[isend].resize(count);
        BL_MPI_REQUIRE(MPI_Recv(send_indices[isend].data(), count, mpi_long,
                                receiver, mpi_tag, mpi_comm, &mpi_status));
        m_comm_mv.total_counts_send += count;
    }

    m_comm_mv.send_indices.resize(m_comm_mv.total_counts_send);
    Vector<Long> send_indices_all;
    send_indices_all.reserve(m_comm_mv.total_counts_send);
    for (auto const& vl : send_indices) {
        for (auto v : vl) {
            send_indices_all.push_back(v);
        }
    }
    Gpu::copyAsync(Gpu::hostToDevice, send_indices_all.begin(), send_indices_all.end(),
                   m_comm_mv.send_indices.begin());

    if (! mpi_requests.empty()) {
        Vector<MPI_Status> mpi_statuses(mpi_requests.size());
        BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
                                   mpi_statuses.data()));
    }

    m_comm_mv.prepared = true;
}
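// The exchange above is the usual sparse data-exchange handshake: every rank
// Isends the list of global row indices of x it needs to the owning rank,
// and each owner Probes m_num_neighbors times to learn who wants its data
// and which rows to pack on each subsequent startComm_mv() call.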
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::pack_buffer_mv (AlgVector<T,AllocT> const& v)
{
    auto* pdst = m_comm_mv.send_buffer;
    auto* pidx = m_comm_mv.send_indices.data();
    auto const& vv = v.view();
    auto const nsends = Long(m_comm_mv.send_indices.size());
    ParallelForOMP(nsends, [=] AMREX_GPU_DEVICE (Long i)
    {
        pdst[i] = vv(pidx[i]);
    });
}
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::unpack_buffer_mv (AlgVector<T,AllocT>& y)
{
    auto const& csr = m_csr_remote;
    if (csr.nnz == 0) { return; }

    auto const* mat = csr.mat.data();
    auto const* col = csr.col_index.data();
    auto const* row = csr.row_offset.data();
    auto const* px = m_comm_mv.recv_buffer;
    auto const* rtol = m_ri_rtol.data();
    auto* py = y.data();

    auto const nrr = Long(csr.row_offset.size())-1;
    ParallelForOMP(nrr, [=] AMREX_GPU_DEVICE (Long i)
    {
        T r = 0;
        for (Long j = row[i]; j < row[i+1]; ++j) {
            r += mat[j] * px[col[j]];
        }
        py[rtol[i]] += r;
    });
}
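// Together with pack_buffer_mv, this realizes the split SpMV: the local
// block can be multiplied while the remote values of x are in flight, and
// the loop above then folds mat[j] * px[col[j]] from the receive buffer into
// y at the trimmed rows' local positions (rtol).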
template <typename T, template<typename> class Allocator>
void SpMatrix<T,Allocator>::unpack_buffer_tr (CommTR const& ctr, AlgPartition const& col_partition)
{
    m_col_partition = col_partition;
    m_col_begin = m_col_partition[ParallelDescriptor::MyProc()];
    m_col_end = m_col_partition[ParallelDescriptor::MyProc()+1];
    m_split = true;

    m_ri_ltor.resize(m_csr.nrows(), -1);

    Long nnz = ctr.total_counts_recv[0];
    if (nnz == 0) { return; }

    // Merge the received blocks. The global row indices of all senders
    // (recv_buffer_idx_map) are combined into one sorted unique list.
    auto const nb = int(ctr.recv_from.size());
    Vector<Long> ri_map(ctr.recv_buffer_idx_map,
                        ctr.recv_buffer_idx_map + ctr.total_counts_recv[1]);
    RemoveDuplicates(ri_map);
    Long const nrows = ri_map.size();

    auto& csrr = m_csr_remote;
    csrr.mat.resize(nnz);
    csrr.col_index.resize(nnz);
    csrr.row_offset.resize(nrows+1);
    csrr.nnz = nnz;

    // First pass: count how many entries each merged row receives.
    Vector<int> row_nnz(nrows, 0);
    for (int i = 0; i < nb; ++i) {
        auto [os0, os1, os2, os3] = ctr.recv_buffer_offset[i];
        amrex::ignore_unused(os0, os1);
        int const nrow_i = ctr.recv_counts[i][1];
        Long const* idx_map = ctr.recv_buffer_idx_map + os3;
        Long const* row_offset = ctr.recv_buffer_row_offset + os2;
        Long p = 0;
        for (int lr = 0; lr < nrow_i; ++lr) {
            Long const gr = idx_map[lr];
            while (p < nrows && ri_map[p] < gr) { ++p; }
            AMREX_ASSERT(p < nrows && ri_map[p] == gr);
            row_nnz[p] += int(row_offset[lr+1] - row_offset[lr]);
        }
    }
    csrr.row_offset[0] = 0;
    std::partial_sum(row_nnz.begin(), row_nnz.end(), csrr.row_offset.begin()+1);

    // Second pass: copy each block's rows into their merged positions.
    Vector<Long> rowpos(csrr.row_offset.begin(), csrr.row_offset.end()-1);
    for (int i = 0; i < nb; ++i) {
        auto [os0, os1, os2, os3] = ctr.recv_buffer_offset[i];
        int const nrow_i = ctr.recv_counts[i][1];
        Long const* idx_map = ctr.recv_buffer_idx_map + os3;
        Long const* row_offset = ctr.recv_buffer_row_offset + os2;
        T const* mat = ctr.recv_buffer_mat + os0;
        Long const* col_index = ctr.recv_buffer_col_index + os1;
        Long p = 0;
        for (int lr = 0; lr < nrow_i; ++lr) {
            Long const gr = idx_map[lr];
            while (p < nrows && ri_map[p] < gr) { ++p; }
            auto os_src = row_offset[lr] - row_offset[0];
            auto nvals = row_offset[lr+1] - row_offset[lr];
            auto os_dst = rowpos[p];
            std::memcpy(csrr.mat.data()+os_dst, mat+os_src,
                        sizeof(T)*nvals);
            std::memcpy(csrr.col_index.data()+os_dst, col_index+os_src,
                        sizeof(Long)*nvals);
            rowpos[p] += nvals;
        }
    }

    // Convert the merged global row indices to local row numbers and build
    // the index maps.
    m_ri_rtol.resize(nrows);
    std::copy(ri_map.begin(), ri_map.end(), m_ri_rtol.begin());
    auto* rtol = m_ri_rtol.data();
    auto row_begin = m_row_begin;
    for (Long i = 0; i < nrows; ++i) {
        rtol[i] -= row_begin;
        m_ri_ltor[rtol[i]] = i;
    }

    update_remote_col_index(csrr, false);
}
#endif /* AMREX_SP_MATRIX_H_ */