Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_SpMatrix.H
Go to the documentation of this file.
1#ifndef AMREX_SP_MATRIX_H_
2#define AMREX_SP_MATRIX_H_
3#include <AMReX_Config.H>
4
6#include <AMReX_AlgVector.H>
7#include <AMReX_CSR.H>
8#include <AMReX_Gpu.H>
9#include <AMReX_Scan.H>
10
11#if defined(AMREX_USE_CUDA)
12# include <cusparse.h>
13#elif defined(AMREX_USE_HIP)
14# include <rocsparse/rocsparse.h>
15#elif defined(AMREX_USE_SYCL)
16# include <mkl_version.h>
17# include <oneapi/mkl/spblas.hpp>
18#endif
19
20#include <fstream>
21#include <string>
22#include <type_traits>
23#include <unordered_map>
24
25namespace amrex {
26
34template <typename T>
35struct ParCsr {
36 CsrView<T> csr0; // diagonal part
37 CsrView<T> csr1; // off-diagonal part
38 Long row_begin = 0; // global row index begin
39 Long col_begin = 0; // global col index begin
40 Long const* AMREX_RESTRICT row_map = nullptr; // mapping from csr0's row to csr1
41 Long const* AMREX_RESTRICT col_map = nullptr; // mapping from csr1's col to global
42};
43
/**
 * \brief Named boolean flag stating whether CSR input data are already
 * sorted. When false, SpMatrix::define/setVal call m_csr.sort() after
 * ingesting the data. explicit operator bool prevents accidental implicit
 * conversions at call sites.
 *
 * NOTE(review): restored from an extraction-garbled listing (fused doxygen
 * line numbers). Kept an aggregate so `CsrSorted{false}` still works.
 */
struct CsrSorted {
    bool b = true;
    explicit operator bool() const { return b; }
};
49
/**
 * \brief Named boolean flag stating whether raw CSR input is already valid
 * (no negative column indices, no explicit zero entries). When false,
 * SpMatrix::define filters the input via define_and_filter_doit. explicit
 * operator bool prevents accidental implicit conversions at call sites.
 *
 * NOTE(review): restored from an extraction-garbled listing (fused doxygen
 * line numbers). Kept an aggregate so `CsrValid{false}` still works.
 */
struct CsrValid {
    bool b = true;
    explicit operator bool() const { return b; }
};
55
// NOTE(review): extraction artifact -- a doxygen listing number is fused to the
// front of every line below, and several lines were dropped outright (including
// the "class SpMatrix" line itself after this template header, one constructor
// declaration, and the friend function declarations).  The code is reproduced
// byte-identical here; restore this block from the upstream AMReX sources
// before attempting to compile.
59template <typename T, template<typename> class Allocator = DefaultAllocator>
61{
62public:
63 using value_type = T;
64 template <class U> using allocator_type = Allocator<U>;
65 template <class U> using container_type = PODVector<U,Allocator<U> >;
67 using AllocT = Allocator<T>;
68
69 SpMatrix () = default;
70
// Construct with a fixed number of nonzeros per local row (see define()).
89 SpMatrix (AlgPartition partition, int nnz_per_row);
90
102
// Non-copyable: the matrix owns large CSR buffers.
103 SpMatrix (SpMatrix const&) = delete;
104 SpMatrix& operator= (SpMatrix const&) = delete;
105
106 SpMatrix (SpMatrix &&) = default;
108
109 ~SpMatrix () = default;
110
125 void define (AlgPartition partition, int nnz_per_row);
126
// Define from raw device CSR arrays; is_valid=false triggers filtering of
// negative column indices / zero values, is_sorted=false triggers a sort.
159 void define (AlgPartition partition, T const* mat, Long const* col_index,
160 Long nentries, Long const* row_offset, CsrSorted is_sorted,
161 CsrValid is_valid);
162
174 void define (AlgPartition partition, csr_type csr, CsrSorted is_sorted);
175
// Row partition of the matrix across processes.
177 [[nodiscard]] AlgPartition const& partition () const { return m_partition; }
178
186 [[nodiscard]] AlgPartition const& columnPartition () const { return m_col_partition; }
187
189 [[nodiscard]] Long numLocalRows () const { return m_row_end - m_row_begin; }
191 [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); }
193 [[nodiscard]] Long numLocalNonZeros () const { return m_nnz; }
194
196 [[nodiscard]] Long globalRowBegin () const { return m_row_begin; }
198 [[nodiscard]] Long globalRowEnd () const { return m_row_end; }
199
// Raw CSR access.  Only legal before the matrix has been split into
// diagonal/off-diagonal parts (m_split == false).
201 [[nodiscard]] T* data () {
202 AMREX_ALWAYS_ASSERT(m_split == false);
203 return m_csr.mat.data();
204 }
205
207 [[nodiscard]] Long* columnIndex () {
208 AMREX_ALWAYS_ASSERT(m_split == false);
209 return m_csr.col_index.data();
210 }
211
213 [[nodiscard]] Long* rowOffset () {
214 AMREX_ALWAYS_ASSERT(m_split == false);
215 return m_csr.row_offset.data();
216 }
217
218 /*
219 * \brief Print the matrix to files.
220 *
221 * Each process writes its local portion of the matrix to a separate
222 * file. This function is provided for debugging purpose. Using it in
223 * large-scale runs may overwhelm the file system.
224 *
225 * File format:
226 * - The first line contains three integers describing the local matrix:
227 * the global row begin index, the global row end index, and the
228 * number of local nonzeros.
229 * - This is followed by one line per nonzero entry. Each line contains:
230 * the global row index, global column index, and matrix value.
231 *
232 * \param file Base file name. The full name on process `i` is `{file}.{i}`.
233 */
234 void printToFile (std::string const& file) const;
235
// Fill values via callback f(global_row, col_index_ptr, mat_ptr).
258 template <typename F>
259 void setVal (F const& f, CsrSorted is_sorted);
260
263 void sortCSR ();
264
// Cached diagonal; computed lazily (see m_diagonal below).
270 [[nodiscard]] AlgVector<T,AllocT> const& diagonalVector () const;
271
278 [[nodiscard]] AlgVector<T,AllocT> rowSum () const;
279
285 [[nodiscard]] ParCsr<T > parcsr () ;
287 [[nodiscard]] ParCsr<T const> parcsr () const;
289 [[nodiscard]] ParCsr<T const> const_parcsr () const;
290
// NOTE(review): the friend function declarations that followed these two
// template headers were lost in extraction (likely SpMV / triple-product
// style free functions -- confirm against upstream).
291 template <typename U, template<typename> class M, typename N> friend
293
294 template <typename U, template<typename> class M> friend
296
297 template <typename U> friend class AMG;
298
300 void define_doit (int nnz_per_row);
301
311 template <typename I>
312 void define_and_filter_doit (T const* mat, Long const* col_index,
313 Long nentries, Long const* row_offset);
314
// Begin/end of the halo exchange used by matrix-vector products.
320 void startComm_mv (AlgVector<T,AllocT> const& x);
323
325 void startComm_tr (AlgPartition const& col_partition);
328
329private:
330
331 void set_num_neighbors ();
332
333 AlgPartition m_partition;
334 AlgPartition m_col_partition;
335 Long m_row_begin = 0;
336 Long m_row_end = 0;
337 Long m_col_begin = 0;
338 Long m_col_end = 0;
339 Long m_nnz = 0;
340 csr_type m_csr;
341
// Lazily filled by diagonalVector(); mutable because that accessor is const.
342 mutable AlgVector<T,AllocT> m_diagonal;
343
344 bool m_split = false; // Has the matrix been split into diagonal and off-diagonal parts?
345
346#ifdef AMREX_USE_MPI
347 csr_type m_csr_remote;
348
349 // It should be noted that m_csr and m_csr_remote may have different
350 // number of rows, because some rows may be purely local and they do not
351 // appear in m_csr_remote. Thus, we need a mapping from local row index
352 // in m_csr_remote to local row index in m_csr. Then we will be able to
353 // know its global row index by adding m_row_begin. For example,
354 // m_row_begin + m_ri_rtol[i] is the global index for local row i in
355 // m_csr_remote.
356 container_type<Long> m_ri_rtol; // size: m_csr_remote.nrows()
357
358 // This is local row index mapping from m_csr to m_csr_remote. -1 means
359 // the row does not exist in m_csr_remote.
360 container_type<Long> m_ri_ltor; // size: m_csr.nrows()
361
362 // For column index, we also need to be careful with local vs. global,
363 // and there two types of locals: local in m_csr and local in
364 // m_csr_remote. For m_csr, col_index is the global index if m_split is
365 // false, and it becomes the local index if m_split is true. The
366 // conversion is global_col_index = local_col_index + m_col_begin.
367 //
368 // For m_csr_remote, col_index is also local. For a give col_index j,
369 // m_remote_cols_v[j] gives us the global index.
370 Vector<Long> m_remote_cols_v;
371#ifdef AMREX_USE_GPU
372 container_type<Long> m_remote_cols_dv;
373#endif
374 // The size of outer vector is # of procs. The indices are global.
375 Vector<Vector<Long>> m_remote_cols_vv;
376
377 int m_num_neighbors = -1; // No. of other processes involved in communication
378
379public: // NOLINT Private functions, but public for cuda
380
399
400 struct CommTR {
401 CsrView<T> csrt; // Own the memory inside
402
406
410
411 std::array<Long,2> total_counts_recv = {0,0};
413 // The raw pointers below are owning.
414 // xxxxx TODO GPU: Currently we use pinned memory. In the future, we
415 // may explore device memory for GPU aware MPI.
416 T* recv_buffer_mat = nullptr;
419 Long* recv_buffer_idx_map = nullptr; // local -> global for row of A^T.
420
422
423 void split_csr (AlgPartition const& col_partition);
424 template <typename C>
425 void update_remote_col_index (C& csrr, bool in_device_memory);
426 void prepare_comm_mv (AlgPartition const& col_partition);
427 void pack_buffer_mv (AlgVector<T,AllocT> const& v);
429 void unpack_buffer_tr (CommTR const& ctr, AlgPartition const& col_partition);
430
431#endif
432};
433
434namespace detail {
// NOTE(review): extraction artifact -- a doxygen listing number is fused to the
// front of every line, and several lines were dropped: the macro-call openers
// at listing lines 491 and 502 (evidenced by the orphaned `));` terminators of
// the two cusparse calls, presumably `AMREX_CUSPARSE_SAFE_CALL(`), listing line
// 644 in the SYCL section, and listing lines 521/626/669 (presumably stream
// synchronization before the arena frees -- TODO confirm against upstream).
// Transposes csr into the caller-allocated view csrt (csrt.nrows is the column
// count of csr; csrt buffers must already be sized).  Code is byte-identical.
435template <typename T>
436void transpose (CsrView<T> const& csrt, CsrView<T const> const& csr)
437{
438 Long nrows = csr.nrows;
439 Long ncols = csrt.nrows;
440 Long nnz = csr.nnz;
441
// Degenerate input: just zero the transposed row offsets (if any) and return.
442 if (nrows <= 0 || ncols <= 0 || nnz <= 0) {
443 if (ncols > 0) {
444 auto* p = csrt.row_offset;
445 ParallelForOMP(ncols+1, [=] AMREX_GPU_DEVICE (Long i) { p[i] = 0; });
446 }
447 return;
448 }
449
450#ifdef AMREX_USE_GPU
451
452#if defined(AMREX_USE_CUDA)
453
454 cusparseHandle_t handle;
455 AMREX_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));
456 AMREX_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, Gpu::gpuStream()));
457
458 cudaDataType data_type;
459 if constexpr (std::is_same_v<T,float>) {
460 data_type = CUDA_R_32F;
461 } else if constexpr (std::is_same_v<T,double>) {
462 data_type = CUDA_R_64F;
463 } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
464 data_type = CUDA_C_32F;
465 } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
466 data_type = CUDA_C_64F;
467 } else {
468 amrex::Abort("SpMatrix transpose: unsupported data type");
469 }
470
// cusparseCsr2cscEx2 uses 32-bit indices; guard against overflow.
471 AMREX_ALWAYS_ASSERT(nrows < Long(std::numeric_limits<int>::max()) &&
472 ncols < Long(std::numeric_limits<int>::max()) &&
473 nnz < Long(std::numeric_limits<int>::max()));
474
// Temporary int copies of the Long index arrays for the cusparse API.
475 auto* csr_col_index = (int*)The_Arena()->alloc(csr.nnz*sizeof(int));
476 auto* csr_row_offset = (int*)The_Arena()->alloc((csr.nrows+1)*sizeof(int));
477 auto* csrt_col_index = (int*)The_Arena()->alloc(csrt.nnz*sizeof(int));
478 auto* csrt_row_offset = (int*)The_Arena()->alloc((csrt.nrows+1)*sizeof(int));
479
480 ParallelFor(std::max(csr.nnz, csr.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
481 {
482 if (i < csr.nnz) {
483 csr_col_index[i] = int(csr.col_index[i]);
484 }
485 if (i < csr.nrows+1) {
486 csr_row_offset[i] = int(csr.row_offset[i]);
487 }
488 });
489
490 std::size_t buffer_size;
492 cusparseCsr2cscEx2_bufferSize(handle, int(nrows), int(ncols), int(nnz),
493 csr.mat, csr_row_offset, csr_col_index,
494 csrt.mat, csrt_row_offset, csrt_col_index,
495 data_type, CUSPARSE_ACTION_NUMERIC,
496 CUSPARSE_INDEX_BASE_ZERO,
497 CUSPARSE_CSR2CSC_ALG1,
498 &buffer_size));
499
500 auto* pbuffer = (void*)The_Arena()->alloc(buffer_size);
501
503 cusparseCsr2cscEx2(handle, int(nrows), int(ncols), int(nnz),
504 csr.mat, csr_row_offset, csr_col_index,
505 csrt.mat, csrt_row_offset, csrt_col_index,
506 data_type, CUSPARSE_ACTION_NUMERIC,
507 CUSPARSE_INDEX_BASE_ZERO,
508 CUSPARSE_CSR2CSC_ALG1,
509 pbuffer));
510
// Widen the int results back into the Long output arrays.
511 ParallelFor(std::max(csrt.nnz, csrt.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
512 {
513 if (i < csrt.nnz) {
514 csrt.col_index[i] = csrt_col_index[i];
515 }
516 if (i < csrt.nrows+1) {
517 csrt.row_offset[i] = csrt_row_offset[i];
518 }
519 });
520
522 AMREX_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
523 The_Arena()->free(pbuffer);
524 The_Arena()->free(csr_row_offset);
525 The_Arena()->free(csr_col_index);
526 The_Arena()->free(csrt_row_offset);
527 The_Arena()->free(csrt_col_index);
528
529#elif defined(AMREX_USE_HIP)
530
531 rocsparse_handle handle;
532 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_create_handle(&handle));
533 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, Gpu::gpuStream()));
534
535 AMREX_ALWAYS_ASSERT(nrows < Long(std::numeric_limits<rocsparse_int>::max()) &&
536 ncols < Long(std::numeric_limits<rocsparse_int>::max()) &&
537 nnz < Long(std::numeric_limits<rocsparse_int>::max()));
538
// If rocsparse_int happens to be Long, reuse the buffers in place;
// otherwise stage narrowed copies in arena memory.
539 rocsparse_int* csr_col_index;
540 rocsparse_int* csr_row_offset;
541 rocsparse_int* csrt_col_index;
542 rocsparse_int* csrt_row_offset;
543 if (std::is_same_v<rocsparse_int,Long>) {
544 csr_col_index = (rocsparse_int*)csr.col_index;
545 csr_row_offset = (rocsparse_int*)csr.row_offset;
546 csrt_col_index = (rocsparse_int*)csrt.col_index;
547 csrt_row_offset = (rocsparse_int*)csrt.row_offset;
548 } else {
549 csr_col_index = (rocsparse_int*)The_Arena()->alloc(csr.nnz*sizeof(rocsparse_int));
550 csr_row_offset = (rocsparse_int*)The_Arena()->alloc((csr.nrows+1)*sizeof(rocsparse_int));
551 csrt_col_index = (rocsparse_int*)The_Arena()->alloc(csrt.nnz*sizeof(rocsparse_int));
552 csrt_row_offset = (rocsparse_int*)The_Arena()->alloc((csrt.nrows+1)*sizeof(rocsparse_int));
553 ParallelFor(std::max(csr.nnz, csr.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
554 {
555 if (i < csr.nnz) {
556 csr_col_index[i] = rocsparse_int(csr.col_index[i]);
557 }
558 if (i < csr.nrows+1) {
559 csr_row_offset[i] = rocsparse_int(csr.row_offset[i]);
560 }
561 });
562 }
563
564 std::size_t buffer_size;
565 AMREX_ROCSPARSE_SAFE_CALL(
566 rocsparse_csr2csc_buffer_size(handle, rocsparse_int(nrows),
567 rocsparse_int(ncols), rocsparse_int(nnz),
568 csr_row_offset, csr_col_index,
569 rocsparse_action_numeric,
570 &buffer_size));
571
572 auto* pbuffer = (void*)The_Arena()->alloc(buffer_size);
573
574 if constexpr (std::is_same_v<T,float>) {
575 AMREX_ROCSPARSE_SAFE_CALL(
576 rocsparse_scsr2csc(handle, rocsparse_int(nrows),
577 rocsparse_int(ncols), rocsparse_int(nnz),
578 csr.mat, csr_row_offset, csr_col_index,
579 csrt.mat, csrt_col_index, csrt_row_offset,
580 rocsparse_action_numeric,
581 rocsparse_index_base_zero,
582 pbuffer));
583 } else if constexpr (std::is_same_v<T,double>) {
584 AMREX_ROCSPARSE_SAFE_CALL(
585 rocsparse_dcsr2csc(handle, rocsparse_int(nrows),
586 rocsparse_int(ncols), rocsparse_int(nnz),
587 csr.mat, csr_row_offset, csr_col_index,
588 csrt.mat, csrt_col_index, csrt_row_offset,
589 rocsparse_action_numeric,
590 rocsparse_index_base_zero,
591 pbuffer));
592 } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
593 AMREX_ROCSPARSE_SAFE_CALL(
594 rocsparse_ccsr2csc(handle, rocsparse_int(nrows),
595 rocsparse_int(ncols), rocsparse_int(nnz),
596 (rocsparse_float_complex*)csr.mat, csr_row_offset, csr_col_index,
597 (rocsparse_float_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
598 rocsparse_action_numeric,
599 rocsparse_index_base_zero,
600 pbuffer));
601 } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
602 AMREX_ROCSPARSE_SAFE_CALL(
603 rocsparse_zcsr2csc(handle, rocsparse_int(nrows),
604 rocsparse_int(ncols), rocsparse_int(nnz),
605 (rocsparse_double_complex*)csr.mat, csr_row_offset, csr_col_index,
606 (rocsparse_double_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
607 rocsparse_action_numeric,
608 rocsparse_index_base_zero,
609 pbuffer));
610 } else {
611 amrex::Abort("SpMatrix transpose: unsupported data type");
612 }
613
614 if (! std::is_same_v<rocsparse_int,Long>) {
615 ParallelFor(std::max(csrt.nnz, csrt.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
616 {
617 if (i < csrt.nnz) {
618 csrt.col_index[i] = csrt_col_index[i];
619 }
620 if (i < csrt.nrows+1) {
621 csrt.row_offset[i] = csrt_row_offset[i];
622 }
623 });
624 }
625
627 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_destroy_handle(handle));
628 The_Arena()->free(pbuffer);
629 if (! std::is_same_v<rocsparse_int,Long>) {
630 The_Arena()->free(csr_row_offset);
631 The_Arena()->free(csr_col_index);
632 The_Arena()->free(csrt_row_offset);
633 The_Arena()->free(csrt_col_index);
634 }
635
636#elif defined(AMREX_USE_SYCL)
637
638 mkl::sparse::matrix_handle_t handle_in{};
639 mkl::sparse::matrix_handle_t handle_out{};
640 mkl::sparse::init_matrix_handle(&handle_in);
641 mkl::sparse::init_matrix_handle(&handle_out);
642
// oneMKL < 2025.3 uses the old set_csr_data signature without an nnz argument.
643#if defined(INTEL_MKL_VERSION) && (INTEL_MKL_VERSION < 20250300)
645 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols,
646 mkl::index_base::zero, (Long*)csr.row_offset,
647 (Long*)csr.col_index, (T*)csr.mat);
648 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows,
649 mkl::index_base::zero, (Long*)csrt.row_offset,
650 (Long*)csrt.col_index, (T*)csrt.mat);
651#else
652 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols, nnz,
653 mkl::index_base::zero, (Long*)csr.row_offset,
654 (Long*)csr.col_index, (T*)csr.mat);
655 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows, nnz,
656 mkl::index_base::zero, (Long*)csrt.row_offset,
657 (Long*)csrt.col_index, (T*)csrt.mat);
658#endif
659
660 mkl::sparse::omatcopy(Gpu::Device::streamQueue(), mkl::transpose::trans,
661 handle_in, handle_out);
662
663 mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_in);
664 auto ev = mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_out);
665 ev.wait();
666
667#endif
668
670
671#else
672
673 auto* p = csrt.row_offset;
674
675 ParallelForOMP(ncols+1, [=] AMREX_GPU_DEVICE (Long i) { p[i] = 0; });
676
677 // nonzeros per column
678#ifdef AMREX_USE_OMP
679#pragma omp parallel for
680#endif
681 for (Long i = 0; i < nnz; ++i) {
682 auto col = csr.col_index[i];
683#ifdef AMREX_USE_OMP
684#pragma omp atomic update
685#endif
686 ++p[col+1];
687 }
688
689 // build row_offset for transposed matrix. Also save a copy.
690 Vector<Long> current_pos(ncols+1);
691 current_pos[0] = 0;
692 for (Long i = 0; i < ncols; ++i) {
693 p[i+1] += p[i];
694 current_pos[i+1] = p[i+1];
695 }
696
697 // The following code is not OMP safe. It's difficult to use OMP and
698 // still keep CSR sorted.
699 for (Long i = 0; i < nrows; ++i) {
700 for (Long idx = csr.row_offset[i]; idx < csr.row_offset[i+1]; ++idx) {
701 auto col = csr.col_index[idx];
702 Long dest = current_pos[col]++;
703 csrt.mat[dest] = csr.mat[idx];
704 csrt.col_index[dest] = i;
705 }
706 }
707
708#endif
709}
710}
711
712template <typename T, template<typename> class Allocator>
714 : m_partition(std::move(partition)),
715 m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
716 m_row_end(m_partition[ParallelDescriptor::MyProc()+1])
717{
718 define_doit(nnz_per_row);
719}
720
// NOTE(review): the qualified constructor signature (doxygen listing line 722)
// was lost in extraction; judging from the initializer list it takes an
// AlgPartition and a csr_type by value -- confirm against upstream AMReX.
// m_nnz(csr.nnz) is evaluated before m_csr(std::move(csr)) because members
// are initialized in declaration order (m_nnz precedes m_csr in the class).
721template <typename T, template<typename> class Allocator>
723 : m_partition(std::move(partition)),
724 m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
725 m_row_end(m_partition[ParallelDescriptor::MyProc()+1]),
726 m_nnz(csr.nnz),
727 m_csr(std::move(csr))
728{}
729
730template <typename T, template<typename> class Allocator>
731void SpMatrix<T,Allocator>::define (AlgPartition partition, int nnz_per_row)
732{
733 m_partition = std::move(partition);
734 m_row_begin = m_partition[ParallelDescriptor::MyProc()];
735 m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
736 define_doit(nnz_per_row);
737}
738
739template <typename T, template<typename> class Allocator>
741 CsrSorted is_sorted)
742{
743 m_partition = std::move(partition);
744 m_row_begin = m_partition[ParallelDescriptor::MyProc()];
745 m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
746 m_nnz = csr.nnz;
747 m_csr = std::move(csr);
748 if (! is_sorted) { m_csr.sort(); }
749}
750
751template <typename T, template<typename> class Allocator>
752void
754{
755 if (nnz_per_row <= 0) { return; };
756
757 AMREX_ALWAYS_ASSERT(m_split == false);
758
759 Long nlocalrows = this->numLocalRows();
760 m_nnz = nlocalrows*nnz_per_row;
761 m_csr.mat.resize(m_nnz);
762 m_csr.col_index.resize(m_nnz);
763 m_csr.row_offset.resize(nlocalrows+1);
764 m_csr.nnz = m_nnz;
765
766 auto* poffset = m_csr.row_offset.data();
767 ParallelForOMP(nlocalrows+1, [=] AMREX_GPU_DEVICE (Long lrow) noexcept
768 {
769 poffset[lrow] = lrow*nnz_per_row;
770 });
771}
772
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; the first signature line (listing 775, presumably
// `SpMatrix<T,Allocator>::define (AlgPartition partition, T const* mat,`) and
// the single statement inside the final `if (! synced)` block (listing 815,
// presumably a device-stream synchronization after the async copies -- TODO
// confirm) were dropped.  Code reproduced byte-identical.
773template <typename T, template<typename> class Allocator>
774void
776 Long const* col_index, Long nentries,
777 Long const* row_offset, CsrSorted is_sorted,
778 CsrValid is_valid)
779{
780 AMREX_ALWAYS_ASSERT(m_split == false);
781
782 m_partition = std::move(partition);
783 m_row_begin = m_partition[ParallelDescriptor::MyProc()];
784 m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
785
// Tracks whether a code path below has already synchronized the device
// stream after the async copies from the caller's buffers.
786 bool synced = false;
787
788 if (is_valid) {
// Input is already clean: copy it verbatim into the CSR containers.
789 m_nnz = nentries;
790 Long nlocalrows = this->numLocalRows();
791 m_csr.mat.resize(nentries);
792 m_csr.col_index.resize(nentries);
793 m_csr.row_offset.resize(nlocalrows+1);
794 m_csr.nnz = nentries;
795 Gpu::copyAsync(Gpu::deviceToDevice, mat, mat+nentries, m_csr.mat.begin());
796 Gpu::copyAsync(Gpu::deviceToDevice, col_index, col_index+nentries,
797 m_csr.col_index.begin());
798 Gpu::copyAsync(Gpu::deviceToDevice, row_offset, row_offset+nlocalrows+1,
799 m_csr.row_offset.begin());
800 } else {
// Filter out invalid entries; use 32-bit prefix sums when counts permit.
801 if (nentries < Long(std::numeric_limits<int>::max())) {
802 define_and_filter_doit<int>(mat, col_index, nentries, row_offset);
803 } else {
804 define_and_filter_doit<Long>(mat, col_index, nentries, row_offset);
805 }
806 synced = true;
807 }
808
809 if (! is_sorted) {
810 m_csr.sort();
811 synced = true;
812 }
813
814 if (! synced) {
816 }
817}
818
819template <typename T, template<typename> class Allocator>
820void
822{
823 m_csr.sort();
824}
825
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; the qualified signature (listing 829), the trailing arguments of the
// Scan::PrefixSum call (listing 839, presumably the scan type / return-sum
// tags), and listing 864 (presumably a stream synchronization before the
// caller's buffers may be reused) were dropped.  Code reproduced byte-identical.
// Filters the raw input, keeping only entries with a nonnegative column index
// and a nonzero value, and builds compacted CSR arrays via an exclusive
// prefix sum over the keep-predicate.  I is the prefix-sum integer type.
826template <typename T, template<typename> class Allocator>
827template <typename I>
828void
830 Long nentries, Long const* row_offset)
831{
832 Gpu::DeviceVector<I> psum(nentries);
833 auto* ps = psum.data();
// ps[i] = number of kept entries before position i; m_nnz = total kept.
834 m_nnz = Scan::PrefixSum<I>(I(nentries),
835 [=] AMREX_GPU_DEVICE (I i) -> I {
836 return col_index[i] >= 0 && mat[i] != 0; },
837 [=] AMREX_GPU_DEVICE (I i, I x) {
838 ps[i] = x; },
840 Long nlocalrows = this->numLocalRows();
841 m_csr.mat.resize(m_nnz);
842 m_csr.col_index.resize(m_nnz);
843 m_csr.row_offset.resize(nlocalrows+1);
844 m_csr.nnz = m_nnz;
845 auto* pmat = m_csr.mat.data();
846 auto* pcol = m_csr.col_index.data();
847 auto* prow = m_csr.row_offset.data();
848 auto actual_nnz = m_nnz;
// Scatter kept entries to their compacted slots and remap row offsets.
849 ParallelFor(std::max(nentries,nlocalrows), [=] AMREX_GPU_DEVICE (Long i)
850 {
851 if (i < nentries) {
852 if (col_index[i] >= 0 && mat[i] != 0) {
853 pmat[ps[i]] = mat[i];
854 pcol[ps[i]] = col_index[i];
855 }
856 }
857 if (i < nlocalrows) {
858 prow[i] = ps[row_offset[i]];
859 if (i == nlocalrows - 1) {
860 prow[nlocalrows] = actual_nnz;
861 }
862 }
863 });
865}
866
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; listing lines 872-873, 876, 883 and 891 were dropped.  In GPU builds
// those lines presumably declared the host copies `csr` and `csr_r` (via
// duplicateCSR), the copyAsync call whose argument list survives at 884-886,
// and a stream synchronization -- TODO confirm against upstream.  Code is
// reproduced byte-identical.  Writes this rank's rows to `{file}.{rank}`.
867template <typename T, template<typename> class Allocator>
868void
869SpMatrix<T,Allocator>::printToFile (std::string const& file) const
870{
871#ifdef AMREX_USE_GPU
874
875# ifdef AMREX_USE_MPI
877 if (m_split) {
878 amrex::duplicateCSR(Gpu::deviceToHost, csr_r, m_csr_remote);
879 }
880
881 Gpu::PinnedVector<Long> ri_ltor(m_ri_ltor.size());
882 if (m_split) {
884 m_ri_ltor.begin(),
885 m_ri_ltor.end(),
886 ri_ltor.begin());
887 }
888 auto const& remote_cols = m_remote_cols_v;
889# endif
890
892
893#else
894
895 auto const& csr = m_csr;
896# ifdef AMREX_USE_MPI
897 auto const& csr_r = m_csr_remote;
898 auto const& ri_ltor = m_ri_ltor;
899 auto const& remote_cols = m_remote_cols_v;
900# endif
901
902#endif
903
// Total local nonzeros = diagonal part plus (with MPI) off-diagonal part.
904 Long nnz = m_csr.nnz;
905#ifdef AMREX_USE_MPI
906 nnz += m_csr_remote.nnz;
907#endif
908
909 std::ofstream ofs(file+"."+std::to_string(ParallelDescriptor::MyProc()));
910 ofs << m_row_begin << " " << m_row_end << " " << nnz << "\n";
911 for (Long i = 0, nrows = numLocalRows(); i < nrows; ++i) {
912 Long nnz_row = csr.row_offset[i+1] - csr.row_offset[i];
913 T const* mat = csr.mat.data() + csr.row_offset[i];
914 Long const* col = csr.col_index.data() + csr.row_offset[i];
915 for (Long j = 0; j < nnz_row; ++j) {
916 ofs << i+m_row_begin << " " << col[j]+m_col_begin << " " << mat[j] << "\n";
917 }
918#ifdef AMREX_USE_MPI
// Off-diagonal entries for this row, if it has any (ri_ltor maps local
// row -> row in the remote CSR; -1 means no remote entries).
919 if (i < Long(ri_ltor.size()) && ri_ltor[i] >= 0) {
920 Long ii = ri_ltor[i];
921 nnz_row = csr_r.row_offset[ii+1] - csr_r.row_offset[ii];
922 mat = csr_r.mat.data() + csr_r.row_offset[ii];
923 col = csr_r.col_index.data() + csr_r.row_offset[ii];
924 for (Long j = 0; j < nnz_row; ++j) {
925 ofs << i+m_row_begin << " " << remote_cols[col[j]] << " " << mat[j] << "\n";
926 }
927 }
928#endif
929 }
930}
931
932template <typename T, template<typename> class Allocator>
933template <typename F>
934void SpMatrix<T,Allocator>::setVal (F const& f, CsrSorted is_sorted)
935{
936 // xxxxx TODO: We can try to optimize this later by using shared memory.
937
938 AMREX_ALWAYS_ASSERT(m_split == false);
939
940 Long nlocalrows = this->numLocalRows();
941 Long rowbegin = this->globalRowBegin();
942 auto* pmat = m_csr.mat.data();
943 auto* pcolindex = m_csr.col_index.data();
944 auto* prowoffset = m_csr.row_offset.data();
945 ParallelForOMP(nlocalrows, [=] AMREX_GPU_DEVICE (int lrow) noexcept
946 {
947 f(rowbegin+lrow, pcolindex+prowoffset[lrow], pmat+prowoffset[lrow]);
948 });
949
950 if (! is_sorted) { m_csr.sort(); }
951}
952
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; the qualified signature (listing 954, returning
// `AlgVector<T,AllocT> const&` per the in-class declaration) and the
// ParallelForOMP launch line (listing 964) that opens the lambda body at the
// `{` below were dropped.  Code reproduced byte-identical.
// Lazily computes and caches the diagonal of the (assumed square) matrix.
953template <typename T, template<typename> class Allocator>
955{
956 if (m_diagonal.empty()) {
957 m_diagonal.define(this->partition());
958 auto* AMREX_RESTRICT p = m_diagonal.data();
959 auto const* AMREX_RESTRICT mat = m_csr.mat.data();
960 auto const* AMREX_RESTRICT col = m_csr.col_index.data();
961 auto const* AMREX_RESTRICT row = m_csr.row_offset.data();
// Column indices are local after the split, global before it.
962 auto offset = m_split ? Long(0) : m_row_begin; // assuming square matrix
963 Long nrows = this->numLocalRows();
965 {
// Rows with no stored diagonal entry yield 0.
966 T d = 0;
967 for (Long j = row[i]; j < row[i+1]; ++j) {
968 if (i == col[j] - offset) {
969 d = mat[j];
970 break;
971 }
972 }
973 p[i] = d;
974 });
975 }
976 return m_diagonal;
977}
978
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; the qualified signature (listing 980, returning
// `AlgVector<T,AllocT>` per the in-class declaration) and the ParallelForOMP
// launch line (listing 985) that opens the lambda body at the `{` below were
// dropped.  Code reproduced byte-identical.
// Returns a vector whose i-th entry is the sum of row i, including both the
// diagonal (csr0) and, when present, off-diagonal (csr1) parts.
979template <typename T, template<typename> class Allocator>
981{
982 AlgVector<T,Allocator<T>> r(this->partition());
983 auto* p = r.data();
984 auto const& a = this->const_parcsr();
986 {
987 T s = 0;
988 for (auto idx = a.csr0.row_offset[i];
989 idx < a.csr0.row_offset[i+1]; ++idx) {
990 s += a.csr0.mat[idx];
991 }
// csr1 is empty in serial runs or before the matrix has been split.
992 if (a.csr1.nnz > 0) {
993 for (auto idx = a.csr1.row_offset[i];
994 idx < a.csr1.row_offset[i+1]; ++idx) {
995 s += a.csr1.mat[idx];
996 }
997 }
998 p[i] = s;
999 });
1000 return r;
1001}
1002
1003template <typename T, template<typename> class Allocator>
1005{
1006 return ParCsr<T>{m_csr.view(),
1007#ifdef AMREX_USE_MPI
1008 m_csr_remote.view(),
1009#else
1010 CsrView<T>{},
1011#endif
1012 m_row_begin,
1013 m_col_begin,
1014#ifdef AMREX_USE_MPI
1015 m_ri_ltor.data(),
1016# ifdef AMREX_USE_GPU
1017 m_remote_cols_dv.data()
1018# else
1019 m_remote_cols_v.data()
1020# endif
1021#else
1022 nullptr, nullptr
1023#endif
1024 };
1025}
1026
1027template <typename T, template<typename> class Allocator>
1029{
1030 using U = T const;
1031 return ParCsr<U>{m_csr.const_view(),
1032#ifdef AMREX_USE_MPI
1033 m_csr_remote.const_view(),
1034#else
1035 CsrView<U>{},
1036#endif
1037 m_row_begin,
1038 m_col_begin,
1039#ifdef AMREX_USE_MPI
1040 m_ri_ltor.data(),
1041# ifdef AMREX_USE_GPU
1042 m_remote_cols_dv.data()
1043# else
1044 m_remote_cols_v.data()
1045# endif
1046#else
1047 nullptr, nullptr
1048#endif
1049 };
1050}
1051
1052template <typename T, template<typename> class Allocator>
1054{
1055 return this->const_parcsr();
1056}
1057
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; dropped lines: the qualified signature (listing 1059, taking the
// vector x per the in-class declaration of startComm_mv), listing 1062 in the
// non-MPI branch (presumably amrex::ignore_unused(x)), and listing 1092 after
// pack_buffer_mv (presumably a stream synchronization before MPI_Isend --
// TODO confirm).  Code reproduced byte-identical.
// Posts the nonblocking receives and sends that exchange the halo values of x
// needed by the off-diagonal part of the matrix-vector product.
1058template <typename T, template<typename> class Allocator>
1060{
1061#ifndef AMREX_USE_MPI
1063#else
1064 if (this->partition().numActiveProcs() <= 1) { return; }
1065
1066 this->prepare_comm_mv(x.partition());
1067
1068 auto const mpi_tag = ParallelDescriptor::SeqNum();
1069 auto const mpi_t_type = ParallelDescriptor::Mpi_typemap<T>::type(); // NOLINT(readability-qualified-auto)
1070 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1071
// Post all receives first, each segment of the single receive buffer going
// to one neighbor.
1072 auto const nrecvs = int(m_comm_mv.recv_from.size());
1073 if (nrecvs > 0) {
1074 m_comm_mv.recv_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_comm_mv.total_counts_recv);
1075 m_comm_mv.recv_reqs.resize(nrecvs, MPI_REQUEST_NULL);
1076 auto* p_recv = m_comm_mv.recv_buffer;
1077 for (int irecv = 0; irecv < nrecvs; ++irecv) {
1078 BL_MPI_REQUIRE(MPI_Irecv(p_recv,
1079 m_comm_mv.recv_counts[irecv], mpi_t_type,
1080 m_comm_mv.recv_from[irecv], mpi_tag, mpi_comm,
1081 &(m_comm_mv.recv_reqs[irecv])));
1082 p_recv += m_comm_mv.recv_counts[irecv];
1083 }
1084 AMREX_ASSERT(p_recv == m_comm_mv.recv_buffer + m_comm_mv.total_counts_recv);
1085 }
1086
1087 auto const nsends = int(m_comm_mv.send_to.size());
1088 if (nsends > 0) {
1089 m_comm_mv.send_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_comm_mv.total_counts_send);
1090
1091 pack_buffer_mv(x);
1093
1094 m_comm_mv.send_reqs.resize(nsends, MPI_REQUEST_NULL);
1095 auto* p_send = m_comm_mv.send_buffer;
1096 for (int isend = 0; isend < nsends; ++isend) {
1097 auto count = m_comm_mv.send_counts[isend];
1098 BL_MPI_REQUIRE(MPI_Isend(p_send, count, mpi_t_type, m_comm_mv.send_to[isend],
1099 mpi_tag, mpi_comm, &(m_comm_mv.send_reqs[isend])));
1100 p_send += count;
1101 }
1102 AMREX_ASSERT(p_send == m_comm_mv.send_buffer + m_comm_mv.total_counts_send);
1103 }
1104#endif
1105}
1106
// NOTE(review): extraction artifact -- fused doxygen listing digits on every
// line; dropped lines: the qualified signature (listing 1108; the body reads
// a vector `y` passed to unpack_buffer_mv, so presumably
// `finishComm_mv (AlgVector<T,AllocT>& y)` -- confirm against upstream),
// listing 1113 in the non-MPI branch (presumably amrex::ignore_unused(y)),
// and listing 1133 before the frees (presumably a stream synchronization).
// Completes the halo exchange started by startComm_mv: waits on the
// receives, unpacks into y, waits on the sends, then releases the buffers.
1107template <typename T, template<typename> class Allocator>
1109{
1110 if (this->numLocalRows() == 0) { return; }
1111
1112#ifndef AMREX_USE_MPI
1114#else
1115 if (this->partition().numActiveProcs() <= 1) { return; }
1116
1117 if ( ! m_comm_mv.recv_reqs.empty()) {
1118 Vector<MPI_Status> mpi_statuses(m_comm_mv.recv_reqs.size());
1119 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_mv.recv_reqs.size()),
1120 m_comm_mv.recv_reqs.data(),
1121 mpi_statuses.data()));
1122 }
1123
1124 unpack_buffer_mv(y);
1125
1126 if ( ! m_comm_mv.send_reqs.empty()) {
1127 Vector<MPI_Status> mpi_statuses(m_comm_mv.send_reqs.size());
1128 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_mv.send_reqs.size()),
1129 m_comm_mv.send_reqs.data(),
1130 mpi_statuses.data()));
1131 }
1132
1134 The_Comms_Arena()->free(m_comm_mv.send_buffer);
1135 The_Comms_Arena()->free(m_comm_mv.recv_buffer);
1136 m_comm_mv.send_reqs.clear();
1137 m_comm_mv.recv_reqs.clear();
1138#endif
1139}
1140
1141template <typename T, template<typename> class Allocator>
1143{
1144#ifdef AMREX_USE_MPI
1145 if (this->partition().numActiveProcs() <= 1) { return; }
1146
1147 this->split_csr(col_partition);
1148
1149 int const nprocs = ParallelContext::NProcsSub();
1150 auto const mpi_tag = ParallelDescriptor::SeqNum();
1151 auto const mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type(); // NOLINT(readability-qualified-auto)
1152 auto const mpi_t = ParallelDescriptor::Mpi_typemap<T>::type(); // NOLINT(readability-qualified-auto)
1153 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1154
1155 // transpose the off-diagonal part
1156 if (m_csr_remote.nnz > 0) {
1157 m_comm_tr.csrt.nnz = m_csr_remote.nnz;
1158 m_comm_tr.csrt.nrows = m_remote_cols_v.size();
1159 m_comm_tr.csrt.mat = (T*)The_Comms_Arena()->alloc
1160 (sizeof(T)*m_comm_tr.csrt.nnz);
1161 m_comm_tr.csrt.col_index = (Long*)The_Comms_Arena()->alloc
1162 (sizeof(Long)*m_comm_tr.csrt.nnz);
1163 m_comm_tr.csrt.row_offset = (Long*)The_Comms_Arena()->alloc
1164 (sizeof(Long)*(m_comm_tr.csrt.nrows+1));
1165#ifdef AMREX_USE_GPU
1166 csr_type csr_comm;
1167 csr_comm.resize(m_comm_tr.csrt.nrows, m_comm_tr.csrt.nnz);
1168 auto const& csrv_comm = csr_comm.view();
1169#else
1170 auto const& csrv_comm = m_comm_tr.csrt;
1171#endif
1172 detail::transpose(csrv_comm, m_csr_remote.const_view());
1173 auto row_begin = m_row_begin;
1174 auto ri_rtol = m_ri_rtol.data();
1175 auto* col_index = csrv_comm.col_index;
1176 ParallelForOMP(csrv_comm.nnz, [=] AMREX_GPU_DEVICE (Long idx)
1177 {
1178 auto gjt =ri_rtol[col_index[idx]] + row_begin;
1179 col_index[idx] = gjt; // global index
1180 });
1181#ifdef AMREX_USE_GPU
1183 csrv_comm. mat,
1184 csrv_comm. mat + csrv_comm.nnz,
1185 m_comm_tr.csrt.mat);
1187 csrv_comm. col_index,
1188 csrv_comm. col_index + csrv_comm.nnz,
1189 m_comm_tr.csrt.col_index);
1191 csrv_comm. row_offset,
1192 csrv_comm. row_offset + csrv_comm.nrows+1,
1193 m_comm_tr.csrt.row_offset);
1195#endif
1196 }
1197
1198 if (m_num_neighbors < 0) { set_num_neighbors(); }
1199
1200 // As a sender, I need to let other processes know that how many
1201 // elements I will send them.
1202
1203 Vector<MPI_Request> mpi_requests;
1204 mpi_requests.reserve(nprocs);
1205 if (m_csr_remote.nnz > 0) {
1206 Long it = 0;
1207 for (int iproc = 0; iproc < nprocs; ++iproc) {
1208 Long n = 0;
1209 for (Long i = 0; i < Long(m_remote_cols_vv[iproc].size()); ++i) {
1210 n += m_comm_tr.csrt.row_offset[it+1] - m_comm_tr.csrt.row_offset[it];
1211 ++it;
1212 }
1213 if (n > 0) {
1214 mpi_requests.push_back(MPI_REQUEST_NULL);
1215 AMREX_ALWAYS_ASSERT(n < std::numeric_limits<int>::max());
1216 std::array<int,2> nn{int(n), int(m_remote_cols_vv[iproc].size())};
1217 BL_MPI_REQUIRE(MPI_Isend(nn.data(), 2, MPI_INT, iproc, mpi_tag,
1218 mpi_comm, &(mpi_requests.back())));
1219 m_comm_tr.send_to.push_back(iproc);
1220 m_comm_tr.send_counts.push_back(nn);
1221 }
1222 }
1223 }
1224
1225 // As a receiver, m_num_neighbors is the number of processes from which
1226 // I will receive data.
1227
1228 for (int irecv = 0; irecv < m_num_neighbors; ++irecv) {
1229 MPI_Status mpi_status;
1230 BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
1231 int sender = mpi_status.MPI_SOURCE;
1232 std::array<int,2> nn;
1233 BL_MPI_REQUIRE(MPI_Recv(nn.data(), 2, MPI_INT, sender, mpi_tag,
1234 mpi_comm, &mpi_status));
1235 m_comm_tr.recv_from.push_back(sender);
1236 m_comm_tr.recv_counts.push_back(nn);
1237 m_comm_tr.total_counts_recv[0] += nn[0];
1238 m_comm_tr.total_counts_recv[1] += nn[1];
1239 }
1240
1241 if (! mpi_requests.empty()) {
1242 Vector<MPI_Status> mpi_statuses(mpi_requests.size());
1243 BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
1244 mpi_statuses.data()));
1245 }
1246
1247 auto const mpi_tag_m = ParallelDescriptor::SeqNum();
1248 auto const mpi_tag_c = ParallelDescriptor::SeqNum();
1249 auto const mpi_tag_r = ParallelDescriptor::SeqNum();
1250 auto const mpi_tag_p = ParallelDescriptor::SeqNum();
1251
1252 // We need to send m_comm_tr.csrt.mat, col_index & row_offset. We also
1253 // need to send m_remote_cols_vv, which maps row index (in transposed
1254 // matrix) from local to global.
1255
1256 auto const nrecvs = int(m_comm_tr.recv_from.size());
1257 if (nrecvs > 0) {
1258 m_comm_tr.recv_buffer_mat = (T*) The_Pinned_Arena()->alloc
1259 (sizeof(T) * m_comm_tr.total_counts_recv[0]);
1260 m_comm_tr.recv_buffer_col_index = (Long*) The_Pinned_Arena()->alloc
1261 (sizeof(Long) * m_comm_tr.total_counts_recv[0]);
1262 m_comm_tr.recv_buffer_row_offset = (Long*) The_Pinned_Arena()->alloc
1263 (sizeof(Long) * (m_comm_tr.total_counts_recv[1]+nrecvs));
1264 m_comm_tr.recv_buffer_idx_map = (Long*) The_Pinned_Arena()->alloc
1265 (sizeof(Long) * m_comm_tr.total_counts_recv[1]);
1266 m_comm_tr.recv_buffer_offset.push_back({0,0,0,0});
1267 m_comm_tr.recv_reqs.resize(4*nrecvs, MPI_REQUEST_NULL);
1268 for (int irecv = 0; irecv < nrecvs; ++irecv) {
1269 auto [os0, os1, os2, os3] = m_comm_tr.recv_buffer_offset.back();
1270 auto [n0, n1] = m_comm_tr.recv_counts[irecv];
1271 auto recv_from_rank = m_comm_tr.recv_from[irecv];
1272 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_mat + os0,
1273 n0,
1274 mpi_t,
1275 recv_from_rank,
1276 mpi_tag_m,
1277 mpi_comm,
1278 &(m_comm_tr.recv_reqs[irecv*4])));
1279 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_col_index + os1,
1280 n0,
1281 mpi_long,
1282 recv_from_rank,
1283 mpi_tag_c,
1284 mpi_comm,
1285 &(m_comm_tr.recv_reqs[irecv*4+1])));
1286 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_row_offset + os2,
1287 n1+1,
1288 mpi_long,
1289 recv_from_rank,
1290 mpi_tag_r,
1291 mpi_comm,
1292 &(m_comm_tr.recv_reqs[irecv*4+2])));
1293 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_idx_map + os3,
1294 n1,
1295 mpi_long,
1296 recv_from_rank,
1297 mpi_tag_p,
1298 mpi_comm,
1299 &(m_comm_tr.recv_reqs[irecv*4+3])));
1300 m_comm_tr.recv_buffer_offset.push_back({os0 + n0,
1301 os1 + n0,
1302 os2 + n1+1,
1303 os3 + n1});
1304 }
1305 }
1306
1307 auto const nsends = int(m_comm_tr.send_to.size());
1308 if (nsends > 0) {
1309 m_comm_tr.send_reqs.resize(4*nsends, MPI_REQUEST_NULL);
1310 Long os0 = 0, os1 = 0;
1311 for (int isend = 0; isend < nsends; ++isend) {
1312 auto [n0, n1] = m_comm_tr.send_counts[isend];
1313 auto send_to_rank = m_comm_tr.send_to[isend];
1314 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.mat + os0,
1315 n0,
1316 mpi_t,
1317 send_to_rank,
1318 mpi_tag_m,
1319 mpi_comm,
1320 &(m_comm_tr.send_reqs[isend*4])));
1321 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.col_index + os0,
1322 n0,
1323 mpi_long,
1324 send_to_rank,
1325 mpi_tag_c,
1326 mpi_comm,
1327 &(m_comm_tr.send_reqs[isend*4+1])));
1328 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.row_offset + os1,
1329 n1+1,
1330 mpi_long,
1331 send_to_rank,
1332 mpi_tag_r,
1333 mpi_comm,
1334 &(m_comm_tr.send_reqs[isend*4+2])));
1335 BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[send_to_rank].data(),
1336 n1,
1337 mpi_long,
1338 send_to_rank,
1339 mpi_tag_p,
1340 mpi_comm,
1341 &(m_comm_tr.send_reqs[isend*4+3])));
1342 os0 += n0;
1343 os1 += n1;
1344 }
1345 }
1346#else
1347 amrex::ignore_unused(col_partition);
1348#endif
1349}
1350
1351template <typename T, template<typename> class Allocator>
// Complete the transpose communication started by startComm_tr(): wait for
// all posted receives, let the destination matrix AT assemble itself from the
// received blocked-CSR buffers, wait for the sends, then release every
// communication buffer and reset the communication state.
// NOTE(review): the function signature line is missing from this extracted
// listing; the cross-reference index gives it as
//   void SpMatrix<T,Allocator>::finishComm_tr(SpMatrix<T,Allocator>& AT)
// — confirm against the original header.
1353{
1354#ifdef AMREX_USE_MPI
    // Single active rank: startComm_tr did no MPI work, nothing to finish.
1355 if (this->partition().numActiveProcs() <= 1) { return; }
1356
    // Receives must complete before unpack_buffer_tr may read the buffers.
1357 if (! m_comm_tr.recv_reqs.empty()) {
1358 Vector<MPI_Status> mpi_statuses(m_comm_tr.recv_reqs.size());
1359 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_tr.recv_reqs.size()),
1360 m_comm_tr.recv_reqs.data(),
1361 mpi_statuses.data()));
1362 }
1363
    // AT merges the received pieces into its own remote CSR part.
1364 AT.unpack_buffer_tr(m_comm_tr, this->m_partition);
1365
    // Sends must also complete before their source buffers are freed below.
1366 if (! m_comm_tr.send_reqs.empty()) {
1367 Vector<MPI_Status> mpi_statuses(m_comm_tr.send_reqs.size());
1368 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_tr.send_reqs.size()),
1369 m_comm_tr.send_reqs.data(),
1370 mpi_statuses.data()));
1371 }
1372
    // Send-side transposed CSR was allocated from The_Comms_Arena.
1373 if (m_comm_tr.csrt.nnz > 0) {
1374 The_Comms_Arena()->free(m_comm_tr.csrt.mat);
1375 The_Comms_Arena()->free(m_comm_tr.csrt.col_index);
1376 The_Comms_Arena()->free(m_comm_tr.csrt.row_offset);
1377 }
    // Receive buffers were allocated from The_Pinned_Arena in startComm_tr.
1378 if (m_comm_tr.recv_buffer_mat) {
1379 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_mat);
1380 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_col_index);
1381 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_row_offset);
1382 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_idx_map);
1383 }
    // Reset to a default-constructed state so the object can be reused.
1384 m_comm_tr = CommTR{};
1385#else
    // NOTE(review): a line appears to be dropped here in the extraction
    // (presumably amrex::ignore_unused(AT); for the non-MPI build) — verify.
1387#endif
1388}
1389
1390#ifdef AMREX_USE_MPI
1391
1392template <typename T, template<typename> class Allocator>
// Split the local CSR matrix m_csr into a "diagonal" block whose column
// indices fall inside this rank's column range [m_col_begin, m_col_end)
// (kept in m_csr, with columns shifted to local zero-based indices) and a
// "remote" block m_csr_remote holding all other entries. Remote rows without
// nonzeros are trimmed, and the local<->remote row maps m_ri_ltor/m_ri_rtol
// are built. Finally update_remote_col_index() localizes remote columns.
// NOTE(review): this extracted listing drops several lines — the signature
// (Doxygen line 1393), the AMREX_ALWAYS_ASSERT opener before line 1397, and
// the trailing Scan::PrefixSum arguments (lines 1427/1436, 1523/1549) —
// verify against the original header before editing.
1394{
    // Already split: just check the requested column partition matches.
1395 if (m_split) {
1397 (m_col_begin == col_partition[ParallelDescriptor::MyProc()] &&
1398 m_col_end == col_partition[ParallelDescriptor::MyProc()+1]);
1399 return;
1400 }
1401
1402 AMREX_ALWAYS_ASSERT(m_col_partition.empty());
1403
1404 m_col_partition = col_partition;
1405 m_col_begin = col_partition[ParallelDescriptor::MyProc()];
1406 m_col_end = col_partition[ParallelDescriptor::MyProc()+1];
1407
1408 // This function needs to be safe when nnz is zero.
1409
1410 // We need to split the matrix into two parts, a diagonal part for pure
1411 // local operations and another part for remote operations in
1412 // matrix-vector or matrix-matrix multiplication.
1413
    // pfsum[i] = number of "local" entries strictly before entry i; used to
    // compute destination positions for the stable local/remote partition.
1414 Long local_nnz;
1415 Gpu::DeviceVector<Long> pfsum(m_nnz);
1416 auto* p_pfsum = pfsum.data();
1417 auto col_begin = m_col_begin;
1418 auto col_end = m_col_end;
    // Use an int scan when counts fit in int (cheaper), else a Long scan.
1419 if (m_csr.nnz < Long(std::numeric_limits<int>::max())) {
1420 auto const* pcol = m_csr.col_index.data();
1421 local_nnz = Scan::PrefixSum<int>(int(m_nnz),
1422 [=] AMREX_GPU_DEVICE (int i) -> int {
1423 return (pcol[i] >= col_begin &&
1424 pcol[i] < col_end); },
1425 [=] AMREX_GPU_DEVICE (int i, int const& x) {
1426 p_pfsum[i] = x; },
1428 } else {
1429 auto const* pcol = m_csr.col_index.data();
1430 local_nnz = Scan::PrefixSum<Long>(m_nnz,
1431 [=] AMREX_GPU_DEVICE (Long i) -> Long {
1432 return (pcol[i] >= col_begin &&
1433 pcol[i] < col_end); },
1434 [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
1435 p_pfsum[i] = x; },
1437 }
1438
1439 m_csr.nnz = local_nnz;
1440 Long remote_nnz = m_nnz - local_nnz;
1441 m_csr_remote.nnz = remote_nnz;
1442
    // Only do the expensive scatter if there actually are remote entries.
1443 if (local_nnz != m_nnz) {
1444 m_csr_remote.mat.resize(remote_nnz);
1445 m_csr_remote.col_index.resize(remote_nnz);
1446 container_type<T> new_mat(local_nnz);
1447 container_type<Long> new_col(local_nnz);
1448 auto const* pmat = m_csr.mat.data();
1449 auto const* pcol = m_csr.col_index.data();
1450 auto* pmat_l = new_mat.data();
1451 auto* pcol_l = new_col.data();
1452 auto* pmat_r = m_csr_remote.mat.data();
1453 auto* pcol_r = m_csr_remote.col_index.data();
        // Stable partition of entries into local (new_mat/new_col) and
        // remote (m_csr_remote) using the prefix sums as destinations.
1454 ParallelForOMP(m_nnz, [=] AMREX_GPU_DEVICE (Long i)
1455 {
1456 auto ps = p_pfsum[i];
1457 auto local = (pcol[i] >= col_begin &&
1458 pcol[i] < col_end);
1459 if (local) {
1460 pmat_l[ps] = pmat[i];
1461 pcol_l[ps] = pcol[i] - col_begin; // shift the column index to local
1462 } else {
1463 pmat_r[i-ps] = pmat[i];
1464 pcol_r[i-ps] = pcol[i];
1465 }
1466 });
        // Rewrite row offsets: pro becomes offsets into the local part,
        // pro_r offsets into the remote part.
1467 auto noffset = Long(m_csr.row_offset.size());
1468 auto* pro = m_csr.row_offset.data();
1469 m_csr_remote.row_offset.resize(noffset);
1470 auto* pro_r = m_csr_remote.row_offset.data();
1471 ParallelForOMP(noffset, [=] AMREX_GPU_DEVICE (Long i)
1472 {
1473 if (i < noffset-1) {
1474 auto ro_l = p_pfsum[pro[i]];
1475 pro_r[i] = pro[i] - ro_l;
1476 pro[i] = ro_l;
1477 } else {
1478 pro[i] = local_nnz;
1479 pro_r[i] = remote_nnz;
1480 }
1481 });
        // NOTE(review): a line (likely Gpu::streamSynchronize();) seems to be
        // dropped here (Doxygen line 1482) — the swaps below free the old
        // buffers the kernels above read from. Verify against the original.
1483 m_csr.mat.swap(new_mat);
1484 m_csr.col_index.swap(new_col);
1485
1486 // In the remote part, it's expected that some rows don't have
1487 // nonzeros. So we trim them off.
1488 {
1489 Long old_size = m_csr_remote.row_offset.size();
1490 m_ri_ltor.resize(old_size-1);
1491 m_ri_rtol.resize(old_size-1);
1492 auto* p_ltor = m_ri_ltor.data();
1493 auto* p_rtol = m_ri_rtol.data();
1494 container_type<Long> trimmed_row_offset(old_size);
1495 auto const* p_ro = m_csr_remote.row_offset.data();
1496 auto* p_tro = trimmed_row_offset.data();
1497 Long new_size;
1498 if (old_size < Long(std::numeric_limits<int>::max())) {
1499 // This is basically std::unique.
1500 new_size = Scan::PrefixSum<int>(int(old_size),
1501 [=] AMREX_GPU_DEVICE (int i) -> int {
1502 if (i+1 < old_size) {
1503 return (p_ro[i+1] > p_ro[i]);
1504 } else {
1505 return 1;
1506 }
1507 },
1508 [=] AMREX_GPU_DEVICE (int i, int const& x) {
1509 if (i == 0) {
1510 p_tro[0] = 0;
1511 } else if (p_ro[i] > p_ro[i-1]) {
1512 p_tro[x] = p_ro[i];
1513 }
1514 if (i+1 < old_size) {
1515 if (p_ro[i+1] > p_ro[i]) {
1516 p_rtol[x] = i;
1517 p_ltor[i] = x;
1518 } else {
1519 p_ltor[i] = -1;
1520 }
1521 }
1522 },
1524 } else {
1525 // This is basically std::unique.
1526 new_size = Scan::PrefixSum<Long>(old_size,
1527 [=] AMREX_GPU_DEVICE (Long i) -> Long {
1528 if (i+1 < old_size) {
1529 return (p_ro[i+1] > p_ro[i]);
1530 } else {
1531 return 1;
1532 }
1533 },
1534 [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
1535 if (i == 0) {
1536 p_tro[0] = 0;
1537 } else if (p_ro[i] > p_ro[i-1]) {
1538 p_tro[x] = p_ro[i];
1539 }
1540 if (i+1 < old_size) {
1541 if (p_ro[i+1] > p_ro[i]) {
1542 p_rtol[x] = i;
1543 p_ltor[i] = x;
1544 } else {
1545 p_ltor[i] = -1;
1546 }
1547 }
1548 },
1550 }
1551
            // Shrink to the trimmed (unique) row count.
1552 m_ri_rtol.resize(new_size-1);
1553 trimmed_row_offset.resize(new_size);
1554#ifdef AMREX_USE_GPU
1555 m_ri_rtol.shrink_to_fit();
1556 trimmed_row_offset.shrink_to_fit();
1557#endif
1558 m_csr_remote.row_offset.swap(trimmed_row_offset);
1559 }
1560
    // No remote entries: only need to localize column indices (if nonzero
    // column offset).
1561 } else if (col_begin > 0) {
1562 auto* pcol = m_csr.col_index.data();
1563 ParallelForOMP(m_nnz, [=] AMREX_GPU_DEVICE (Long i) { pcol[i] -= col_begin; });
1564 }
1565
    // Remote columns are still global here; convert to local and fill
    // m_remote_cols_* bookkeeping.
1566 update_remote_col_index(m_csr_remote, true);
1567
1568 m_split = true;
1569}
1570
1571template <typename T, template<typename> class Allocator>
1572template <typename C>
// Convert the (global) column indices of the remote CSR block csrr into
// local indices into the compact list of unique remote columns, and rebuild
// the m_remote_cols_* bookkeeping:
//   m_remote_cols_v  : sorted unique global column indices referenced remotely
//   m_remote_cols_vv : the same, grouped by owning rank (per m_col_partition)
//   m_remote_cols_dv : device copy of m_remote_cols_v (GPU builds)
// in_device_memory says whether csrr's buffers live in device memory.
// NOTE(review): several Gpu::copyAsync(...)/Gpu::streamSynchronize() opener
// lines are dropped from this extracted listing (Doxygen lines 1593/1597,
// 1609, 1639/1643/1647/1651) — verify against the original header.
1573void SpMatrix<T,Allocator>::update_remote_col_index (C& csrr, bool in_device_memory)
1574{
1575 int const nprocs = ParallelContext::NProcsSub();
1576
1577 // This function also needs to update m_remote_cols_*.
1578
    // Reset bookkeeping; resize per-rank vector even when nnz == 0.
1579 m_remote_cols_v.clear();
1580 m_remote_cols_vv.clear();
1581 m_remote_cols_vv.resize(nprocs);
1582#ifdef AMREX_USE_GPU
1583 m_remote_cols_dv.clear();
1584#endif
1585
1586 if (csrr.nnz == 0) { return; }
1587
1588 amrex::ignore_unused(in_device_memory);
1589
    // Gather all remote column indices into a host vector.
1590#ifdef AMREX_USE_GPU
1591 if (in_device_memory) {
1592 m_remote_cols_v.resize(csrr.nnz);
1594 csrr.col_index.begin(),
1595 csrr.col_index.end(),
1596 m_remote_cols_v.begin());
1598 } else
1599#endif
1600 {
1601 m_remote_cols_v.assign(csrr.col_index.begin(),
1602 csrr.col_index.end());
1603 }
1604
1605 amrex::RemoveDuplicates(m_remote_cols_v);
1606
1607#ifdef AMREX_USE_GPU
1608 m_remote_cols_dv.resize(m_remote_cols_v.size());
1610 m_remote_cols_v.begin(),
1611 m_remote_cols_v.end(),
1612 m_remote_cols_dv.data());
1613#endif
1614
1615 // Note that amrex::RemoveDuplicates sorts the data.
    // Group the sorted unique columns by owning rank. Because both the
    // columns and the partition boundaries are sorted, the search iterator
    // `it` only ever advances.
1616 auto const& cp = this->m_col_partition.dataVector();
1617 AMREX_ALWAYS_ASSERT(m_remote_cols_v.front() >= cp.front() &&
1618 m_remote_cols_v.back() < cp.back());
1619 auto it = cp.cbegin();
1620 for (auto c : m_remote_cols_v) {
1621 it = std::find_if(it, cp.cend(), [&] (auto x) { return x > c; });
1622 if (it != cp.cend()) {
1623 int iproc = int(std::distance(cp.cbegin(),it)) - 1;
1624 m_remote_cols_vv[iproc].push_back(c);
1625 } else {
1626 amrex::Abort("SpMatrix::update_remote_col_index: how did this happen?");
1627 }
1628 }
1629
1630 // Now we convert the remote indices from global to local.
1631 std::map<Long,Long> gtol;
1632 for (Long i = 0, N = Long(m_remote_cols_v.size()); i < N; ++i) {
1633 gtol[m_remote_cols_v[i]] = i;
1634 }
1635
    // Apply the global->local mapping to csrr.col_index, staging through
    // pinned host memory when the data lives on the device.
1636#ifdef AMREX_USE_GPU
1637 if (in_device_memory) {
1638 Gpu::PinnedVector<Long> host_col_index(csrr.nnz);
1640 csrr.col_index.begin(),
1641 csrr.col_index.end(),
1642 host_col_index.begin());
1644 for (auto& c : host_col_index) {
1645 c = gtol[c];
1646 }
1648 host_col_index.begin(),
1649 host_col_index.end(),
1650 csrr.col_index.begin());
1652 } else
1653#endif
1654 {
1655 for (auto& c : csrr.col_index) {
1656 c = gtol[c];
1657 }
1658 }
1659}
1660
1661template <typename T, template<typename> class Allocator>
// Compute m_num_neighbors: the number of ranks that need data from this
// rank, i.e. ranks whose matrices reference columns owned here. Each rank
// contributes a 0/1 connection flag per peer (based on m_remote_cols_vv),
// and MPI_Reduce_scatter with counts of 1 sums, for each rank, the flags
// that the other ranks hold about it.
// NOTE(review): the signature line is missing from this extracted listing;
// callers invoke it as set_num_neighbors() — verify the exact signature
// against the original header.
1663{
    // Already computed (m_num_neighbors starts negative as a sentinel).
1664 if (m_num_neighbors >= 0) { return; }
1665
1666 int const nprocs = ParallelContext::NProcsSub();
1667 auto const mpi_int = ParallelDescriptor::Mpi_typemap<int>::type(); // NOLINT(readability-qualified-auto)
1668 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1669
    // connection[iproc] == 1 iff this rank needs columns owned by iproc.
1670 amrex::Vector<int> connection(nprocs);
1671 for (int iproc = 0; iproc < nprocs; ++iproc) {
1672 connection[iproc] = m_remote_cols_vv[iproc].empty() ? 0 : 1;
1673 }
    // Each rank receives exactly one summed element of the flag vector.
1674 amrex::Vector<int> reduce_scatter_counts(nprocs,1);
1675 m_num_neighbors = 0;
1676 BL_MPI_REQUIRE(MPI_Reduce_scatter
1677 (connection.data(), &m_num_neighbors, reduce_scatter_counts.data(),
1678 mpi_int, MPI_SUM, mpi_comm));
1679}
1680
1681template <typename T, template<typename> class Allocator>
// One-time setup for matrix-vector halo exchange: split the matrix into
// local/remote parts, then negotiate with the other ranks which vector
// entries each rank must send/receive. After this, m_comm_mv holds the
// send/recv peers, counts, and the (device) gather indices, and
// m_comm_mv.prepared is set so the work is done only once.
// NOTE(review): the signature line is missing from this extracted listing;
// the index gives it as
//   void SpMatrix<T,Allocator>::prepare_comm_mv(AlgPartition const& col_partition)
1683{
1684 if (m_comm_mv.prepared) { return; }
1685
1686 // This function needs to be safe when nnz is zero.
1687
1688 this->split_csr(col_partition);
1689
1690 int const nprocs = ParallelContext::NProcsSub();
1691 auto const mpi_tag = ParallelDescriptor::SeqNum();
1692 auto const mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type(); // NOLINT(readability-qualified-auto)
1693 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1694
1695 if (m_num_neighbors < 0) { set_num_neighbors(); }
1696
    // Tell each owning rank which of its columns this rank needs; those are
    // exactly the entries this rank will *receive* during SpMV.
1697 Vector<MPI_Request> mpi_requests;
1698 mpi_requests.reserve(nprocs);
1699 for (int iproc = 0; iproc < nprocs; ++iproc) {
1700 if ( ! m_remote_cols_vv[iproc].empty()) {
1701 mpi_requests.push_back(MPI_REQUEST_NULL);
1702 auto const sz = m_remote_cols_vv[iproc].size();
            // MPI counts are int; fail loudly rather than truncate.
1703 if (sz > static_cast<Long>(std::numeric_limits<int>::max())) {
1704 amrex::Abort("SpMatrix::prepare_comm_mv: remote column payload exceeds MPI int count range.");
1705 }
1706 auto const msg_count = static_cast<int>(sz);
1707 // I need to let other processes know what I need from them.
1708 BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[iproc].data(),
1709 msg_count,
1710 mpi_long, iproc, mpi_tag, mpi_comm,
1711 &(mpi_requests.back())));
1712 m_comm_mv.recv_from.push_back(iproc);
1713 m_comm_mv.recv_counts.push_back(msg_count);
1714 }
1715 }
1716
1717 m_comm_mv.total_counts_recv = Long(m_remote_cols_v.size());
1718
    // m_num_neighbors ranks will request entries from this rank; probe for
    // their messages to learn sender and size before receiving.
1719 Vector<Vector<Long>> send_indices(m_num_neighbors);
1720 m_comm_mv.total_counts_send = 0;
1721 for (int isend = 0; isend < m_num_neighbors; ++isend) {
1722 MPI_Status mpi_status;
1723 BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
1724 int receiver = mpi_status.MPI_SOURCE;
1725 int count;
1726 BL_MPI_REQUIRE(MPI_Get_count(&mpi_status, mpi_long, &count));
1727 m_comm_mv.send_to.push_back(receiver);
1728 m_comm_mv.send_counts.push_back(count);
1729 send_indices[isend].resize(count);
1730 BL_MPI_REQUIRE(MPI_Recv(send_indices[isend].data(), count, mpi_long,
1731 receiver, mpi_tag, mpi_comm, &mpi_status));
1732 m_comm_mv.total_counts_send += count;
1733 }
1734
    // Flatten the per-peer index lists and upload them to the device for
    // the gather in pack_buffer_mv.
1735 m_comm_mv.send_indices.resize(m_comm_mv.total_counts_send);
1736 Gpu::PinnedVector<Long> send_indices_all;
1737 send_indices_all.reserve(m_comm_mv.total_counts_send);
1738 for (auto const& vl : send_indices) {
1739 for (auto x : vl) {
1740 send_indices_all.push_back(x);
1741 }
1742 }
1743 Gpu::copyAsync(Gpu::hostToDevice, send_indices_all.begin(), send_indices_all.end(),
1744 m_comm_mv.send_indices.begin());
    // NOTE(review): a line (likely Gpu::streamSynchronize();) appears to be
    // dropped here (Doxygen line 1745) — send_indices_all is a local whose
    // lifetime ends at return. Verify against the original header.
1746
1747 if (! mpi_requests.empty()) {
1748 Vector<MPI_Status> mpi_statuses(mpi_requests.size());
1749 BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
1750 mpi_statuses.data()));
1751 }
1752
1753 m_comm_mv.prepared = true;
1754}
1755
1756template <typename T, template<typename> class Allocator>
// Gather the vector entries that other ranks need into the send buffer:
// send_buffer[i] = v(send_indices[i]) for every negotiated send index
// (see prepare_comm_mv). Runs as a device kernel / OpenMP loop.
// NOTE(review): the signature line is missing from this extracted listing;
// the index gives it as
//   void SpMatrix<T,Allocator>::pack_buffer_mv(AlgVector<T,AllocT> const& v)
1758{
1759 auto* pdst = m_comm_mv.send_buffer;
1760 auto* pidx = m_comm_mv.send_indices.data();
1761 auto const& vv = v.view();
1762 auto const nsends = Long(m_comm_mv.send_indices.size());
1763 ParallelForOMP(nsends, [=] AMREX_GPU_DEVICE (Long i)
1764 {
1765 pdst[i] = vv(pidx[i]);
1766 });
1767}
1768
1769template <typename T, template<typename> class Allocator>
// Accumulate the off-diagonal (remote) contribution into the SpMV result:
// for each non-empty remote row i, add
//   sum_j m_csr_remote(i,j) * recv_buffer[col(j)]
// into v at the local row given by m_ri_rtol[i] (the trimmed-row to
// local-row map built in split_csr).
// NOTE(review): the signature line and the ParallelForOMP opener (Doxygen
// lines 1770 and 1784) are missing from this extracted listing; the index
// gives the signature as
//   void SpMatrix<T,Allocator>::unpack_buffer_mv(AlgVector<T,AllocT>& v)
1771{
1772 auto const& csr = m_csr_remote;
1773 if (csr.nnz > 0) {
1774 T const* AMREX_RESTRICT mat = csr.mat.data();
1775 auto const* AMREX_RESTRICT col = csr.col_index.data();
1776 auto const* AMREX_RESTRICT row = csr.row_offset.data();
1777
1778 auto const* rtol = m_ri_rtol.data();
1779
        // px: halo values received from other ranks; py: local result vector.
1780 auto const* AMREX_RESTRICT px = m_comm_mv.recv_buffer;
1781 auto * AMREX_RESTRICT py = v.data();
1782
        // Number of (trimmed) remote rows that actually have nonzeros.
1783 auto const nrr = Long(csr.row_offset.size())-1;
1785 {
1786 T r = 0;
1787 for (Long j = row[i]; j < row[i+1]; ++j) {
1788 r += mat[j] * px[col[j]];
1789 }
1790 py[rtol[i]] += r;
1791 });
1792 }
1793}
1794
1795template <typename T, template<typename> class Allocator>
// Assemble this (transpose) matrix's remote part from the blocked CSR
// pieces received in ctr (see startComm_tr/finishComm_tr): merge the per-
// sender blocks row-by-row into one CSR keyed by unique global row index,
// rebuild the local<->remote row maps, localize the column indices, and
// (on GPU) upload the assembled CSR to device memory.
// NOTE(review): the first signature line is missing from this extracted
// listing; the index gives it as
//   void SpMatrix<T,Allocator>::unpack_buffer_tr(CommTR const& ctr,
1797 AlgPartition const& col_partition)
1798{
    // Mark as already split: the received data is the remote part directly.
1799 m_split = true;
1800 m_col_partition = col_partition;
1801 m_col_begin = m_col_partition[ParallelDescriptor::MyProc() ];
1802 m_col_end = m_col_partition[ParallelDescriptor::MyProc()+1];
1803
1804 m_ri_ltor.resize(m_csr.nrows(), -1);
1805 m_remote_cols_vv.resize(ParallelDescriptor::NProcs());
1806
1807 auto nnz = ctr.total_counts_recv[0];
1808 if (nnz == 0) { return; }
1809
1810 m_nnz += nnz;
1811 auto nb = int(ctr.recv_from.size()); // # of blocked CSRs to be merged
1812 auto total_local_rows = ctr.total_counts_recv[1];
1813
1814 // Build compressed row index map
    // NOTE(review): the declaration of ri_map (Doxygen line 1815, presumably
    // constructed from ctr.recv_buffer_idx_map) is missing from this
    // extracted listing — verify against the original header.
1816 ctr.recv_buffer_idx_map + total_local_rows);
1817 RemoveDuplicates(ri_map);
1818 Long nrows = ri_map.size(); // # of unique rows.
1819
    // On GPU the merge is done in a host-side staging CSR; see the
    // duplicateCSR upload at the end. (The staging declaration at Doxygen
    // line 1821 is missing from this listing.)
1820#ifdef AMREX_USE_GPU
1822#else
1823 auto& csrr = m_csr_remote;
1824#endif
1825 csrr.mat.resize(nnz);
1826 csrr.col_index.resize(nnz);
1827 csrr.row_offset.resize(nrows+1);
1828 csrr.nnz = nnz;
1829
1830 // Count nnz per compressed row
1831 Vector<int> row_nnz(nrows, 0);
1832 for (int i = 0; i < nb; ++i) {
1833 auto nrow_i = ctr.recv_counts[i][1];
1834 Long const* row_offset = ctr.recv_buffer_row_offset
1835 + ctr.recv_buffer_offset[i][2];
1836 Long const* idx_map = ctr.recv_buffer_idx_map
1837 + ctr.recv_buffer_offset[i][3];
1838 AMREX_ASSERT((row_offset[nrow_i] - row_offset[0]) == ctr.recv_counts[i][0]);
1839
        // idx_map and ri_map are both sorted, so p only ever advances.
1840 Long p = 0; // index into ri_map
1841 for (int lr = 0; lr < nrow_i; ++lr) {
1842 Long const gr = idx_map[lr];
1843 while (p < nrows && ri_map[p] < gr) { ++p; }
1844 AMREX_ASSERT(p < nrows && ri_map[p] == gr);
1845 // p is now compressed row index
1846 row_nnz[p] += int(row_offset[lr+1] - row_offset[lr]);
1847 }
1848 }
1849
1850 csrr.row_offset[0] = 0;
1851 std::partial_sum(row_nnz.begin(), row_nnz.end(), csrr.row_offset.begin()+1);
1852 AMREX_ASSERT(csrr.nnz == csrr.row_offset.back());
1853
1854 auto rowpos = csrr.row_offset; // make a copy to keep track of offset
1855
    // Second pass: copy each sender's row segments into their merged slots.
1856 for (int i = 0; i < nb; ++i) {
1857 auto nrow_i = ctr.recv_counts[i][1];
1858 T const* mat = ctr.recv_buffer_mat
1859 + ctr.recv_buffer_offset[i][0];
1860 Long const* col_index = ctr.recv_buffer_col_index
1861 + ctr.recv_buffer_offset[i][1];
1862 Long const* row_offset = ctr.recv_buffer_row_offset
1863 + ctr.recv_buffer_offset[i][2];
1864 Long const* idx_map = ctr.recv_buffer_idx_map
1865 + ctr.recv_buffer_offset[i][3];
1866
1867 Long p = 0; // index into ri_map
1868 for (int lr = 0; lr < nrow_i; ++lr) {
1869 Long const gr = idx_map[lr];
1870 while (p < nrows && ri_map[p] < gr) { ++p; }
1871 AMREX_ASSERT(p < nrows && ri_map[p] == gr);
1872 // p is now compressed row index
1873 auto os_src = row_offset[lr] - row_offset[0];
1874 auto nvals = row_offset[lr+1] - row_offset[lr];
1875 auto os_dst = rowpos[p];
1876 std::memcpy(csrr. mat.data()+os_dst, mat+os_src,
1877 sizeof(T) *nvals);
1878 std::memcpy(csrr.col_index.data()+os_dst, col_index+os_src,
1879 sizeof(Long)*nvals);
1880
1881 rowpos[p] += nvals;
1882 }
1883 }
1884
    // Build the remote->local and local->remote row maps: ri_map holds
    // global row indices; subtract row_begin to localize.
1885 m_ri_rtol.resize(nrows);
1886 Gpu::copyAsync(Gpu::hostToDevice, ri_map.begin(), ri_map.end(), m_ri_rtol.begin());
1887 {
1888 auto row_begin = m_row_begin;
1889 auto* AMREX_RESTRICT ltor = m_ri_ltor.data();
1890 auto* AMREX_RESTRICT rtol = m_ri_rtol.data();
1891 ParallelForOMP(nrows, [=] AMREX_GPU_DEVICE (Long i) {
1892 rtol[i] -= row_begin;
1893 ltor[rtol[i]] = i;
1894 });
1895 }
1896
1897 // The column index in csrr is still global.
1898 update_remote_col_index(csrr, false);
1899
1900#ifdef AMREX_USE_GPU
1901 amrex::duplicateCSR(Gpu::hostToDevice, m_csr_remote, csrr);
1903#endif
1904}
1905
1906#endif
1907
1908}
1909
1910#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_RESTRICT
Definition AMReX_Extension.H:32
#define AMREX_CUSPARSE_SAFE_CALL(call)
Definition AMReX_GpuError.H:101
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1139
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
Definition AMReX_AlgPartition.H:21
Long numGlobalRows() const
Total number of rows covered by the partition.
Definition AMReX_AlgPartition.H:47
Distributed dense vector that mirrors the layout of an AlgPartition.
Definition AMReX_AlgVector.H:29
Long numLocalRows() const
Number of entries stored on this rank.
Definition AMReX_AlgVector.H:74
T const * data() const
Definition AMReX_AlgVector.H:85
void define(Long global_size)
Resize/repartition the vector to span global_size rows.
Definition AMReX_AlgVector.H:257
Table1D< T const, Long > view() const
Definition AMReX_AlgVector.H:94
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
void reserve(size_type a_capacity, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:811
size_type size() const noexcept
Definition AMReX_PODVector.H:648
void shrink_to_fit()
Definition AMReX_PODVector.H:818
iterator begin() noexcept
Definition AMReX_PODVector.H:674
void resize(size_type a_new_size, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:728
iterator end() noexcept
Definition AMReX_PODVector.H:678
T * data() noexcept
Definition AMReX_PODVector.H:666
void push_back(const T &a_value)
Definition AMReX_PODVector.H:629
Distributed CSR matrix that manages storage and GPU-friendly partitions.
Definition AMReX_SpMatrix.H:61
void finishComm_tr(SpMatrix< T, Allocator > &AT)
Complete transpose communication, writing the assembled matrix into AT.
Definition AMReX_SpMatrix.H:1352
void split_csr(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1393
Long globalRowBegin() const
Inclusive global index begin.
Definition AMReX_SpMatrix.H:196
void define_and_filter_doit(T const *mat, Long const *col_index, Long nentries, Long const *row_offset)
Private helper (exposed for CUDA) that copies/filters CSR arrays into device storage.
Definition AMReX_SpMatrix.H:829
Long * rowOffset()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:213
void sortCSR()
Definition AMReX_SpMatrix.H:821
void pack_buffer_mv(AlgVector< T, AllocT > const &v)
Definition AMReX_SpMatrix.H:1757
void unpack_buffer_mv(AlgVector< T, AllocT > &v)
Definition AMReX_SpMatrix.H:1770
void update_remote_col_index(C &csrr, bool in_device_memory)
Definition AMReX_SpMatrix.H:1573
void startComm_tr(AlgPartition const &col_partition)
Initiate communication required to build the transpose with column partition col_partition.
Definition AMReX_SpMatrix.H:1142
void define_doit(int nnz_per_row)
Private helper (exposed for CUDA) that allocates fixed-connectivity matrices with nnz_per_row entries...
Definition AMReX_SpMatrix.H:753
T * data()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:201
friend class AMG
Definition AMReX_SpMatrix.H:297
Long numGlobalRows() const
Global row count.
Definition AMReX_SpMatrix.H:191
SpMatrix & operator=(SpMatrix const &)=delete
friend SpMatrix< U, M > transpose(SpMatrix< U, M > const &A, AlgPartition col_partition)
~SpMatrix()=default
T value_type
Definition AMReX_SpMatrix.H:63
Allocator< U > allocator_type
Definition AMReX_SpMatrix.H:64
Long globalRowEnd() const
Exclusive global index end.
Definition AMReX_SpMatrix.H:198
Long numLocalNonZeros() const
Number of nonzeros stored locally.
Definition AMReX_SpMatrix.H:193
struct amrex::SpMatrix::CommMV m_comm_mv
AlgVector< T, AllocT > rowSum() const
Sum the values in each local row and return the result as an AlgVector.
Definition AMReX_SpMatrix.H:980
AlgPartition const & columnPartition() const
Return the column partition used for matrix-vector and matrix-matrix multiplications.
Definition AMReX_SpMatrix.H:186
void printToFile(std::string const &file) const
Definition AMReX_SpMatrix.H:869
ParCsr< T const > const_parcsr() const
Const-qualified alias of parcsr() for convenience.
Definition AMReX_SpMatrix.H:1028
SpMatrix()=default
SpMatrix(SpMatrix const &)=delete
ParCsr< T > parcsr()
Build GPU-friendly CSR views split into diagonal/off-diagonal blocks.
Definition AMReX_SpMatrix.H:1004
SpMatrix(SpMatrix &&)=default
Long numLocalRows() const
Number of rows owned by this rank.
Definition AMReX_SpMatrix.H:189
AlgPartition const & partition() const
Row partition describing how matrix rows are distributed across ranks.
Definition AMReX_SpMatrix.H:177
AlgVector< T, AllocT > const & diagonalVector() const
Return (and cache) the diagonal entries of a square matrix.
Definition AMReX_SpMatrix.H:954
Long * columnIndex()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:207
void finishComm_mv(AlgVector< T, AllocT > &y)
Finish halo exchanges and accumulate contributions into y.
Definition AMReX_SpMatrix.H:1108
Allocator< T > AllocT
Definition AMReX_SpMatrix.H:67
struct amrex::SpMatrix::CommTR m_comm_tr
void prepare_comm_mv(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1682
void startComm_mv(AlgVector< T, AllocT > const &x)
Prepare halo exchanges for a subsequent SpMV using x as the source vector.
Definition AMReX_SpMatrix.H:1059
void unpack_buffer_tr(CommTR const &ctr, AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1796
friend void SpMV(AlgVector< U, N > &y, SpMatrix< U, M > const &A, AlgVector< U, N > const &x)
void setVal(F const &f, CsrSorted is_sorted)
Initialize matrix entries using a row-wise functor.
Definition AMReX_SpMatrix.H:934
void define(AlgPartition partition, int nnz_per_row)
Allocate storage for a default-constructed matrix with a fixed number of nonzeros per row.
Definition AMReX_SpMatrix.H:731
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Long size() const noexcept
Definition AMReX_Vector.H:53
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:319
Arena * The_Comms_Arena()
Definition AMReX_Arena.cpp:865
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:845
Arena * The_Arena()
Definition AMReX_Arena.cpp:805
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
int NProcs() noexcept
Definition AMReX_ParallelDescriptor.H:255
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int NProcsSub() noexcept
number of ranks in current frame
Definition AMReX_ParallelContext.H:74
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition AMReX_ParallelDescriptor.H:696
static constexpr struct amrex::Scan::Type::Exclusive exclusive
static constexpr RetSum retSum
Definition AMReX_Scan.H:32
static constexpr int MPI_REQUEST_NULL
Definition AMReX_ccse-mpi.H:57
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
amrex::ArenaAllocator< T > DefaultAllocator
Definition AMReX_GpuAllocators.H:205
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
void duplicateCSR(C c, CSR< T, AD > &dst, CSR< T, AS > const &src)
Copy CSR buffers between memory spaces asynchronously.
Definition AMReX_CSR.H:116
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
void RemoveDuplicates(Vector< T > &vec)
Definition AMReX_Vector.H:211
V< Long > row_offset
Definition AMReX_CSR.H:52
Long nnz
Definition AMReX_CSR.H:53
CsrView< T > view()
Mutable view of the underlying buffers.
Definition AMReX_CSR.H:77
void resize(Long num_rows, Long num_non_zeros)
Resize the storage to accommodate num_rows and num_non_zeros entries.
Definition AMReX_CSR.H:69
V< Long > col_index
Definition AMReX_CSR.H:51
V< T > mat
Definition AMReX_CSR.H:50
Sorted CSR means for each row the column indices are sorted.
Definition AMReX_SpMatrix.H:45
bool b
Definition AMReX_SpMatrix.H:46
Valid CSR means all entries are valid. It may be sorted or unsorted.
Definition AMReX_SpMatrix.H:51
bool b
Definition AMReX_SpMatrix.H:52
Lightweight non-owning CSR view that can point to host or device buffers.
Definition AMReX_CSR.H:33
GPU-ready non-owning CSR data container.
Definition AMReX_SpMatrix.H:35
Long const *__restrict__ col_map
Definition AMReX_SpMatrix.H:41
Long const *__restrict__ row_map
Definition AMReX_SpMatrix.H:40
CsrView< T > csr1
Definition AMReX_SpMatrix.H:37
Long col_begin
Definition AMReX_SpMatrix.H:39
Long row_begin
Definition AMReX_SpMatrix.H:38
CsrView< T > csr0
Definition AMReX_SpMatrix.H:36
Definition AMReX_SpMatrix.H:381
T * send_buffer
Definition AMReX_SpMatrix.H:390
bool prepared
Definition AMReX_SpMatrix.H:397
Vector< int > recv_counts
Definition AMReX_SpMatrix.H:387
Long total_counts_recv
Definition AMReX_SpMatrix.H:395
Vector< int > recv_from
Definition AMReX_SpMatrix.H:386
T * recv_buffer
Definition AMReX_SpMatrix.H:394
Vector< int > send_counts
Definition AMReX_SpMatrix.H:383
Long total_counts_send
Definition AMReX_SpMatrix.H:391
Gpu::DeviceVector< Long > send_indices
Definition AMReX_SpMatrix.H:384
Vector< MPI_Request > recv_reqs
Definition AMReX_SpMatrix.H:393
Vector< int > send_to
Definition AMReX_SpMatrix.H:382
Vector< MPI_Request > send_reqs
Definition AMReX_SpMatrix.H:389
Definition AMReX_SpMatrix.H:400
Vector< std::array< int, 2 > > send_counts
Definition AMReX_SpMatrix.H:404
Long * recv_buffer_col_index
Definition AMReX_SpMatrix.H:417
Vector< MPI_Request > send_reqs
Definition AMReX_SpMatrix.H:405
Vector< MPI_Request > recv_reqs
Definition AMReX_SpMatrix.H:409
Vector< int > send_to
Definition AMReX_SpMatrix.H:403
std::array< Long, 2 > total_counts_recv
Definition AMReX_SpMatrix.H:411
Long * recv_buffer_row_offset
Definition AMReX_SpMatrix.H:418
Vector< std::array< int, 2 > > recv_counts
Definition AMReX_SpMatrix.H:408
CsrView< T > csrt
Definition AMReX_SpMatrix.H:401
T * recv_buffer_mat
Definition AMReX_SpMatrix.H:416
Vector< std::array< Long, 4 > > recv_buffer_offset
Definition AMReX_SpMatrix.H:412
Vector< int > recv_from
Definition AMReX_SpMatrix.H:407
Long * recv_buffer_idx_map
Definition AMReX_SpMatrix.H:419
Definition AMReX_ccse-mpi.H:55