Block-Structured AMR Software Framework
AMReX_SpMatrix.H
1#ifndef AMREX_SP_MATRIX_H_
2#define AMREX_SP_MATRIX_H_
3#include <AMReX_Config.H>
4
6#include <AMReX_AlgVector.H>
7#include <AMReX_CSR.H>
8#include <AMReX_Gpu.H>
9#include <AMReX_Scan.H>
10
11#if defined(AMREX_USE_CUDA)
12# include <cusparse.h>
13#elif defined(AMREX_USE_HIP)
14# include <rocsparse/rocsparse.h>
15#elif defined(AMREX_USE_SYCL)
16# include <mkl_version.h>
17# include <oneapi/mkl/spblas.hpp>
18#endif
19
20#include <fstream>
21#include <string>
22#include <type_traits>
23#include <unordered_map>
24
25namespace amrex {
26
27//! GPU-ready non-owning CSR data container.
28template <typename T>
29struct ParCsr {
30 CsrView<T> csr0; // diagonal part
31 CsrView<T> csr1; // off-diagonal part
32 Long row_begin = 0; // global row index begin
33 Long col_begin = 0; // global col index begin
34 Long const* AMREX_RESTRICT row_map = nullptr; // mapping from csr0's row to csr1
35 Long const* AMREX_RESTRICT col_map = nullptr; // mapping from csr1's col to global
36};
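// Editor's sketch (not part of the original header): how a ParCsr view is
// typically consumed once the matrix has been split and the remote vector
// values have been gathered. The function name and the x_local/x_remote
// pointers are hypothetical; csr0 holds the diagonal (locally indexed) block,
// csr1 the off-diagonal block, and row_map translates a csr0 row index into a
// csr1 row index (-1 if that row has no remote entries).
#if 0
template <typename T>
T parcsr_dot_row (ParCsr<T const> const& a, Long i,
                  T const* x_local, T const* x_remote)
{
    T r = 0;
    for (Long k = a.csr0.row_offset[i]; k < a.csr0.row_offset[i+1]; ++k) {
        r += a.csr0.mat[k] * x_local[a.csr0.col_index[k]]; // local column index
    }
    if (a.csr1.nnz > 0 && a.row_map[i] >= 0) {
        Long ir = a.row_map[i];                            // row in the remote block
        for (Long k = a.csr1.row_offset[ir]; k < a.csr1.row_offset[ir+1]; ++k) {
            r += a.csr1.mat[k] * x_remote[a.csr1.col_index[k]];
        }
    }
    return r; // contribution to y[i] in y = A*x
}
#endif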
37
38//! Sorted CSR means for each row the column indices are sorted.
39struct CsrSorted {
40 bool b = true;
41 explicit operator bool() const { return b; }
42};
43
44//! Valid CSR means all entries are valid. It may be sorted or unsorted.
45struct CsrValid {
46 bool b = true;
47 explicit operator bool() const { return b; }
48};
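// Editor's note (not part of the original header): CsrSorted and CsrValid are
// tag arguments that tell define()/setVal() whether the caller's CSR data can
// be used as-is. A hypothetical call handing over valid but unsorted data:
#if 0
A.define(partition, d_mat, d_col_index, nnz, d_row_offset,
         CsrSorted{false},   // column indices within each row still need sorting
         CsrValid{true});    // no negative column indices or explicit zeros to filter
#endif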
49
50template <typename T, template<typename> class Allocator = DefaultAllocator>
51class SpMatrix
52{
53public:
54 using value_type = T;
55 template <class U> using allocator_type = Allocator<U>;
56 template <class U> using container_type = PODVector<U,Allocator<U> >;
57 using csr_type = CSR<T,container_type>;
58 using AllocT = Allocator<T>;
59
60 SpMatrix () = default;
61
80 SpMatrix (AlgPartition partition, int nnz_per_row);
81
93
94 SpMatrix (SpMatrix const&) = delete;
95 SpMatrix& operator= (SpMatrix const&) = delete;
96
97 SpMatrix (SpMatrix &&) = default;
98 SpMatrix& operator= (SpMatrix &&) = default;
99
100 ~SpMatrix () = default;
101
116 void define (AlgPartition partition, int nnz_per_row);
117
150 void define (AlgPartition partition, T const* mat, Long const* col_index,
151 Long nentries, Long const* row_offset, CsrSorted is_sorted,
152 CsrValid is_valid);
153
165 void define (AlgPartition partition, csr_type csr, CsrSorted is_sorted);
166
167 [[nodiscard]] AlgPartition const& partition () const { return m_partition; }
168
176 [[nodiscard]] AlgPartition const& columnPartition () const { return m_col_partition; }
177
178 [[nodiscard]] Long numLocalRows () const { return m_row_end - m_row_begin; }
179 [[nodiscard]] Long numGlobalRows () const { return m_partition.numGlobalRows(); }
180 [[nodiscard]] Long numLocalNonZeros () const { return m_nnz; }
181
183 [[nodiscard]] Long globalRowBegin () const { return m_row_begin; }
185 [[nodiscard]] Long globalRowEnd () const { return m_row_end; }
186
188 [[nodiscard]] T* data () {
189 AMREX_ALWAYS_ASSERT(m_split == false);
190 return m_csr.mat.data();
191 }
192
194 [[nodiscard]] Long* columnIndex () {
195 AMREX_ALWAYS_ASSERT(m_split == false);
196 return m_csr.col_index.data();
197 }
198
200 [[nodiscard]] Long* rowOffset () {
201 AMREX_ALWAYS_ASSERT(m_split == false);
202 return m_csr.row_offset.data();
203 }
204
205 /*
206 * \brief Print the matrix to files.
207 *
208 * Each process writes its local portion of the matrix to a separate
209 * file. This function is provided for debugging purposes. Using it in
210 * large-scale runs may overwhelm the file system.
211 *
212 * File format:
213 * - The first line contains three integers describing the local matrix:
214 * the global row begin index, the global row end index, and the
215 * number of local nonzeros.
216 * - This is followed by one line per nonzero entry. Each line contains:
217 * the global row index, global column index, and matrix value.
218 *
219 * \param file Base file name. The full name on process `i` is `{file}.{i}`.
220 */
221 void printToFile (std::string const& file) const;
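    // Editor's example (not from the original source), assuming this rank owns
    // global rows [4,6) with four nonzeros; the file "mat.1" would contain:
    //   4 6 4
    //   4 4 2
    //   4 5 -1
    //   5 4 -1
    //   5 5 2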
222
245 template <typename F>
246 void setVal (F const& f, CsrSorted is_sorted);
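    // Editor's sketch (not part of the original header): filling a matrix that
    // was created with define(partition, 3). The functor receives the global
    // row index plus pointers to that row's nnz_per_row column/value slots.
    // `Real` and the 1D stencil are illustrative; boundary rows would need to
    // clip the stencil.
#if 0
    A.setVal([=] AMREX_GPU_DEVICE (Long grow, Long* cols, Real* vals)
    {
        cols[0] = grow-1;  vals[0] = Real(-1.0);
        cols[1] = grow;    vals[1] = Real( 2.0);
        cols[2] = grow+1;  vals[2] = Real(-1.0);
    }, CsrSorted{true});
#endif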
247
250 void sortCSR ();
251
252 //! Return diagonal elements in a square matrix.
253 [[nodiscard]] AlgVector<T,AllocT> const& diagonalVector () const;
254
255 //! Return row-sum vector.
256 [[nodiscard]] AlgVector<T,AllocT> rowSum () const;
257
258 [[nodiscard]] ParCsr<T > parcsr () ;
259 [[nodiscard]] ParCsr<T const> parcsr () const;
260 [[nodiscard]] ParCsr<T const> const_parcsr () const;
261
262 template <typename U, template<typename> class M, typename N> friend
263 void SpMV (AlgVector<U,N>& y, SpMatrix<U,M> const& A, AlgVector<U,N> const& x);
264
265 template <typename U, template<typename> class M> friend
266 SpMatrix<U,M> transpose (SpMatrix<U,M> const& A, AlgPartition col_partition);
267
268 template <typename U> friend class AMG;
269
271 void define_doit (int nnz_per_row);
272
274 template <typename I>
275 void define_and_filter_doit (T const* mat, Long const* col_index,
276 Long nentries, Long const* row_offset);
277
278 void startComm_mv (AlgVector<T,AllocT> const& x);
279 void finishComm_mv (AlgVector<T,AllocT>& y);
280
281 void startComm_tr (AlgPartition const& col_partition);
282 void finishComm_tr (SpMatrix<T,Allocator>& AT);
283
284private:
285
286 void set_num_neighbors ();
287
288 AlgPartition m_partition;
289 AlgPartition m_col_partition;
290 Long m_row_begin = 0;
291 Long m_row_end = 0;
292 Long m_col_begin = 0;
293 Long m_col_end = 0;
294 Long m_nnz = 0;
295 csr_type m_csr;
296
297 mutable AlgVector<T,AllocT> m_diagonal;
298
299 bool m_split = false; // Has the matrix been split into diagonal and off-diagonal parts?
300
301#ifdef AMREX_USE_MPI
302 csr_type m_csr_remote;
303
304 // It should be noted that m_csr and m_csr_remote may have different
305 // numbers of rows, because some rows may be purely local and do not
306 // appear in m_csr_remote. Thus, we need a mapping from the local row index
307 // in m_csr_remote to the local row index in m_csr. Then we will be able to
308 // know its global row index by adding m_row_begin. For example,
309 // m_row_begin + m_ri_rtol[i] is the global index for local row i in
310 // m_csr_remote.
311 container_type<Long> m_ri_rtol; // size: m_csr_remote.nrows()
312
313 // This is local row index mapping from m_csr to m_csr_remote. -1 means
314 // the row does not exist in m_csr_remote.
315 container_type<Long> m_ri_ltor; // size: m_csr.nrows()
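    // Editor's example (not from the original source): if this rank has local
    // rows 0..3 in m_csr but only rows 1 and 3 carry remote entries, then
    // m_csr_remote has 2 rows and
    //   m_ri_rtol = {1, 3}           (remote row -> local row)
    //   m_ri_ltor = {-1, 0, -1, 1}   (local row -> remote row, -1 if absent)
    // so m_row_begin + m_ri_rtol[0] is the global index of m_csr_remote's row 0.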
316
317 // For column indices, we also need to be careful with local vs. global,
318 // and there are two types of locals: local in m_csr and local in
319 // m_csr_remote. For m_csr, col_index is the global index if m_split is
320 // false, and it becomes the local index if m_split is true. The
321 // conversion is global_col_index = local_col_index + m_col_begin.
322 //
323 // For m_csr_remote, col_index is also local. For a given col_index j,
324 // m_remote_cols_v[j] gives us the global index.
325 Vector<Long> m_remote_cols_v;
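    // Editor's example (not from the original source): if m_col_begin = 100 and
    // this rank references the remote global columns {40, 97, 205}, then after
    // the split m_remote_cols_v = {40, 97, 205} and m_csr_remote.col_index holds
    // values in {0, 1, 2}, while a column j in m_csr corresponds to the global
    // column j + m_col_begin.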
326#ifdef AMREX_USE_GPU
327 container_type<Long> m_remote_cols_dv;
328#endif
329 // The size of the outer vector is the number of procs. The indices are global.
330 Vector<Vector<Long>> m_remote_cols_vv;
331
332 int m_num_neighbors = -1; // No. of other processes involved in communication
333
334public: // NOLINT Private functions, but public for cuda
335
336 struct CommMV {
337 Vector<int> send_to;
338 Vector<int> send_counts;
339 Gpu::DeviceVector<Long> send_indices;
340
341 Vector<int> recv_from;
342 Vector<int> recv_counts;
343
344 Vector<MPI_Request> send_reqs;
345 T* send_buffer = nullptr;
346 Long total_counts_send = 0;
347
348 Vector<MPI_Request> recv_reqs;
349 T* recv_buffer = nullptr;
350 Long total_counts_recv = 0;
351
352 bool prepared = false;
353 } m_comm_mv;
354
355 struct CommTR {
356 CsrView<T> csrt; // Own the memory inside
357
358 Vector<int> send_to;
359 Vector<std::array<int,2>> send_counts;
360 Vector<MPI_Request> send_reqs;
361
362 Vector<int> recv_from;
363 Vector<std::array<int,2>> recv_counts;
364 Vector<MPI_Request> recv_reqs;
365
366 std::array<Long,2> total_counts_recv = {0,0};
367 Vector<std::array<Long,4>> recv_buffer_offset;
368 // The raw pointers below are owning.
369 // xxxxx TODO GPU: Currently we use pinned memory. In the future, we
370 // may explore device memory for GPU aware MPI.
371 T* recv_buffer_mat = nullptr;
372 Long* recv_buffer_col_index = nullptr;
373 Long* recv_buffer_row_offset = nullptr;
374 Long* recv_buffer_idx_map = nullptr; // local -> global for row of A^T.
375
376 } m_comm_tr;
377
378 void split_csr (AlgPartition const& col_partition);
379 template <typename C>
380 void update_remote_col_index (C& csrr, bool in_device_memory);
381 void prepare_comm_mv (AlgPartition const& col_partition);
382 void pack_buffer_mv (AlgVector<T,AllocT> const& v);
383 void unpack_buffer_mv (AlgVector<T,AllocT>& v);
384 void unpack_buffer_tr (CommTR const& ctr, AlgPartition const& col_partition);
385
386#endif
387};
388
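// Editor's sketch (not part of the original header): typical end-to-end use of
// SpMatrix with AlgVector and the friend SpMV declared above. `Real`, the
// partition construction, and the row filler are illustrative placeholders.
#if 0
AlgPartition partition = /* ... one contiguous block of rows per process ... */;
SpMatrix<Real> A(partition, 3);                      // reserve 3 nonzeros per row
A.setVal([=] AMREX_GPU_DEVICE (Long grow, Long* cols, Real* vals)
{
    /* ... fill this row's column indices and values ... */
}, CsrSorted{true});
AlgVector<Real,DefaultAllocator<Real>> x(partition), y(partition);
/* ... fill x ... */
SpMV(y, A, x);                                       // y = A*x, includes MPI exchange
#endif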
389namespace detail {
390template <typename T>
391void transpose (CsrView<T> const& csrt, CsrView<T const> const& csr)
392{
393 Long nrows = csr.nrows;
394 Long ncols = csrt.nrows;
395 Long nnz = csr.nnz;
396
397 if (nrows <= 0 || ncols <= 0 || nnz <= 0) {
398 if (ncols > 0) {
399 auto* p = csrt.row_offset;
400 ParallelForOMP(ncols+1, [=] AMREX_GPU_DEVICE (Long i) { p[i] = 0; });
401 }
402 return;
403 }
404
405#ifdef AMREX_USE_GPU
406
407#if defined(AMREX_USE_CUDA)
408
409 cusparseHandle_t handle;
410 AMREX_CUSPARSE_SAFE_CALL(cusparseCreate(&handle));
411 AMREX_CUSPARSE_SAFE_CALL(cusparseSetStream(handle, Gpu::gpuStream()));
412
413 cudaDataType data_type;
414 if constexpr (std::is_same_v<T,float>) {
415 data_type = CUDA_R_32F;
416 } else if constexpr (std::is_same_v<T,double>) {
417 data_type = CUDA_R_64F;
418 } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
419 data_type = CUDA_C_32F;
420 } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
421 data_type = CUDA_C_64F;
422 } else {
423 amrex::Abort("SpMatrix transpose: unsupported data type");
424 }
425
426 AMREX_ALWAYS_ASSERT(nrows < Long(std::numeric_limits<int>::max()) &&
427 ncols < Long(std::numeric_limits<int>::max()) &&
428 nnz < Long(std::numeric_limits<int>::max()));
429
430 auto* csr_col_index = (int*)The_Arena()->alloc(csr.nnz*sizeof(int));
431 auto* csr_row_offset = (int*)The_Arena()->alloc((csr.nrows+1)*sizeof(int));
432 auto* csrt_col_index = (int*)The_Arena()->alloc(csrt.nnz*sizeof(int));
433 auto* csrt_row_offset = (int*)The_Arena()->alloc((csrt.nrows+1)*sizeof(int));
434
435 ParallelFor(std::max(csr.nnz, csr.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
436 {
437 if (i < csr.nnz) {
438 csr_col_index[i] = int(csr.col_index[i]);
439 }
440 if (i < csr.nrows+1) {
441 csr_row_offset[i] = int(csr.row_offset[i]);
442 }
443 });
444
445 std::size_t buffer_size;
446 AMREX_CUSPARSE_SAFE_CALL(
447 cusparseCsr2cscEx2_bufferSize(handle, int(nrows), int(ncols), int(nnz),
448 csr.mat, csr_row_offset, csr_col_index,
449 csrt.mat, csrt_row_offset, csrt_col_index,
450 data_type, CUSPARSE_ACTION_NUMERIC,
451 CUSPARSE_INDEX_BASE_ZERO,
452 CUSPARSE_CSR2CSC_ALG1,
453 &buffer_size));
454
455 auto* pbuffer = (void*)The_Arena()->alloc(buffer_size);
456
457 AMREX_CUSPARSE_SAFE_CALL(
458 cusparseCsr2cscEx2(handle, int(nrows), int(ncols), int(nnz),
459 csr.mat, csr_row_offset, csr_col_index,
460 csrt.mat, csrt_row_offset, csrt_col_index,
461 data_type, CUSPARSE_ACTION_NUMERIC,
462 CUSPARSE_INDEX_BASE_ZERO,
463 CUSPARSE_CSR2CSC_ALG1,
464 pbuffer));
465
466 ParallelFor(std::max(csrt.nnz, csrt.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
467 {
468 if (i < csrt.nnz) {
469 csrt.col_index[i] = csrt_col_index[i];
470 }
471 if (i < csrt.nrows+1) {
472 csrt.row_offset[i] = csrt_row_offset[i];
473 }
474 });
475
477 AMREX_CUSPARSE_SAFE_CALL(cusparseDestroy(handle));
478 The_Arena()->free(pbuffer);
479 The_Arena()->free(csr_row_offset);
480 The_Arena()->free(csr_col_index);
481 The_Arena()->free(csrt_row_offset);
482 The_Arena()->free(csrt_col_index);
483
484#elif defined(AMREX_USE_HIP)
485
486 rocsparse_handle handle;
487 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_create_handle(&handle));
488 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, Gpu::gpuStream()));
489
490 AMREX_ALWAYS_ASSERT(nrows < Long(std::numeric_limits<rocsparse_int>::max()) &&
491 ncols < Long(std::numeric_limits<rocsparse_int>::max()) &&
492 nnz < Long(std::numeric_limits<rocsparse_int>::max()));
493
494 rocsparse_int* csr_col_index;
495 rocsparse_int* csr_row_offset;
496 rocsparse_int* csrt_col_index;
497 rocsparse_int* csrt_row_offset;
498 if (std::is_same_v<rocsparse_int,Long>) {
499 csr_col_index = (rocsparse_int*)csr.col_index;
500 csr_row_offset = (rocsparse_int*)csr.row_offset;
501 csrt_col_index = (rocsparse_int*)csrt.col_index;
502 csrt_row_offset = (rocsparse_int*)csrt.row_offset;
503 } else {
504 csr_col_index = (rocsparse_int*)The_Arena()->alloc(csr.nnz*sizeof(rocsparse_int));
505 csr_row_offset = (rocsparse_int*)The_Arena()->alloc((csr.nrows+1)*sizeof(rocsparse_int));
506 csrt_col_index = (rocsparse_int*)The_Arena()->alloc(csrt.nnz*sizeof(rocsparse_int));
507 csrt_row_offset = (rocsparse_int*)The_Arena()->alloc((csrt.nrows+1)*sizeof(rocsparse_int));
508 ParallelFor(std::max(csr.nnz, csr.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
509 {
510 if (i < csr.nnz) {
511 csr_col_index[i] = rocsparse_int(csr.col_index[i]);
512 }
513 if (i < csr.nrows+1) {
514 csr_row_offset[i] = rocsparse_int(csr.row_offset[i]);
515 }
516 });
517 }
518
519 std::size_t buffer_size;
520 AMREX_ROCSPARSE_SAFE_CALL(
521 rocsparse_csr2csc_buffer_size(handle, rocsparse_int(nrows),
522 rocsparse_int(ncols), rocsparse_int(nnz),
523 csr_row_offset, csr_col_index,
524 rocsparse_action_numeric,
525 &buffer_size));
526
527 auto* pbuffer = (void*)The_Arena()->alloc(buffer_size);
528
529 if constexpr (std::is_same_v<T,float>) {
530 AMREX_ROCSPARSE_SAFE_CALL(
531 rocsparse_scsr2csc(handle, rocsparse_int(nrows),
532 rocsparse_int(ncols), rocsparse_int(nnz),
533 csr.mat, csr_row_offset, csr_col_index,
534 csrt.mat, csrt_col_index, csrt_row_offset,
535 rocsparse_action_numeric,
536 rocsparse_index_base_zero,
537 pbuffer));
538 } else if constexpr (std::is_same_v<T,double>) {
539 AMREX_ROCSPARSE_SAFE_CALL(
540 rocsparse_dcsr2csc(handle, rocsparse_int(nrows),
541 rocsparse_int(ncols), rocsparse_int(nnz),
542 csr.mat, csr_row_offset, csr_col_index,
543 csrt.mat, csrt_col_index, csrt_row_offset,
544 rocsparse_action_numeric,
545 rocsparse_index_base_zero,
546 pbuffer));
547 } else if constexpr (std::is_same_v<T,GpuComplex<float>>) {
548 AMREX_ROCSPARSE_SAFE_CALL(
549 rocsparse_ccsr2csc(handle, rocsparse_int(nrows),
550 rocsparse_int(ncols), rocsparse_int(nnz),
551 (rocsparse_float_complex*)csr.mat, csr_row_offset, csr_col_index,
552 (rocsparse_float_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
553 rocsparse_action_numeric,
554 rocsparse_index_base_zero,
555 pbuffer));
556 } else if constexpr (std::is_same_v<T,GpuComplex<double>>) {
557 AMREX_ROCSPARSE_SAFE_CALL(
558 rocsparse_zcsr2csc(handle, rocsparse_int(nrows),
559 rocsparse_int(ncols), rocsparse_int(nnz),
560 (rocsparse_double_complex*)csr.mat, csr_row_offset, csr_col_index,
561 (rocsparse_double_complex*)csrt.mat, csrt_col_index, csrt_row_offset,
562 rocsparse_action_numeric,
563 rocsparse_index_base_zero,
564 pbuffer));
565 } else {
566 amrex::Abort("SpMatrix transpose: unsupported data type");
567 }
568
569 if (! std::is_same_v<rocsparse_int,Long>) {
570 ParallelFor(std::max(csrt.nnz, csrt.nrows+1), [=] AMREX_GPU_DEVICE (Long i)
571 {
572 if (i < csrt.nnz) {
573 csrt.col_index[i] = csrt_col_index[i];
574 }
575 if (i < csrt.nrows+1) {
576 csrt.row_offset[i] = csrt_row_offset[i];
577 }
578 });
579 }
580
582 AMREX_ROCSPARSE_SAFE_CALL(rocsparse_destroy_handle(handle));
583 The_Arena()->free(pbuffer);
584 if (! std::is_same_v<rocsparse_int,Long>) {
585 The_Arena()->free(csr_row_offset);
586 The_Arena()->free(csr_col_index);
587 The_Arena()->free(csrt_row_offset);
588 The_Arena()->free(csrt_col_index);
589 }
590
591#elif defined(AMREX_USE_SYCL)
592
593 mkl::sparse::matrix_handle_t handle_in{};
594 mkl::sparse::matrix_handle_t handle_out{};
595 mkl::sparse::init_matrix_handle(&handle_in);
596 mkl::sparse::init_matrix_handle(&handle_out);
597
598#if defined(INTEL_MKL_VERSION) && (INTEL_MKL_VERSION < 20250300)
600 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols,
601 mkl::index_base::zero, (Long*)csr.row_offset,
602 (Long*)csr.col_index, (T*)csr.mat);
603 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows,
604 mkl::index_base::zero, (Long*)csrt.row_offset,
605 (Long*)csrt.col_index, (T*)csrt.mat);
606#else
607 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_in, nrows, ncols, nnz,
608 mkl::index_base::zero, (Long*)csr.row_offset,
609 (Long*)csr.col_index, (T*)csr.mat);
610 mkl::sparse::set_csr_data(Gpu::Device::streamQueue(), handle_out, ncols, nrows, nnz,
611 mkl::index_base::zero, (Long*)csrt.row_offset,
612 (Long*)csrt.col_index, (T*)csrt.mat);
613#endif
614
615 mkl::sparse::omatcopy(Gpu::Device::streamQueue(), mkl::transpose::trans,
616 handle_in, handle_out);
617
618 mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_in);
619 auto ev = mkl::sparse::release_matrix_handle(Gpu::Device::streamQueue(), &handle_out);
620 ev.wait();
621
622#endif
623
625
626#else
627
628 auto* p = csrt.row_offset;
629
630 ParallelForOMP(ncols+1, [=] AMREX_GPU_DEVICE (Long i) { p[i] = 0; });
631
632 // nonzeros per column
633#ifdef AMREX_USE_OMP
634#pragma omp parallel for
635#endif
636 for (Long i = 0; i < nnz; ++i) {
637 auto col = csr.col_index[i];
638#ifdef AMREX_USE_OMP
639#pragma omp atomic update
640#endif
641 ++p[col+1];
642 }
643
644 // build row_offset for transposed matrix. Also save a copy.
645 Vector<Long> current_pos(ncols+1);
646 current_pos[0] = 0;
647 for (Long i = 0; i < ncols; ++i) {
648 p[i+1] += p[i];
649 current_pos[i+1] = p[i+1];
650 }
651
652 // The following code is not OMP safe. It's difficult to use OMP and
653 // still keep CSR sorted.
654 for (Long i = 0; i < nrows; ++i) {
655 for (Long idx = csr.row_offset[i]; idx < csr.row_offset[i+1]; ++idx) {
656 auto col = csr.col_index[idx];
657 Long dest = current_pos[col]++;
658 csrt.mat[dest] = csr.mat[idx];
659 csrt.col_index[dest] = i;
660 }
661 }
662
663#endif
664}
665}
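// Editor's example (not from the original source): for the 2x3 matrix
//   [ 1 0 2 ]
//   [ 0 3 0 ]
// the input CSR is  mat = {1,2,3}, col_index = {0,2,1}, row_offset = {0,2,3},
// and detail::transpose fills the 3x2 output with
//   mat = {1,3,2}, col_index = {0,1,0}, row_offset = {0,1,2,3}.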
666
667template <typename T, template<typename> class Allocator>
668SpMatrix<T,Allocator>::SpMatrix (AlgPartition partition, int nnz_per_row)
669 : m_partition(std::move(partition)),
670 m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
671 m_row_end(m_partition[ParallelDescriptor::MyProc()+1])
672{
673 define_doit(nnz_per_row);
674}
675
676template <typename T, template<typename> class Allocator>
677SpMatrix<T,Allocator>::SpMatrix (AlgPartition partition, csr_type csr)
678 : m_partition(std::move(partition)),
679 m_row_begin(m_partition[ParallelDescriptor::MyProc()]),
680 m_row_end(m_partition[ParallelDescriptor::MyProc()+1]),
681 m_nnz(csr.nnz),
682 m_csr(std::move(csr))
683{}
684
685template <typename T, template<typename> class Allocator>
686void SpMatrix<T,Allocator>::define (AlgPartition partition, int nnz_per_row)
687{
688 m_partition = std::move(partition);
689 m_row_begin = m_partition[ParallelDescriptor::MyProc()];
690 m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
691 define_doit(nnz_per_row);
692}
693
694template <typename T, template<typename> class Allocator>
695void SpMatrix<T,Allocator>::define (AlgPartition partition, csr_type csr,
696 CsrSorted is_sorted)
697{
698 m_partition = std::move(partition);
699 m_row_begin = m_partition[ParallelDescriptor::MyProc()];
700 m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
701 m_nnz = csr.nnz;
702 m_csr = std::move(csr);
703 if (! is_sorted) { m_csr.sort(); }
704}
705
706template <typename T, template<typename> class Allocator>
707void
708SpMatrix<T,Allocator>::define_doit (int nnz_per_row)
709{
710 if (nnz_per_row <= 0) { return; };
711
712 AMREX_ALWAYS_ASSERT(m_split == false);
713
714 Long nlocalrows = this->numLocalRows();
715 m_nnz = nlocalrows*nnz_per_row;
716 m_csr.mat.resize(m_nnz);
717 m_csr.col_index.resize(m_nnz);
718 m_csr.row_offset.resize(nlocalrows+1);
719 m_csr.nnz = m_nnz;
720
721 auto* poffset = m_csr.row_offset.data();
722 ParallelForOMP(nlocalrows+1, [=] AMREX_GPU_DEVICE (Long lrow) noexcept
723 {
724 poffset[lrow] = lrow*nnz_per_row;
725 });
726}
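// Editor's note (not from the original source): with nnz_per_row = 3 and four
// local rows, define_doit() reserves 12 entries and sets
//   row_offset = {0, 3, 6, 9, 12},
// leaving col_index and mat to be filled afterwards, e.g. by setVal().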
727
728template <typename T, template<typename> class Allocator>
729void
730SpMatrix<T,Allocator>::define (AlgPartition partition, T const* mat,
731 Long const* col_index, Long nentries,
732 Long const* row_offset, CsrSorted is_sorted,
733 CsrValid is_valid)
734{
735 AMREX_ALWAYS_ASSERT(m_split == false);
736
737 m_partition = std::move(partition);
738 m_row_begin = m_partition[ParallelDescriptor::MyProc()];
739 m_row_end = m_partition[ParallelDescriptor::MyProc()+1];
740
741 bool synced = false;
742
743 if (is_valid) {
744 m_nnz = nentries;
745 Long nlocalrows = this->numLocalRows();
746 m_csr.mat.resize(nentries);
747 m_csr.col_index.resize(nentries);
748 m_csr.row_offset.resize(nlocalrows+1);
749 m_csr.nnz = nentries;
750 Gpu::copyAsync(Gpu::deviceToDevice, mat, mat+nentries, m_csr.mat.begin());
751 Gpu::copyAsync(Gpu::deviceToDevice, col_index, col_index+nentries,
752 m_csr.col_index.begin());
753 Gpu::copyAsync(Gpu::deviceToDevice, row_offset, row_offset+nlocalrows+1,
754 m_csr.row_offset.begin());
755 } else {
756 if (nentries < Long(std::numeric_limits<int>::max())) {
757 define_and_filter_doit<int>(mat, col_index, nentries, row_offset);
758 } else {
759 define_and_filter_doit<Long>(mat, col_index, nentries, row_offset);
760 }
761 synced = true;
762 }
763
764 if (! is_sorted) {
765 m_csr.sort();
766 synced = true;
767 }
768
769 if (! synced) {
770 Gpu::streamSynchronize();
771 }
772}
773
774template <typename T, template<typename> class Allocator>
775void
776SpMatrix<T,Allocator>::sortCSR ()
777{
778 m_csr.sort();
779}
780
781template <typename T, template<typename> class Allocator>
782template <typename I>
783void
784SpMatrix<T,Allocator>::define_and_filter_doit (T const* mat, Long const* col_index,
785 Long nentries, Long const* row_offset)
786{
787 Gpu::DeviceVector<I> psum(nentries);
788 auto* ps = psum.data();
789 m_nnz = Scan::PrefixSum<I>(I(nentries),
790 [=] AMREX_GPU_DEVICE (I i) -> I {
791 return col_index[i] >= 0 && mat[i] != 0; },
792 [=] AMREX_GPU_DEVICE (I i, I x) {
793 ps[i] = x; },
794 Scan::Type::exclusive, Scan::retSum);
795 Long nlocalrows = this->numLocalRows();
796 m_csr.mat.resize(m_nnz);
797 m_csr.col_index.resize(m_nnz);
798 m_csr.row_offset.resize(nlocalrows+1);
799 m_csr.nnz = m_nnz;
800 auto* pmat = m_csr.mat.data();
801 auto* pcol = m_csr.col_index.data();
802 auto* prow = m_csr.row_offset.data();
803 auto actual_nnz = m_nnz;
804 ParallelFor(std::max(nentries,nlocalrows), [=] AMREX_GPU_DEVICE (Long i)
805 {
806 if (i < nentries) {
807 if (col_index[i] >= 0 && mat[i] != 0) {
808 pmat[ps[i]] = mat[i];
809 pcol[ps[i]] = col_index[i];
810 }
811 }
812 if (i < nlocalrows) {
813 prow[i] = ps[row_offset[i]];
814 if (i == nlocalrows - 1) {
815 prow[nlocalrows] = actual_nnz;
816 }
817 }
818 });
819 Gpu::streamSynchronize();
820}
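// Editor's example (not from the original source): filtering the raw input
//   mat = {2, 0, -1, 5}, col_index = {0, 3, -1, 2}, row_offset = {0, 2, 4}
// drops the explicit zero and the negative column index, producing
//   mat = {2, 5}, col_index = {0, 2}, row_offset = {0, 1, 2}, m_nnz = 2.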
821
822template <typename T, template<typename> class Allocator>
823void
824SpMatrix<T,Allocator>::printToFile (std::string const& file) const
825{
826#ifdef AMREX_USE_GPU
829
830# ifdef AMREX_USE_MPI
832 if (m_split) {
833 amrex::duplicateCSR(Gpu::deviceToHost, csr_r, m_csr_remote);
834 }
835
836 Gpu::PinnedVector<Long> ri_ltor(m_ri_ltor.size());
837 if (m_split) {
838 Gpu::copyAsync(Gpu::deviceToHost,
839 m_ri_ltor.begin(),
840 m_ri_ltor.end(),
841 ri_ltor.begin());
842 }
843 auto const& remote_cols = m_remote_cols_v;
844# endif
845
847
848#else
849
850 auto const& csr = m_csr;
851# ifdef AMREX_USE_MPI
852 auto const& csr_r = m_csr_remote;
853 auto const& ri_ltor = m_ri_ltor;
854 auto const& remote_cols = m_remote_cols_v;
855# endif
856
857#endif
858
859 Long nnz = m_csr.nnz;
860#ifdef AMREX_USE_MPI
861 nnz += m_csr_remote.nnz;
862#endif
863
864 std::ofstream ofs(file+"."+std::to_string(ParallelDescriptor::MyProc()));
865 ofs << m_row_begin << " " << m_row_end << " " << nnz << "\n";
866 for (Long i = 0, nrows = numLocalRows(); i < nrows; ++i) {
867 Long nnz_row = csr.row_offset[i+1] - csr.row_offset[i];
868 T const* mat = csr.mat.data() + csr.row_offset[i];
869 Long const* col = csr.col_index.data() + csr.row_offset[i];
870 for (Long j = 0; j < nnz_row; ++j) {
871 ofs << i+m_row_begin << " " << col[j]+m_col_begin << " " << mat[j] << "\n";
872 }
873#ifdef AMREX_USE_MPI
874 if (i < Long(ri_ltor.size()) && ri_ltor[i] >= 0) {
875 Long ii = ri_ltor[i];
876 nnz_row = csr_r.row_offset[ii+1] - csr_r.row_offset[ii];
877 mat = csr_r.mat.data() + csr_r.row_offset[ii];
878 col = csr_r.col_index.data() + csr_r.row_offset[ii];
879 for (Long j = 0; j < nnz_row; ++j) {
880 ofs << i+m_row_begin << " " << remote_cols[col[j]] << " " << mat[j] << "\n";
881 }
882 }
883#endif
884 }
885}
886
887template <typename T, template<typename> class Allocator>
888template <typename F>
889void SpMatrix<T,Allocator>::setVal (F const& f, CsrSorted is_sorted)
890{
891 // xxxxx TODO: We can try to optimize this later by using shared memory.
892
893 AMREX_ALWAYS_ASSERT(m_split == false);
894
895 Long nlocalrows = this->numLocalRows();
896 Long rowbegin = this->globalRowBegin();
897 auto* pmat = m_csr.mat.data();
898 auto* pcolindex = m_csr.col_index.data();
899 auto* prowoffset = m_csr.row_offset.data();
900 ParallelForOMP(nlocalrows, [=] AMREX_GPU_DEVICE (int lrow) noexcept
901 {
902 f(rowbegin+lrow, pcolindex+prowoffset[lrow], pmat+prowoffset[lrow]);
903 });
904
905 if (! is_sorted) { m_csr.sort(); }
906}
907
908template <typename T, template<typename> class Allocator>
909AlgVector<T,Allocator<T>> const& SpMatrix<T,Allocator>::diagonalVector () const
910{
911 if (m_diagonal.empty()) {
912 m_diagonal.define(this->partition());
913 auto* AMREX_RESTRICT p = m_diagonal.data();
914 auto const* AMREX_RESTRICT mat = m_csr.mat.data();
915 auto const* AMREX_RESTRICT col = m_csr.col_index.data();
916 auto const* AMREX_RESTRICT row = m_csr.row_offset.data();
917 auto offset = m_split ? Long(0) : m_row_begin; // assuming square matrix
918 Long nrows = this->numLocalRows();
919 ParallelForOMP(nrows, [=] AMREX_GPU_DEVICE (Long i)
920 {
921 T d = 0;
922 for (Long j = row[i]; j < row[i+1]; ++j) {
923 if (i == col[j] - offset) {
924 d = mat[j];
925 break;
926 }
927 }
928 p[i] = d;
929 });
930 }
931 return m_diagonal;
932}
933
934template <typename T, template<typename> class Allocator>
935AlgVector<T,Allocator<T>> SpMatrix<T,Allocator>::rowSum () const
936{
937 AlgVector<T,Allocator<T>> r(this->partition());
938 auto* p = r.data();
939 auto const& a = this->const_parcsr();
940 ParallelForOMP(this->numLocalRows(), [=] AMREX_GPU_DEVICE (Long i)
941 {
942 T s = 0;
943 for (auto idx = a.csr0.row_offset[i];
944 idx < a.csr0.row_offset[i+1]; ++idx) {
945 s += a.csr0.mat[idx];
946 }
947 if (a.csr1.nnz > 0) {
948 for (auto idx = a.csr1.row_offset[i];
949 idx < a.csr1.row_offset[i+1]; ++idx) {
950 s += a.csr1.mat[idx];
951 }
952 }
953 p[i] = s;
954 });
955 return r;
956}
957
958template <typename T, template<typename> class Allocator>
959ParCsr<T> SpMatrix<T,Allocator>::parcsr ()
960{
961 return ParCsr<T>{m_csr.view(),
962#ifdef AMREX_USE_MPI
963 m_csr_remote.view(),
964#else
965 CsrView<T>{},
966#endif
967 m_row_begin,
968 m_col_begin,
969#ifdef AMREX_USE_MPI
970 m_ri_ltor.data(),
971# ifdef AMREX_USE_GPU
972 m_remote_cols_dv.data()
973# else
974 m_remote_cols_v.data()
975# endif
976#else
977 nullptr, nullptr
978#endif
979 };
980}
981
982template <typename T, template<typename> class Allocator>
983ParCsr<T const> SpMatrix<T,Allocator>::const_parcsr () const
984{
985 using U = T const;
986 return ParCsr<U>{m_csr.const_view(),
987#ifdef AMREX_USE_MPI
988 m_csr_remote.const_view(),
989#else
990 CsrView<U>{},
991#endif
992 m_row_begin,
993 m_col_begin,
994#ifdef AMREX_USE_MPI
995 m_ri_ltor.data(),
996# ifdef AMREX_USE_GPU
997 m_remote_cols_dv.data()
998# else
999 m_remote_cols_v.data()
1000# endif
1001#else
1002 nullptr, nullptr
1003#endif
1004 };
1005}
1006
1007template <typename T, template<typename> class Allocator>
1008ParCsr<T const> SpMatrix<T,Allocator>::parcsr () const
1009{
1010 return this->const_parcsr();
1011}
1012
1013template <typename T, template<typename> class Allocator>
1014void SpMatrix<T,Allocator>::startComm_mv (AlgVector<T,Allocator<T>> const& x)
1015{
1016#ifndef AMREX_USE_MPI
1017 amrex::ignore_unused(x);
1018#else
1019 if (this->partition().numActiveProcs() <= 1) { return; }
1020
1021 this->prepare_comm_mv(x.partition());
1022
1023 auto const mpi_tag = ParallelDescriptor::SeqNum();
1024 auto const mpi_t_type = ParallelDescriptor::Mpi_typemap<T>::type(); // NOLINT(readability-qualified-auto)
1025 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1026
1027 auto const nrecvs = int(m_comm_mv.recv_from.size());
1028 if (nrecvs > 0) {
1029 m_comm_mv.recv_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_comm_mv.total_counts_recv);
1030 m_comm_mv.recv_reqs.resize(nrecvs, MPI_REQUEST_NULL);
1031 auto* p_recv = m_comm_mv.recv_buffer;
1032 for (int irecv = 0; irecv < nrecvs; ++irecv) {
1033 BL_MPI_REQUIRE(MPI_Irecv(p_recv,
1034 m_comm_mv.recv_counts[irecv], mpi_t_type,
1035 m_comm_mv.recv_from[irecv], mpi_tag, mpi_comm,
1036 &(m_comm_mv.recv_reqs[irecv])));
1037 p_recv += m_comm_mv.recv_counts[irecv];
1038 }
1039 AMREX_ASSERT(p_recv == m_comm_mv.recv_buffer + m_comm_mv.total_counts_recv);
1040 }
1041
1042 auto const nsends = int(m_comm_mv.send_to.size());
1043 if (nsends > 0) {
1044 m_comm_mv.send_buffer = (T*)The_Comms_Arena()->alloc(sizeof(T)*m_comm_mv.total_counts_send);
1045
1046 pack_buffer_mv(x);
1047 Gpu::streamSynchronize();
1048
1049 m_comm_mv.send_reqs.resize(nsends, MPI_REQUEST_NULL);
1050 auto* p_send = m_comm_mv.send_buffer;
1051 for (int isend = 0; isend < nsends; ++isend) {
1052 auto count = m_comm_mv.send_counts[isend];
1053 BL_MPI_REQUIRE(MPI_Isend(p_send, count, mpi_t_type, m_comm_mv.send_to[isend],
1054 mpi_tag, mpi_comm, &(m_comm_mv.send_reqs[isend])));
1055 p_send += count;
1056 }
1057 AMREX_ASSERT(p_send == m_comm_mv.send_buffer + m_comm_mv.total_counts_send);
1058 }
1059#endif
1060}
1061
1062template <typename T, template<typename> class Allocator>
1063void SpMatrix<T,Allocator>::finishComm_mv (AlgVector<T,Allocator<T>>& y)
1064{
1065 if (this->numLocalRows() == 0) { return; }
1066
1067#ifndef AMREX_USE_MPI
1068 amrex::ignore_unused(y);
1069#else
1070 if (this->partition().numActiveProcs() <= 1) { return; }
1071
1072 if ( ! m_comm_mv.recv_reqs.empty()) {
1073 Vector<MPI_Status> mpi_statuses(m_comm_mv.recv_reqs.size());
1074 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_mv.recv_reqs.size()),
1075 m_comm_mv.recv_reqs.data(),
1076 mpi_statuses.data()));
1077 }
1078
1079 unpack_buffer_mv(y);
1080
1081 if ( ! m_comm_mv.send_reqs.empty()) {
1082 Vector<MPI_Status> mpi_statuses(m_comm_mv.send_reqs.size());
1083 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_mv.send_reqs.size()),
1084 m_comm_mv.send_reqs.data(),
1085 mpi_statuses.data()));
1086 }
1087
1088 Gpu::streamSynchronize();
1089 The_Comms_Arena()->free(m_comm_mv.send_buffer);
1090 The_Comms_Arena()->free(m_comm_mv.recv_buffer);
1091 m_comm_mv.send_reqs.clear();
1092 m_comm_mv.recv_reqs.clear();
1093#endif
1094}
1095
1096template <typename T, template<typename> class Allocator>
1097void SpMatrix<T,Allocator>::startComm_tr (AlgPartition const& col_partition)
1098{
1099#ifdef AMREX_USE_MPI
1100 if (this->partition().numActiveProcs() <= 1) { return; }
1101
1102 this->split_csr(col_partition);
1103
1104 int const nprocs = ParallelContext::NProcsSub();
1105 auto const mpi_tag = ParallelDescriptor::SeqNum();
1106 auto const mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type(); // NOLINT(readability-qualified-auto)
1107 auto const mpi_t = ParallelDescriptor::Mpi_typemap<T>::type(); // NOLINT(readability-qualified-auto)
1108 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1109
1110 // transpose the off-diagonal part
1111 if (m_csr_remote.nnz > 0) {
1112 m_comm_tr.csrt.nnz = m_csr_remote.nnz;
1113 m_comm_tr.csrt.nrows = m_remote_cols_v.size();
1114 m_comm_tr.csrt.mat = (T*)The_Comms_Arena()->alloc
1115 (sizeof(T)*m_comm_tr.csrt.nnz);
1116 m_comm_tr.csrt.col_index = (Long*)The_Comms_Arena()->alloc
1117 (sizeof(Long)*m_comm_tr.csrt.nnz);
1118 m_comm_tr.csrt.row_offset = (Long*)The_Comms_Arena()->alloc
1119 (sizeof(Long)*(m_comm_tr.csrt.nrows+1));
1120#ifdef AMREX_USE_GPU
1121 csr_type csr_comm;
1122 csr_comm.resize(m_comm_tr.csrt.nrows, m_comm_tr.csrt.nnz);
1123 auto const& csrv_comm = csr_comm.view();
1124#else
1125 auto const& csrv_comm = m_comm_tr.csrt;
1126#endif
1127 detail::transpose(csrv_comm, m_csr_remote.const_view());
1128 auto row_begin = m_row_begin;
1129 auto ri_rtol = m_ri_rtol.data();
1130 auto* col_index = csrv_comm.col_index;
1131 ParallelForOMP(csrv_comm.nnz, [=] AMREX_GPU_DEVICE (Long idx)
1132 {
1133 auto gjt =ri_rtol[col_index[idx]] + row_begin;
1134 col_index[idx] = gjt; // global index
1135 });
1136#ifdef AMREX_USE_GPU
1137 Gpu::copyAsync(Gpu::deviceToHost,
1138 csrv_comm. mat,
1139 csrv_comm. mat + csrv_comm.nnz,
1140 m_comm_tr.csrt.mat);
1141 Gpu::copyAsync(Gpu::deviceToHost,
1142 csrv_comm. col_index,
1143 csrv_comm. col_index + csrv_comm.nnz,
1144 m_comm_tr.csrt.col_index);
1145 Gpu::copyAsync(Gpu::deviceToHost,
1146 csrv_comm. row_offset,
1147 csrv_comm. row_offset + csrv_comm.nrows+1,
1148 m_comm_tr.csrt.row_offset);
1149 Gpu::streamSynchronize();
1150#endif
1151 }
1152
1153 if (m_num_neighbors < 0) { set_num_neighbors(); }
1154
1155 // As a sender, I need to let other processes know how many
1156 // elements I will send them.
1157
1158 Vector<MPI_Request> mpi_requests;
1159 mpi_requests.reserve(nprocs);
1160 if (m_csr_remote.nnz > 0) {
1161 Long it = 0;
1162 for (int iproc = 0; iproc < nprocs; ++iproc) {
1163 Long n = 0;
1164 for (Long i = 0; i < Long(m_remote_cols_vv[iproc].size()); ++i) {
1165 n += m_comm_tr.csrt.row_offset[it+1] - m_comm_tr.csrt.row_offset[it];
1166 ++it;
1167 }
1168 if (n > 0) {
1169 mpi_requests.push_back(MPI_REQUEST_NULL);
1170 AMREX_ALWAYS_ASSERT(n < std::numeric_limits<int>::max());
1171 std::array<int,2> nn{int(n), int(m_remote_cols_vv[iproc].size())};
1172 BL_MPI_REQUIRE(MPI_Isend(nn.data(), 2, MPI_INT, iproc, mpi_tag,
1173 mpi_comm, &(mpi_requests.back())));
1174 m_comm_tr.send_to.push_back(iproc);
1175 m_comm_tr.send_counts.push_back(nn);
1176 }
1177 }
1178 }
1179
1180 // As a receiver, m_num_neighbors is the number of processes from which
1181 // I will receive data.
1182
1183 for (int irecv = 0; irecv < m_num_neighbors; ++irecv) {
1184 MPI_Status mpi_status;
1185 BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
1186 int sender = mpi_status.MPI_SOURCE;
1187 std::array<int,2> nn;
1188 BL_MPI_REQUIRE(MPI_Recv(nn.data(), 2, MPI_INT, sender, mpi_tag,
1189 mpi_comm, &mpi_status));
1190 m_comm_tr.recv_from.push_back(sender);
1191 m_comm_tr.recv_counts.push_back(nn);
1192 m_comm_tr.total_counts_recv[0] += nn[0];
1193 m_comm_tr.total_counts_recv[1] += nn[1];
1194 }
1195
1196 if (! mpi_requests.empty()) {
1197 Vector<MPI_Status> mpi_statuses(mpi_requests.size());
1198 BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
1199 mpi_statuses.data()));
1200 }
1201
1202 auto const mpi_tag_m = ParallelDescriptor::SeqNum();
1203 auto const mpi_tag_c = ParallelDescriptor::SeqNum();
1204 auto const mpi_tag_r = ParallelDescriptor::SeqNum();
1205 auto const mpi_tag_p = ParallelDescriptor::SeqNum();
1206
1207 // We need to send m_comm_tr.csrt.mat, col_index & row_offset. We also
1208 // need to send m_remote_cols_vv, which maps row index (in transposed
1209 // matrix) from local to global.
1210
1211 auto const nrecvs = int(m_comm_tr.recv_from.size());
1212 if (nrecvs > 0) {
1213 m_comm_tr.recv_buffer_mat = (T*) The_Pinned_Arena()->alloc
1214 (sizeof(T) * m_comm_tr.total_counts_recv[0]);
1215 m_comm_tr.recv_buffer_col_index = (Long*) The_Pinned_Arena()->alloc
1216 (sizeof(Long) * m_comm_tr.total_counts_recv[0]);
1217 m_comm_tr.recv_buffer_row_offset = (Long*) The_Pinned_Arena()->alloc
1218 (sizeof(Long) * (m_comm_tr.total_counts_recv[1]+nrecvs));
1219 m_comm_tr.recv_buffer_idx_map = (Long*) The_Pinned_Arena()->alloc
1220 (sizeof(Long) * m_comm_tr.total_counts_recv[1]);
1221 m_comm_tr.recv_buffer_offset.push_back({0,0,0,0});
1222 m_comm_tr.recv_reqs.resize(4*nrecvs, MPI_REQUEST_NULL);
1223 for (int irecv = 0; irecv < nrecvs; ++irecv) {
1224 auto [os0, os1, os2, os3] = m_comm_tr.recv_buffer_offset.back();
1225 auto [n0, n1] = m_comm_tr.recv_counts[irecv];
1226 auto recv_from_rank = m_comm_tr.recv_from[irecv];
1227 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_mat + os0,
1228 n0,
1229 mpi_t,
1230 recv_from_rank,
1231 mpi_tag_m,
1232 mpi_comm,
1233 &(m_comm_tr.recv_reqs[irecv*4])));
1234 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_col_index + os1,
1235 n0,
1236 mpi_long,
1237 recv_from_rank,
1238 mpi_tag_c,
1239 mpi_comm,
1240 &(m_comm_tr.recv_reqs[irecv*4+1])));
1241 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_row_offset + os2,
1242 n1+1,
1243 mpi_long,
1244 recv_from_rank,
1245 mpi_tag_r,
1246 mpi_comm,
1247 &(m_comm_tr.recv_reqs[irecv*4+2])));
1248 BL_MPI_REQUIRE(MPI_Irecv(m_comm_tr.recv_buffer_idx_map + os3,
1249 n1,
1250 mpi_long,
1251 recv_from_rank,
1252 mpi_tag_p,
1253 mpi_comm,
1254 &(m_comm_tr.recv_reqs[irecv*4+3])));
1255 m_comm_tr.recv_buffer_offset.push_back({os0 + n0,
1256 os1 + n0,
1257 os2 + n1+1,
1258 os3 + n1});
1259 }
1260 }
1261
1262 auto const nsends = int(m_comm_tr.send_to.size());
1263 if (nsends > 0) {
1264 m_comm_tr.send_reqs.resize(4*nsends, MPI_REQUEST_NULL);
1265 Long os0 = 0, os1 = 0;
1266 for (int isend = 0; isend < nsends; ++isend) {
1267 auto [n0, n1] = m_comm_tr.send_counts[isend];
1268 auto send_to_rank = m_comm_tr.send_to[isend];
1269 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.mat + os0,
1270 n0,
1271 mpi_t,
1272 send_to_rank,
1273 mpi_tag_m,
1274 mpi_comm,
1275 &(m_comm_tr.send_reqs[isend*4])));
1276 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.col_index + os0,
1277 n0,
1278 mpi_long,
1279 send_to_rank,
1280 mpi_tag_c,
1281 mpi_comm,
1282 &(m_comm_tr.send_reqs[isend*4+1])));
1283 BL_MPI_REQUIRE(MPI_Isend(m_comm_tr.csrt.row_offset + os1,
1284 n1+1,
1285 mpi_long,
1286 send_to_rank,
1287 mpi_tag_r,
1288 mpi_comm,
1289 &(m_comm_tr.send_reqs[isend*4+2])));
1290 BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[send_to_rank].data(),
1291 n1,
1292 mpi_long,
1293 send_to_rank,
1294 mpi_tag_p,
1295 mpi_comm,
1296 &(m_comm_tr.send_reqs[isend*4+3])));
1297 os0 += n0;
1298 os1 += n1;
1299 }
1300 }
1301#else
1302 amrex::ignore_unused(col_partition);
1303#endif
1304}
1305
1306template <typename T, template<typename> class Allocator>
1307void SpMatrix<T,Allocator>::finishComm_tr (SpMatrix<T,Allocator>& AT)
1308{
1309#ifdef AMREX_USE_MPI
1310 if (this->partition().numActiveProcs() <= 1) { return; }
1311
1312 if (! m_comm_tr.recv_reqs.empty()) {
1313 Vector<MPI_Status> mpi_statuses(m_comm_tr.recv_reqs.size());
1314 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_tr.recv_reqs.size()),
1315 m_comm_tr.recv_reqs.data(),
1316 mpi_statuses.data()));
1317 }
1318
1319 AT.unpack_buffer_tr(m_comm_tr, this->m_partition);
1320
1321 if (! m_comm_tr.send_reqs.empty()) {
1322 Vector<MPI_Status> mpi_statuses(m_comm_tr.send_reqs.size());
1323 BL_MPI_REQUIRE(MPI_Waitall(int(m_comm_tr.send_reqs.size()),
1324 m_comm_tr.send_reqs.data(),
1325 mpi_statuses.data()));
1326 }
1327
1328 if (m_comm_tr.csrt.nnz > 0) {
1329 The_Comms_Arena()->free(m_comm_tr.csrt.mat);
1330 The_Comms_Arena()->free(m_comm_tr.csrt.col_index);
1331 The_Comms_Arena()->free(m_comm_tr.csrt.row_offset);
1332 }
1333 if (m_comm_tr.recv_buffer_mat) {
1334 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_mat);
1335 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_col_index);
1336 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_row_offset);
1337 The_Pinned_Arena()->free(m_comm_tr.recv_buffer_idx_map);
1338 }
1339 m_comm_tr = CommTR{};
1340#else
1342#endif
1343}
1344
1345#ifdef AMREX_USE_MPI
1346
1347template <typename T, template<typename> class Allocator>
1347void
1348SpMatrix<T,Allocator>::split_csr (AlgPartition const& col_partition)
1349{
1350 if (m_split) {
1351 AMREX_ALWAYS_ASSERT
1352 (m_col_begin == col_partition[ParallelDescriptor::MyProc()] &&
1353 m_col_end == col_partition[ParallelDescriptor::MyProc()+1]);
1354 return;
1355 }
1356
1357 AMREX_ALWAYS_ASSERT(m_col_partition.empty());
1358
1359 m_col_partition = col_partition;
1360 m_col_begin = col_partition[ParallelDescriptor::MyProc()];
1361 m_col_end = col_partition[ParallelDescriptor::MyProc()+1];
1362
1363 // This function needs to be safe when nnz is zero.
1364
1365 // We need to split the matrix into two parts, a diagonal part for pure
1366 // local operations and another part for remote operations in
1367 // matrix-vector or matrix-matrix multiplication.
1368
1369 Long local_nnz;
1370 Gpu::DeviceVector<Long> pfsum(m_nnz);
1371 auto* p_pfsum = pfsum.data();
1372 auto col_begin = m_col_begin;
1373 auto col_end = m_col_end;
1374 if (m_csr.nnz < Long(std::numeric_limits<int>::max())) {
1375 auto const* pcol = m_csr.col_index.data();
1376 local_nnz = Scan::PrefixSum<int>(int(m_nnz),
1377 [=] AMREX_GPU_DEVICE (int i) -> int {
1378 return (pcol[i] >= col_begin &&
1379 pcol[i] < col_end); },
1380 [=] AMREX_GPU_DEVICE (int i, int const& x) {
1381 p_pfsum[i] = x; },
1382 Scan::Type::exclusive, Scan::retSum);
1383 } else {
1384 auto const* pcol = m_csr.col_index.data();
1385 local_nnz = Scan::PrefixSum<Long>(m_nnz,
1386 [=] AMREX_GPU_DEVICE (Long i) -> Long {
1387 return (pcol[i] >= col_begin &&
1388 pcol[i] < col_end); },
1389 [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
1390 p_pfsum[i] = x; },
1391 Scan::Type::exclusive, Scan::retSum);
1392 }
1393
1394 m_csr.nnz = local_nnz;
1395 Long remote_nnz = m_nnz - local_nnz;
1396 m_csr_remote.nnz = remote_nnz;
1397
1398 if (local_nnz != m_nnz) {
1399 m_csr_remote.mat.resize(remote_nnz);
1400 m_csr_remote.col_index.resize(remote_nnz);
1401 container_type<T> new_mat(local_nnz);
1402 container_type<Long> new_col(local_nnz);
1403 auto const* pmat = m_csr.mat.data();
1404 auto const* pcol = m_csr.col_index.data();
1405 auto* pmat_l = new_mat.data();
1406 auto* pcol_l = new_col.data();
1407 auto* pmat_r = m_csr_remote.mat.data();
1408 auto* pcol_r = m_csr_remote.col_index.data();
1409 ParallelForOMP(m_nnz, [=] AMREX_GPU_DEVICE (Long i)
1410 {
1411 auto ps = p_pfsum[i];
1412 auto local = (pcol[i] >= col_begin &&
1413 pcol[i] < col_end);
1414 if (local) {
1415 pmat_l[ps] = pmat[i];
1416 pcol_l[ps] = pcol[i] - col_begin; // shift the column index to local
1417 } else {
1418 pmat_r[i-ps] = pmat[i];
1419 pcol_r[i-ps] = pcol[i];
1420 }
1421 });
1422 auto noffset = Long(m_csr.row_offset.size());
1423 auto* pro = m_csr.row_offset.data();
1424 m_csr_remote.row_offset.resize(noffset);
1425 auto* pro_r = m_csr_remote.row_offset.data();
1426 ParallelForOMP(noffset, [=] AMREX_GPU_DEVICE (Long i)
1427 {
1428 if (i < noffset-1) {
1429 auto ro_l = p_pfsum[pro[i]];
1430 pro_r[i] = pro[i] - ro_l;
1431 pro[i] = ro_l;
1432 } else {
1433 pro[i] = local_nnz;
1434 pro_r[i] = remote_nnz;
1435 }
1436 });
1438 m_csr.mat.swap(new_mat);
1439 m_csr.col_index.swap(new_col);
1440
1441 // In the remote part, it's expected that some rows don't have
1442 // nonzeros. So we trim them off.
1443 {
1444 Long old_size = m_csr_remote.row_offset.size();
1445 m_ri_ltor.resize(old_size-1);
1446 m_ri_rtol.resize(old_size-1);
1447 auto* p_ltor = m_ri_ltor.data();
1448 auto* p_rtol = m_ri_rtol.data();
1449 container_type<Long> trimmed_row_offset(old_size);
1450 auto const* p_ro = m_csr_remote.row_offset.data();
1451 auto* p_tro = trimmed_row_offset.data();
1452 Long new_size;
1453 if (old_size < Long(std::numeric_limits<int>::max())) {
1454 // This is basically std::unique.
1455 new_size = Scan::PrefixSum<int>(int(old_size),
1456 [=] AMREX_GPU_DEVICE (int i) -> int {
1457 if (i+1 < old_size) {
1458 return (p_ro[i+1] > p_ro[i]);
1459 } else {
1460 return 1;
1461 }
1462 },
1463 [=] AMREX_GPU_DEVICE (int i, int const& x) {
1464 if (i == 0) {
1465 p_tro[0] = 0;
1466 } else if (p_ro[i] > p_ro[i-1]) {
1467 p_tro[x] = p_ro[i];
1468 }
1469 if (i+1 < old_size) {
1470 if (p_ro[i+1] > p_ro[i]) {
1471 p_rtol[x] = i;
1472 p_ltor[i] = x;
1473 } else {
1474 p_ltor[i] = -1;
1475 }
1476 }
1477 },
1478 Scan::Type::exclusive, Scan::retSum);
1479 } else {
1480 // This is basically std::unique.
1481 new_size = Scan::PrefixSum<Long>(old_size,
1482 [=] AMREX_GPU_DEVICE (Long i) -> Long {
1483 if (i+1 < old_size) {
1484 return (p_ro[i+1] > p_ro[i]);
1485 } else {
1486 return 1;
1487 }
1488 },
1489 [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
1490 if (i == 0) {
1491 p_tro[0] = 0;
1492 } else if (p_ro[i] > p_ro[i-1]) {
1493 p_tro[x] = p_ro[i];
1494 }
1495 if (i+1 < old_size) {
1496 if (p_ro[i+1] > p_ro[i]) {
1497 p_rtol[x] = i;
1498 p_ltor[i] = x;
1499 } else {
1500 p_ltor[i] = -1;
1501 }
1502 }
1503 },
1504 Scan::Type::exclusive, Scan::retSum);
1505 }
1506
1507 m_ri_rtol.resize(new_size-1);
1508 trimmed_row_offset.resize(new_size);
1509#ifdef AMREX_USE_GPU
1510 m_ri_rtol.shrink_to_fit();
1511 trimmed_row_offset.shrink_to_fit();
1512#endif
1513 m_csr_remote.row_offset.swap(trimmed_row_offset);
1514 }
1515
1516 } else if (col_begin > 0) {
1517 auto* pcol = m_csr.col_index.data();
1518 ParallelForOMP(m_nnz, [=] AMREX_GPU_DEVICE (Long i) { pcol[i] -= col_begin; });
1519 }
1520
1521 update_remote_col_index(m_csr_remote, true);
1522
1523 m_split = true;
1524}
1525
1526template <typename T, template<typename> class Allocator>
1527template <typename C>
1528void SpMatrix<T,Allocator>::update_remote_col_index (C& csrr, bool in_device_memory)
1529{
1530 int const nprocs = ParallelContext::NProcsSub();
1531
1532 // This function also needs to update m_remote_cols_*.
1533
1534 m_remote_cols_v.clear();
1535 m_remote_cols_vv.clear();
1536 m_remote_cols_vv.resize(nprocs);
1537#ifdef AMREX_USE_GPU
1538 m_remote_cols_dv.clear();
1539#endif
1540
1541 if (csrr.nnz == 0) { return; }
1542
1543 amrex::ignore_unused(in_device_memory);
1544
1545#ifdef AMREX_USE_GPU
1546 if (in_device_memory) {
1547 m_remote_cols_v.resize(csrr.nnz);
1548 Gpu::copyAsync(Gpu::deviceToHost,
1549 csrr.col_index.begin(),
1550 csrr.col_index.end(),
1551 m_remote_cols_v.begin());
1552 Gpu::streamSynchronize();
1553 } else
1554#endif
1555 {
1556 m_remote_cols_v.assign(csrr.col_index.begin(),
1557 csrr.col_index.end());
1558 }
1559
1560 amrex::RemoveDuplicates(m_remote_cols_v);
1561
1562#ifdef AMREX_USE_GPU
1563 m_remote_cols_dv.resize(m_remote_cols_v.size());
1564 Gpu::copyAsync(Gpu::hostToDevice,
1565 m_remote_cols_v.begin(),
1566 m_remote_cols_v.end(),
1567 m_remote_cols_dv.data());
1568#endif
1569
1570 // Note that amrex::RemoveDuplicates sorts the data.
1571 auto const& cp = this->m_col_partition.dataVector();
1572 AMREX_ALWAYS_ASSERT(m_remote_cols_v.front() >= cp.front() &&
1573 m_remote_cols_v.back() < cp.back());
1574 auto it = cp.cbegin();
1575 for (auto c : m_remote_cols_v) {
1576 it = std::find_if(it, cp.cend(), [&] (auto x) { return x > c; });
1577 if (it != cp.cend()) {
1578 int iproc = int(std::distance(cp.cbegin(),it)) - 1;
1579 m_remote_cols_vv[iproc].push_back(c);
1580 } else {
1581 amrex::Abort("SpMatrix::update_remote_col_index: how did this happen?");
1582 }
1583 }
1584
1585 // Now we convert the remote indices from global to local.
1586 std::map<Long,Long> gtol;
1587 for (Long i = 0, N = Long(m_remote_cols_v.size()); i < N; ++i) {
1588 gtol[m_remote_cols_v[i]] = i;
1589 }
1590
1591#ifdef AMREX_USE_GPU
1592 if (in_device_memory) {
1593 Gpu::PinnedVector<Long> host_col_index(csrr.nnz);
1594 Gpu::copyAsync(Gpu::deviceToHost,
1595 csrr.col_index.begin(),
1596 csrr.col_index.end(),
1597 host_col_index.begin());
1598 Gpu::streamSynchronize();
1599 for (auto& c : host_col_index) {
1600 c = gtol[c];
1601 }
1602 Gpu::copyAsync(Gpu::hostToDevice,
1603 host_col_index.begin(),
1604 host_col_index.end(),
1605 csrr.col_index.begin());
1606 Gpu::streamSynchronize();
1607 } else
1608#endif
1609 {
1610 for (auto& c : csrr.col_index) {
1611 c = gtol[c];
1612 }
1613 }
1614}
1615
1616template <typename T, template<typename> class Allocator>
1617void SpMatrix<T,Allocator>::set_num_neighbors ()
1618{
1619 if (m_num_neighbors >= 0) { return; }
1620
1621 int const nprocs = ParallelContext::NProcsSub();
1622 auto const mpi_int = ParallelDescriptor::Mpi_typemap<int>::type(); // NOLINT(readability-qualified-auto)
1623 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1624
1625 amrex::Vector<int> connection(nprocs);
1626 for (int iproc = 0; iproc < nprocs; ++iproc) {
1627 connection[iproc] = m_remote_cols_vv[iproc].empty() ? 0 : 1;
1628 }
1629 amrex::Vector<int> reduce_scatter_counts(nprocs,1);
1630 m_num_neighbors = 0;
1631 BL_MPI_REQUIRE(MPI_Reduce_scatter
1632 (connection.data(), &m_num_neighbors, reduce_scatter_counts.data(),
1633 mpi_int, MPI_SUM, mpi_comm));
1634}
1635
1636template <typename T, template<typename> class Allocator>
1637void SpMatrix<T,Allocator>::prepare_comm_mv (AlgPartition const& col_partition)
1638{
1639 if (m_comm_mv.prepared) { return; }
1640
1641 // This function needs to be safe when nnz is zero.
1642
1643 this->split_csr(col_partition);
1644
1645 int const nprocs = ParallelContext::NProcsSub();
1646 auto const mpi_tag = ParallelDescriptor::SeqNum();
1647 auto const mpi_long = ParallelDescriptor::Mpi_typemap<Long>::type(); // NOLINT(readability-qualified-auto)
1648 auto const mpi_comm = ParallelContext::CommunicatorSub(); // NOLINT(readability-qualified-auto)
1649
1650 if (m_num_neighbors < 0) { set_num_neighbors(); }
1651
1652 Vector<MPI_Request> mpi_requests;
1653 mpi_requests.reserve(nprocs);
1654 for (int iproc = 0; iproc < nprocs; ++iproc) {
1655 if ( ! m_remote_cols_vv[iproc].empty()) {
1656 mpi_requests.push_back(MPI_REQUEST_NULL);
1657 // I need to let other processes know what I need from them.
1658 BL_MPI_REQUIRE(MPI_Isend(m_remote_cols_vv[iproc].data(),
1659 int(m_remote_cols_vv[iproc].size()),
1660 mpi_long, iproc, mpi_tag, mpi_comm,
1661 &(mpi_requests.back())));
1662 m_comm_mv.recv_from.push_back(iproc);
1663 m_comm_mv.recv_counts.push_back(int(m_remote_cols_vv[iproc].size()));
1664 }
1665 }
1666
1667 m_comm_mv.total_counts_recv = Long(m_remote_cols_v.size());
1668
1669 Vector<Vector<Long>> send_indices(m_num_neighbors);
1670 m_comm_mv.total_counts_send = 0;
1671 for (int isend = 0; isend < m_num_neighbors; ++isend) {
1672 MPI_Status mpi_status;
1673 BL_MPI_REQUIRE(MPI_Probe(MPI_ANY_SOURCE, mpi_tag, mpi_comm, &mpi_status));
1674 int receiver = mpi_status.MPI_SOURCE;
1675 int count;
1676 BL_MPI_REQUIRE(MPI_Get_count(&mpi_status, mpi_long, &count));
1677 m_comm_mv.send_to.push_back(receiver);
1678 m_comm_mv.send_counts.push_back(count);
1679 send_indices[isend].resize(count);
1680 BL_MPI_REQUIRE(MPI_Recv(send_indices[isend].data(), count, mpi_long,
1681 receiver, mpi_tag, mpi_comm, &mpi_status));
1682 m_comm_mv.total_counts_send += count;
1683 }
1684
1685 m_comm_mv.send_indices.resize(m_comm_mv.total_counts_send);
1686 Gpu::PinnedVector<Long> send_indices_all;
1687 send_indices_all.reserve(m_comm_mv.total_counts_send);
1688 for (auto const& vl : send_indices) {
1689 for (auto x : vl) {
1690 send_indices_all.push_back(x);
1691 }
1692 }
1693 Gpu::copyAsync(Gpu::hostToDevice, send_indices_all.begin(), send_indices_all.end(),
1694 m_comm_mv.send_indices.begin());
1695 Gpu::streamSynchronize();
1696
1697 if (! mpi_requests.empty()) {
1698 Vector<MPI_Status> mpi_statuses(mpi_requests.size());
1699 BL_MPI_REQUIRE(MPI_Waitall(int(mpi_requests.size()), mpi_requests.data(),
1700 mpi_statuses.data()));
1701 }
1702
1703 m_comm_mv.prepared = true;
1704}
1705
1706template <typename T, template<typename> class Allocator>
1707void SpMatrix<T,Allocator>::pack_buffer_mv (AlgVector<T,Allocator<T>> const& v)
1708{
1709 auto* pdst = m_comm_mv.send_buffer;
1710 auto* pidx = m_comm_mv.send_indices.data();
1711 auto const& vv = v.view();
1712 auto const nsends = Long(m_comm_mv.send_indices.size());
1713 ParallelForOMP(nsends, [=] AMREX_GPU_DEVICE (Long i)
1714 {
1715 pdst[i] = vv(pidx[i]);
1716 });
1717}
1718
1719template <typename T, template<typename> class Allocator>
1720void SpMatrix<T,Allocator>::unpack_buffer_mv (AlgVector<T,Allocator<T>>& v)
1721{
1722 auto const& csr = m_csr_remote;
1723 if (csr.nnz > 0) {
1724 T const* AMREX_RESTRICT mat = csr.mat.data();
1725 auto const* AMREX_RESTRICT col = csr.col_index.data();
1726 auto const* AMREX_RESTRICT row = csr.row_offset.data();
1727
1728 auto const* rtol = m_ri_rtol.data();
1729
1730 auto const* AMREX_RESTRICT px = m_comm_mv.recv_buffer;
1731 auto * AMREX_RESTRICT py = v.data();
1732
1733 auto const nrr = Long(csr.row_offset.size())-1;
1734 ParallelForOMP(nrr, [=] AMREX_GPU_DEVICE (Long i)
1735 {
1736 T r = 0;
1737 for (Long j = row[i]; j < row[i+1]; ++j) {
1738 r += mat[j] * px[col[j]];
1739 }
1740 py[rtol[i]] += r;
1741 });
1742 }
1743}
1744
1745template <typename T, template<typename> class Allocator>
1746void SpMatrix<T,Allocator>::unpack_buffer_tr (CommTR const& ctr,
1747 AlgPartition const& col_partition)
1748{
1749 m_split = true;
1750 m_col_partition = col_partition;
1751 m_col_begin = m_col_partition[ParallelDescriptor::MyProc() ];
1752 m_col_end = m_col_partition[ParallelDescriptor::MyProc()+1];
1753
1754 m_ri_ltor.resize(m_csr.nrows(), -1);
1755 m_remote_cols_vv.resize(ParallelDescriptor::NProcs());
1756
1757 auto nnz = ctr.total_counts_recv[0];
1758 if (nnz == 0) { return; }
1759
1760 m_nnz += nnz;
1761 auto nb = int(ctr.recv_from.size()); // # of blocked CSRs to be merged
1762 auto total_local_rows = ctr.total_counts_recv[1];
1763
1764 // Build compressed row index map
1765 Vector<Long> ri_map(ctr.recv_buffer_idx_map,
1766 ctr.recv_buffer_idx_map + total_local_rows);
1767 RemoveDuplicates(ri_map);
1768 Long nrows = ri_map.size(); // # of unique rows.
1769
1770#ifdef AMREX_USE_GPU
1772#else
1773 auto& csrr = m_csr_remote;
1774#endif
1775 csrr.mat.resize(nnz);
1776 csrr.col_index.resize(nnz);
1777 csrr.row_offset.resize(nrows+1);
1778 csrr.nnz = nnz;
1779
1780 // Count nnz per compressed row
1781 Vector<int> row_nnz(nrows, 0);
1782 for (int i = 0; i < nb; ++i) {
1783 auto nrow_i = ctr.recv_counts[i][1];
1784 Long const* row_offset = ctr.recv_buffer_row_offset
1785 + ctr.recv_buffer_offset[i][2];
1786 Long const* idx_map = ctr.recv_buffer_idx_map
1787 + ctr.recv_buffer_offset[i][3];
1788 AMREX_ASSERT((row_offset[nrow_i] - row_offset[0]) == ctr.recv_counts[i][0]);
1789
1790 Long p = 0; // index into ri_map
1791 for (int lr = 0; lr < nrow_i; ++lr) {
1792 Long const gr = idx_map[lr];
1793 while (p < nrows && ri_map[p] < gr) { ++p; }
1794 AMREX_ASSERT(p < nrows && ri_map[p] == gr);
1795 // p is now compressed row index
1796 row_nnz[p] += int(row_offset[lr+1] - row_offset[lr]);
1797 }
1798 }
1799
1800 csrr.row_offset[0] = 0;
1801 std::partial_sum(row_nnz.begin(), row_nnz.end(), csrr.row_offset.begin()+1);
1802 AMREX_ASSERT(csrr.nnz == csrr.row_offset.back());
1803
1804 auto rowpos = csrr.row_offset; // make a copy to keep track of offset
1805
1806 for (int i = 0; i < nb; ++i) {
1807 auto nrow_i = ctr.recv_counts[i][1];
1808 T const* mat = ctr.recv_buffer_mat
1809 + ctr.recv_buffer_offset[i][0];
1810 Long const* col_index = ctr.recv_buffer_col_index
1811 + ctr.recv_buffer_offset[i][1];
1812 Long const* row_offset = ctr.recv_buffer_row_offset
1813 + ctr.recv_buffer_offset[i][2];
1814 Long const* idx_map = ctr.recv_buffer_idx_map
1815 + ctr.recv_buffer_offset[i][3];
1816
1817 Long p = 0; // index into ri_map
1818 for (int lr = 0; lr < nrow_i; ++lr) {
1819 Long const gr = idx_map[lr];
1820 while (p < nrows && ri_map[p] < gr) { ++p; }
1821 AMREX_ASSERT(p < nrows && ri_map[p] == gr);
1822 // p is now compressed row index
1823 auto os_src = row_offset[lr] - row_offset[0];
1824 auto nvals = row_offset[lr+1] - row_offset[lr];
1825 auto os_dst = rowpos[p];
1826 std::memcpy(csrr. mat.data()+os_dst, mat+os_src,
1827 sizeof(T) *nvals);
1828 std::memcpy(csrr.col_index.data()+os_dst, col_index+os_src,
1829 sizeof(Long)*nvals);
1830
1831 rowpos[p] += nvals;
1832 }
1833 }
1834
1835 m_ri_rtol.resize(nrows);
1836 Gpu::copyAsync(Gpu::hostToDevice, ri_map.begin(), ri_map.end(), m_ri_rtol.begin());
1837 {
1838 auto row_begin = m_row_begin;
1839 auto* AMREX_RESTRICT ltor = m_ri_ltor.data();
1840 auto* AMREX_RESTRICT rtol = m_ri_rtol.data();
1841 ParallelForOMP(nrows, [=] AMREX_GPU_DEVICE (Long i) {
1842 rtol[i] -= row_begin;
1843 ltor[rtol[i]] = i;
1844 });
1845 }
1846
1847 // The column index in csrr is still global.
1848 update_remote_col_index(csrr, false);
1849
1850#ifdef AMREX_USE_GPU
1851 amrex::duplicateCSR(Gpu::hostToDevice, m_csr_remote, csrr);
1853#endif
1854}
1855
1856#endif
1857
1858}
1859
1860#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_RESTRICT
Definition AMReX_Extension.H:32
#define AMREX_CUSPARSE_SAFE_CALL(call)
Definition AMReX_GpuError.H:101
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:151
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
Array4< int const > offset
Definition AMReX_HypreMLABecLap.cpp:1089
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1090
Definition AMReX_AlgPartition.H:14
Long numGlobalRows() const
Definition AMReX_AlgPartition.H:28
Definition AMReX_AlgVector.H:20
Long numLocalRows() const
Definition AMReX_AlgVector.H:46
T const * data() const
Definition AMReX_AlgVector.H:56
void define(Long global_size)
Definition AMReX_AlgVector.H:228
Table1D< T const, Long > view() const
Definition AMReX_AlgVector.H:65
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
Dynamically allocated vector for trivially copyable data.
Definition AMReX_PODVector.H:308
void reserve(size_type a_capacity, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:811
size_type size() const noexcept
Definition AMReX_PODVector.H:648
void shrink_to_fit()
Definition AMReX_PODVector.H:818
iterator begin() noexcept
Definition AMReX_PODVector.H:674
void resize(size_type a_new_size, GrowthStrategy strategy=GrowthStrategy::Poisson)
Definition AMReX_PODVector.H:728
iterator end() noexcept
Definition AMReX_PODVector.H:678
T * data() noexcept
Definition AMReX_PODVector.H:666
void push_back(const T &a_value)
Definition AMReX_PODVector.H:629
Definition AMReX_SpMatrix.H:52
void finishComm_tr(SpMatrix< T, Allocator > &AT)
Definition AMReX_SpMatrix.H:1307
void split_csr(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1348
Long globalRowBegin() const
Inclusive global index begin.
Definition AMReX_SpMatrix.H:183
void define_and_filter_doit(T const *mat, Long const *col_index, Long nentries, Long const *row_offset)
Private function, but public for cuda.
Definition AMReX_SpMatrix.H:784
Long * rowOffset()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:200
void sortCSR()
Definition AMReX_SpMatrix.H:776
void pack_buffer_mv(AlgVector< T, AllocT > const &v)
Definition AMReX_SpMatrix.H:1707
void unpack_buffer_mv(AlgVector< T, AllocT > &v)
Definition AMReX_SpMatrix.H:1720
void update_remote_col_index(C &csrr, bool in_device_memory)
Definition AMReX_SpMatrix.H:1528
void startComm_tr(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1097
void define_doit(int nnz_per_row)
Private function, but public for cuda.
Definition AMReX_SpMatrix.H:708
T * data()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:188
friend class AMG
Definition AMReX_SpMatrix.H:268
Long numGlobalRows() const
Definition AMReX_SpMatrix.H:179
SpMatrix & operator=(SpMatrix const &)=delete
friend SpMatrix< U, M > transpose(SpMatrix< U, M > const &A, AlgPartition col_partition)
~SpMatrix()=default
T value_type
Definition AMReX_SpMatrix.H:54
Allocator< U > allocator_type
Definition AMReX_SpMatrix.H:55
Long globalRowEnd() const
Exclusive global index end.
Definition AMReX_SpMatrix.H:185
Long numLocalNonZeros() const
Definition AMReX_SpMatrix.H:180
struct amrex::SpMatrix::CommMV m_comm_mv
AlgVector< T, AllocT > rowSum() const
Return row-sum vector.
Definition AMReX_SpMatrix.H:935
AlgPartition const & columnPartition() const
Return the column partition used for matrix-vector and matrix-matrix multiplications.
Definition AMReX_SpMatrix.H:176
void printToFile(std::string const &file) const
Definition AMReX_SpMatrix.H:824
ParCsr< T const > const_parcsr() const
Definition AMReX_SpMatrix.H:983
SpMatrix()=default
SpMatrix(SpMatrix const &)=delete
ParCsr< T > parcsr()
Definition AMReX_SpMatrix.H:959
SpMatrix(SpMatrix &&)=default
Long numLocalRows() const
Definition AMReX_SpMatrix.H:178
AlgPartition const & partition() const
Definition AMReX_SpMatrix.H:167
AlgVector< T, AllocT > const & diagonalVector() const
Return diagonal elements in a square matrix.
Definition AMReX_SpMatrix.H:909
Long * columnIndex()
Don't use this beyond initial setup.
Definition AMReX_SpMatrix.H:194
void finishComm_mv(AlgVector< T, AllocT > &y)
Definition AMReX_SpMatrix.H:1063
Allocator< T > AllocT
Definition AMReX_SpMatrix.H:58
struct amrex::SpMatrix::CommTR m_comm_tr
void prepare_comm_mv(AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1637
void startComm_mv(AlgVector< T, AllocT > const &x)
Definition AMReX_SpMatrix.H:1014
void unpack_buffer_tr(CommTR const &ctr, AlgPartition const &col_partition)
Definition AMReX_SpMatrix.H:1746
friend void SpMV(AlgVector< U, N > &y, SpMatrix< U, M > const &A, AlgVector< U, N > const &x)
void setVal(F const &f, CsrSorted is_sorted)
Initialize matrix entries using a row-wise functor.
Definition AMReX_SpMatrix.H:889
void define(AlgPartition partition, int nnz_per_row)
Allocate storage for a default-constructed matrix with a fixed number of nonzeros per row.
Definition AMReX_SpMatrix.H:686
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:28
Long size() const noexcept
Definition AMReX_Vector.H:53
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:243
Arena * The_Comms_Arena()
Definition AMReX_Arena.cpp:843
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:823
Arena * The_Arena()
Definition AMReX_Arena.cpp:783
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
int NProcs() noexcept
Definition AMReX_ParallelDescriptor.H:255
void copyAsync(HostToDevice, InIter begin, InIter end, OutIter result) noexcept
A host-to-device copy routine. Note this is just a wrapper around memcpy, so it assumes contiguous st...
Definition AMReX_GpuContainers.H:228
static constexpr DeviceToDevice deviceToDevice
Definition AMReX_GpuContainers.H:107
static constexpr DeviceToHost deviceToHost
Definition AMReX_GpuContainers.H:106
static constexpr HostToDevice hostToDevice
Definition AMReX_GpuContainers.H:105
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int NProcsSub() noexcept
number of ranks in current frame
Definition AMReX_ParallelContext.H:74
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition AMReX_ParallelDescriptor.H:696
static constexpr struct amrex::Scan::Type::Exclusive exclusive
static constexpr RetSum retSum
Definition AMReX_Scan.H:32
static constexpr int MPI_REQUEST_NULL
Definition AMReX_ccse-mpi.H:57
Definition AMReX_Amr.cpp:49
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:138
amrex::ArenaAllocator< T > DefaultAllocator
Definition AMReX_GpuAllocators.H:199
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:193
void duplicateCSR(C c, CSR< T, AD > &dst, CSR< T, AS > const &src)
Definition AMReX_CSR.H:71
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
const int[]
Definition AMReX_BLProfiler.cpp:1664
void RemoveDuplicates(Vector< T > &vec)
Definition AMReX_Vector.H:211
V< Long > row_offset
Definition AMReX_CSR.H:33
Long nnz
Definition AMReX_CSR.H:34
CsrView< T > view()
Definition AMReX_CSR.H:47
void resize(Long num_rows, Long num_non_zeros)
Definition AMReX_CSR.H:40
V< Long > col_index
Definition AMReX_CSR.H:32
V< T > mat
Definition AMReX_CSR.H:31
Sorted CSR means for each row the column indices are sorted.
Definition AMReX_SpMatrix.H:39
bool b
Definition AMReX_SpMatrix.H:40
Valid CSR means all entries are valid. It may be sorted or unsorted.
Definition AMReX_SpMatrix.H:45
bool b
Definition AMReX_SpMatrix.H:46
Definition AMReX_CSR.H:20
GPU-ready non-owning CSR data container.
Definition AMReX_SpMatrix.H:29
Long const *__restrict__ col_map
Definition AMReX_SpMatrix.H:35
Long const *__restrict__ row_map
Definition AMReX_SpMatrix.H:34
CsrView< T > csr1
Definition AMReX_SpMatrix.H:31
Long col_begin
Definition AMReX_SpMatrix.H:33
Long row_begin
Definition AMReX_SpMatrix.H:32
CsrView< T > csr0
Definition AMReX_SpMatrix.H:30
Definition AMReX_SpMatrix.H:336
T * send_buffer
Definition AMReX_SpMatrix.H:345
bool prepared
Definition AMReX_SpMatrix.H:352
Vector< int > recv_counts
Definition AMReX_SpMatrix.H:342
Long total_counts_recv
Definition AMReX_SpMatrix.H:350
Vector< int > recv_from
Definition AMReX_SpMatrix.H:341
T * recv_buffer
Definition AMReX_SpMatrix.H:349
Vector< int > send_counts
Definition AMReX_SpMatrix.H:338
Long total_counts_send
Definition AMReX_SpMatrix.H:346
Gpu::DeviceVector< Long > send_indices
Definition AMReX_SpMatrix.H:339
Vector< MPI_Request > recv_reqs
Definition AMReX_SpMatrix.H:348
Vector< int > send_to
Definition AMReX_SpMatrix.H:337
Vector< MPI_Request > send_reqs
Definition AMReX_SpMatrix.H:344
Definition AMReX_SpMatrix.H:355
Vector< std::array< int, 2 > > send_counts
Definition AMReX_SpMatrix.H:359
Long * recv_buffer_col_index
Definition AMReX_SpMatrix.H:372
Vector< MPI_Request > send_reqs
Definition AMReX_SpMatrix.H:360
Vector< MPI_Request > recv_reqs
Definition AMReX_SpMatrix.H:364
Vector< int > send_to
Definition AMReX_SpMatrix.H:358
std::array< Long, 2 > total_counts_recv
Definition AMReX_SpMatrix.H:366
Long * recv_buffer_row_offset
Definition AMReX_SpMatrix.H:373
Vector< std::array< int, 2 > > recv_counts
Definition AMReX_SpMatrix.H:363
CsrView< T > csrt
Definition AMReX_SpMatrix.H:356
T * recv_buffer_mat
Definition AMReX_SpMatrix.H:371
Vector< std::array< Long, 4 > > recv_buffer_offset
Definition AMReX_SpMatrix.H:367
Vector< int > recv_from
Definition AMReX_SpMatrix.H:362
Long * recv_buffer_idx_map
Definition AMReX_SpMatrix.H:374
Definition AMReX_ccse-mpi.H:55