Block-Structured AMR Software Framework
AMReX_ParallelDescriptor.H
1 #ifndef BL_PARALLELDESCRIPTOR_H
2 #define BL_PARALLELDESCRIPTOR_H
3 #include <AMReX_Config.H>
4 
5 #include <AMReX_ccse-mpi.H>
6 #include <AMReX_ParallelContext.H>
7 #include <AMReX_BLBackTrace.H>
8 #include <AMReX_BLProfiler.H>
9 #include <AMReX_BLassert.H>
10 #include <AMReX_Extension.H>
11 #include <AMReX_INT.H>
12 #include <AMReX_REAL.H>
13 #include <AMReX_GpuComplex.H>
14 #include <AMReX_Array.H>
15 #include <AMReX_Vector.H>
16 #include <AMReX_ValLocPair.H>
17 
18 #ifndef BL_AMRPROF
19 #include <AMReX_Box.H>
20 #endif
21 
22 #ifdef BL_LAZY
23 #include <AMReX_Lazy.H>
24 #endif
25 
26 #ifdef AMREX_PMI
27 #include <pmi.h>
28 #endif
29 
30 #ifdef AMREX_USE_OMP
31 #include <omp.h>
32 #endif
33 
34 #include <algorithm>
35 #include <atomic>
36 #include <csignal>
37 #include <functional>
38 #include <limits>
39 #include <numeric>
40 #include <string>
41 #include <typeinfo>
42 #include <vector>
43 
44 namespace amrex {
45 
46 template <typename T> class LayoutData;
47 
49 namespace ParallelDescriptor
50 {
51  //
52  //! Used as default argument to ParallelDescriptor::Barrier().
53  const std::string Unnamed("Unnamed");
54 
55  //! Hold the description and status of communication data.
56  class Message
57  {
58  public:
59 
60  Message () = default;
61  Message (MPI_Request req_, MPI_Datatype type_) :
62  m_finished(false),
63  m_type(type_),
64  m_req(req_) {}
65  Message (MPI_Status stat_, MPI_Datatype type_) :
66  m_type(type_),
67  m_stat(stat_) {}
68  void wait ();
69  bool test ();
70  size_t count () const;
71  int tag () const;
72  int pid () const;
73  MPI_Datatype type () const { return m_type; }
74  MPI_Request req () const { return m_req; }
75  MPI_Status stat () const { return m_stat; }
76 
77  private:
78 
79  bool m_finished = true;
80  MPI_Datatype m_type = MPI_DATATYPE_NULL;
81  MPI_Request m_req = MPI_REQUEST_NULL;
82  mutable MPI_Status m_stat{};
83  };
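A hedged, editorial usage sketch (not part of this header): a Message returned by Arecv can be polled with test() or completed with wait(); count() reports the number of received elements once the message has finished. Buffer size, source rank, and tag below are illustrative.

#include <AMReX_ParallelDescriptor.H>
#include <vector>

// Sketch: post a nonblocking receive and poll its Message handle.
void poll_receive_example (int src_rank, int tag)
{
    namespace PD = amrex::ParallelDescriptor;
    std::vector<int> buf(16);                                    // illustrative size
    PD::Message m = PD::Arecv(buf.data(), buf.size(), src_rank, tag);
    while (!m.test()) {
        // ... overlap other work with the pending receive ...
    }
}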
84 
85 #ifdef BL_USE_MPI
86  void MPI_Error(const char* file, int line, const char* str, int rc);
87 
88 #define BL_MPI_REQUIRE(x) \
89 do \
90 { \
91  if ( int l_status_ = (x) ) \
92  { \
93  amrex::ParallelDescriptor::MPI_Error(__FILE__,__LINE__,#x, l_status_); \
94  } \
95 } \
96 while ( false )
97 #endif
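BL_MPI_REQUIRE wraps an MPI call so that a non-zero return code is routed to MPI_Error(), which reports the failing file, line, and call text. A minimal hedged sketch of the pattern (the particular query is illustrative):

#include <AMReX_ParallelDescriptor.H>

void query_comm_size_example ()
{
#ifdef BL_USE_MPI
    int nranks = 0;
    BL_MPI_REQUIRE( MPI_Comm_size(amrex::ParallelDescriptor::Communicator(), &nranks) );
    amrex::ignore_unused(nranks);
#endif
}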
98 
102  //! Perform any needed parallel initialization.
103  void StartParallel (int* argc = nullptr,
104  char*** argv = nullptr,
105  MPI_Comm mpi_comm = MPI_COMM_WORLD);
106 
107  void Initialize ();
108  void Finalize ();
109 
110  extern AMREX_EXPORT bool use_gpu_aware_mpi;
111  inline bool UseGpuAwareMpi () { return use_gpu_aware_mpi; }
112 
113  //! Split the process pool into teams.
114  void StartTeams ();
115  void EndTeams ();
116 
120  //! Perform any needed parallel finalization.
121  void EndParallel ();
122 
123  //! return the rank number local to the current Parallel Context
124  inline int
125  MyProc () noexcept
126  {
127  return ParallelContext::MyProcSub();
128  }
129  inline int
130  MyProc (MPI_Comm comm) noexcept
131  {
132 #ifdef BL_USE_MPI
133  int r;
134  MPI_Comm_rank(comm,&r);
135  return r;
136 #else
137  amrex::ignore_unused(comm);
138  return 0;
139 #endif
140  }
141 
142  //! Provide functionalities needed to construct a team of processes to perform a particular job.
143  struct ProcessTeam
144  {
145  using team_t = MPI_Comm;
146 
147  //! synchronize processes within the team
148  void Barrier () const {
149  if (m_size > 1) {
150 #if defined(BL_USE_MPI3)
151  MPI_Barrier(m_team_comm);
152 #endif
153  }
154  }
155 
156  //! memory fence
157  void MemoryBarrier () const {
158  if (m_size > 1) {
159 #ifdef AMREX_USE_OMP
160  if (omp_in_parallel()) {
161  #pragma omp barrier
162  }
163  #pragma omp master
164 #endif
165  {
166 #if defined(BL_USE_MPI3)
167  std::atomic_thread_fence(std::memory_order_release);
168  MPI_Barrier(m_team_comm);
169  std::atomic_thread_fence(std::memory_order_acquire);
170 #endif
171  }
172  }
173  }
174 
175  //! free a communicator
176  void clear () {
177 #if defined(BL_USE_MPI3)
178  if (m_size > 1) {
179  MPI_Comm_free(&m_team_comm);
180  if (m_rankInTeam==0) { MPI_Comm_free(&m_lead_comm); }
181  }
182 #endif
183  }
184 
185  [[nodiscard]] const team_t& get() const {
186  return m_team_comm;
187  }
188  //! return the communicator
189  [[nodiscard]] const MPI_Comm& get_team_comm() const { return m_team_comm; }
190  [[nodiscard]] const MPI_Comm& get_lead_comm() const { return m_lead_comm; }
191 
192  int m_numTeams;
193  int m_size;
194  int m_color;
195  int m_lead;
196  int m_rankInTeam;
197  int m_do_team_reduce;
198 
199  MPI_Comm m_team_comm;
200  MPI_Comm m_lead_comm;
201  };
202 
203  extern AMREX_EXPORT ProcessTeam m_Team;
204 
205  extern AMREX_EXPORT int m_MinTag, m_MaxTag;
206  inline int MinTag () noexcept { return m_MinTag; }
207  inline int MaxTag () noexcept { return m_MaxTag; }
208 
209  extern AMREX_EXPORT MPI_Comm m_comm;
210  inline MPI_Comm Communicator () noexcept { return m_comm; }
211 
212  extern AMREX_EXPORT int m_nprocs_per_node;
216  inline int NProcsPerNode () noexcept { return m_nprocs_per_node; }
217 
218  extern AMREX_EXPORT int m_rank_in_node;
222  inline int MyRankInNode () noexcept { return m_rank_in_node; }
223 
224  extern AMREX_EXPORT int m_nprocs_per_processor;
228  inline int NProcsPerProcessor () noexcept { return m_nprocs_per_processor; }
229 
230  extern AMREX_EXPORT int m_rank_in_processor;
234  inline int MyRankInProcessor () noexcept { return m_rank_in_processor; }
235 
236 #ifdef AMREX_USE_MPI
237  extern Vector<MPI_Datatype*> m_mpi_types;
238  extern Vector<MPI_Op*> m_mpi_ops;
239 #endif
240 
241  //! return the number of MPI ranks local to the current Parallel Context
242  inline int
243  NProcs () noexcept
244  {
245  return ParallelContext::NProcsSub();
246  }
247 
248  inline int
249  NProcs (MPI_Comm comm) noexcept
250  {
251 #ifdef BL_USE_MPI
252  int s;
253  BL_MPI_REQUIRE(MPI_Comm_size(comm, &s));
254  return s;
255 #else
256  amrex::ignore_unused(comm);
257  return 1;
258 #endif
259  }
263  //! The MPI rank number of the I/O Processor (probably rank 0).
264  extern AMREX_EXPORT const int ioProcessor;
265  inline int
266  IOProcessorNumber () noexcept
267  {
268  return ioProcessor;
269  }
273  //! Is this CPU the I/O Processor? To get the rank number, call IOProcessorNumber()
274  inline bool
275  IOProcessor () noexcept
276  {
277  return MyProc() == IOProcessorNumber();
278  }
279 
280  inline int
281  IOProcessorNumber (MPI_Comm comm) noexcept
282  {
283  return (comm == ParallelDescriptor::Communicator()) ? ioProcessor : 0;
284  }
285 
286  inline bool
287  IOProcessor (MPI_Comm comm) noexcept
288  {
289  return MyProc(comm) == IOProcessorNumber(comm);
290  }
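A hedged usage sketch of the rank queries above: rank-dependent logic plus output guarded to the I/O processor. It assumes amrex::Initialize has already been called (for example in main).

#include <AMReX_ParallelDescriptor.H>
#include <AMReX_Print.H>

void report_ranks_example ()
{
    namespace PD = amrex::ParallelDescriptor;
    const int myproc = PD::MyProc();   // rank within the current parallel context
    const int nprocs = PD::NProcs();   // number of ranks in the current parallel context
    if (PD::IOProcessor()) {           // only the I/O rank prints
        amrex::Print() << "Running on " << nprocs << " ranks; I/O rank is "
                       << myproc << "\n";
    }
}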
291 
292  //
293  inline int
294  TeamSize () noexcept
295  {
296  return m_Team.m_size;
297  }
298  inline int
299  NTeams () noexcept
300  {
301  return m_Team.m_numTeams;
302  }
303  inline int
304  MyTeamColor () noexcept
305  {
306  return m_Team.m_color;
307  }
308  inline int
309  MyTeamLead () noexcept
310  {
311  return m_Team.m_lead;
312  }
313  inline int
314  MyRankInTeam () noexcept
315  {
316  return m_Team.m_rankInTeam;
317  }
318  inline int
319  TeamLead (int rank) noexcept
320  {
321  return (rank >= 0) ? (rank - rank % m_Team.m_size) : MPI_PROC_NULL;
322  }
323  inline bool
324  isTeamLead () noexcept
325  {
326  return MyRankInTeam() == 0;
327  }
328  inline bool
329  sameTeam (int rank) noexcept
330  {
331  return MyTeamLead() == TeamLead(rank);
332  }
333  inline bool
334  sameTeam (int rankA, int rankB) noexcept
335  {
336  return TeamLead(rankA) == TeamLead(rankB);
337  }
338  inline int
339  RankInLeadComm (int rank) noexcept
340  {
341  return (rank >= 0) ? (rank / m_Team.m_size) : MPI_PROC_NULL;
342  }
343  inline bool
344  doTeamReduce () noexcept
345  {
346  return m_Team.m_do_team_reduce;
347  }
348  inline const ProcessTeam&
349  MyTeam () noexcept
350  {
351  return m_Team;
352  }
353  inline std::pair<int,int>
354  team_range (int begin, int end, int rit = -1, int nworkers = 0) noexcept
355  {
356  int rb, re;
357  {
358  if (rit < 0) { rit = ParallelDescriptor::MyRankInTeam(); }
359  if (nworkers == 0) { nworkers = ParallelDescriptor::TeamSize(); }
360  BL_ASSERT(rit<nworkers);
361  if (nworkers == 1) {
362  rb = begin;
363  re = end;
364  } else {
365  int ntot = end - begin;
366  int nr = ntot / nworkers;
367  int nlft = ntot - nr * nworkers;
368  if (rit < nlft) { // get nr+1 items
369  rb = begin + rit * (nr + 1);
370  re = rb + nr + 1;
371  } else { // get nr items
372  rb = begin + rit * nr + nlft;
373  re = rb + nr;
374  }
375  }
376  }
377 
378 #ifdef AMREX_USE_OMP
379  int nthreads = omp_get_num_threads();
380  if (nthreads > 1) {
381  int tid = omp_get_thread_num();
382  int ntot = re - rb;
383  int nr = ntot / nthreads;
384  int nlft = ntot - nr * nthreads;
385  if (tid < nlft) { // get nr+1 items
386  rb += tid * (nr + 1);
387  re = rb + nr + 1;
388  } else { // get nr items
389  rb += tid * nr + nlft;
390  re = rb + nr;
391  }
392  }
393 #endif
394 
395  return std::make_pair(rb,re);
396  }
397  template <typename F>
398  void team_for (int begin, int end, const F& f)
399  {
400  const auto& range = team_range(begin, end);
401  for (int i = range.first; i < range.second; ++i) {
402  f(i);
403  }
404  }
405  template <typename F> // rit: rank in team
406  void team_for (int begin, int end, int rit, const F& f)
407  {
408  const auto& range = team_range(begin, end, rit);
409  for (int i = range.first; i < range.second; ++i) {
410  f(i);
411  }
412  }
413  template <typename F> // rit: rank in team
414  void team_for (int begin, int end, int rit, int nworkers, const F& f)
415  {
416  const auto& range = team_range(begin, end, rit, nworkers);
417  for (int i = range.first; i < range.second; ++i) {
418  f(i);
419  }
420  }
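A hedged sketch of team_range/team_for: each worker in the calling rank's team (and, inside an OpenMP parallel region, each thread) is handed a contiguous slice of [begin,end). The loop body and problem size are illustrative.

#include <AMReX_ParallelDescriptor.H>
#include <AMReX_Vector.H>

void team_fill_example ()
{
    namespace PD = amrex::ParallelDescriptor;
    const int n = 1000;                      // illustrative problem size
    amrex::Vector<int> data(n, 0);
    PD::team_for(0, n, [&] (int i) {         // each team worker handles a sub-range;
        data[i] = i;                         // with the default team size of 1 this is the whole range
    });
}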
421 
422  void Barrier (const std::string& message = Unnamed);
423  void Barrier (const MPI_Comm &comm, const std::string& message = Unnamed);
424  Message Abarrier ();
425  Message Abarrier (const MPI_Comm &comm);
426 
427  void Test (MPI_Request& request, int& flag, MPI_Status& status);
428  void Test (Vector<MPI_Request>& request, int& flag, Vector<MPI_Status>& status);
429 
430  void Comm_dup (MPI_Comm comm, MPI_Comm& newcomm);
431  //! Abort with specified error code.
432  void Abort (int errorcode = SIGABRT, bool backtrace = true);
433  //! ErrorString return string associated with error internal error condition.
434  const char* ErrorString (int errorcode);
435  //! Returns wall-clock seconds since start of execution.
436  double second () noexcept;
437 
439  void ReduceBoolAnd (bool& rvar);
441  void ReduceBoolAnd (bool& rvar, int cpu);
442 
444  void ReduceBoolOr (bool& rvar);
446  void ReduceBoolOr (bool& rvar, int cpu);
447 
449  template <typename T>
450  std::enable_if_t<std::is_floating_point_v<T>>
451  ReduceRealSum (T& rvar);
452 
453  template <typename T>
454  std::enable_if_t<std::is_floating_point_v<T>>
455  ReduceRealSum (T* rvar, int cnt);
456 
457  // Having this for backward compatibility
458  void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar);
459  //
460  template <typename T>
461  std::enable_if_t<std::is_floating_point_v<T>>
462  ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar);
463 
465  template <typename T>
466  std::enable_if_t<std::is_floating_point_v<T>>
467  ReduceRealSum (T& rvar, int cpu);
468 
469  template <typename T>
470  std::enable_if_t<std::is_floating_point_v<T>>
471  ReduceRealSum (T* rvar, int cnt, int cpu);
472 
473  // Having this for backward compatibility
474  void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
475  //
476  template <typename T>
477  std::enable_if_t<std::is_floating_point_v<T>>
478  ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
479 
481  template <typename T>
482  std::enable_if_t<std::is_floating_point_v<T>>
483  ReduceRealMax (T& rvar);
484 
485  template <typename T>
486  std::enable_if_t<std::is_floating_point_v<T>>
487  ReduceRealMax (T* rvar, int cnt);
488 
489  // Having this for backward compatibility
490  void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar);
491  //
492  template <typename T>
493  std::enable_if_t<std::is_floating_point_v<T>>
494  ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar);
495 
497  template <typename T>
498  std::enable_if_t<std::is_floating_point_v<T>>
499  ReduceRealMax (T& rvar, int cpu);
500 
501  template <typename T>
502  std::enable_if_t<std::is_floating_point_v<T>>
503  ReduceRealMax (T* rvar, int cnt, int cpu);
504 
505  // Having this for backward compatibility
506  void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
507  //
508  template <typename T>
509  std::enable_if_t<std::is_floating_point_v<T>>
510  ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
511 
513  template <typename T>
514  std::enable_if_t<std::is_floating_point_v<T>>
515  ReduceRealMin (T& rvar);
516 
517  template <typename T>
518  std::enable_if_t<std::is_floating_point_v<T>>
519  ReduceRealMin (T* rvar, int cnt);
520 
521  // Having this for backward compatibility
522  void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar);
523  //
524  template <typename T>
525  std::enable_if_t<std::is_floating_point_v<T>>
526  ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar);
527 
529  template <typename T>
530  std::enable_if_t<std::is_floating_point_v<T>>
531  ReduceRealMin (T& rvar, int cpu);
532 
533  template <typename T>
534  std::enable_if_t<std::is_floating_point_v<T>>
535  ReduceRealMin (T* rvar, int cnt, int cpu);
536 
537  // Having this for backward compatibility
538  void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
539  //
540  template <typename T>
541  std::enable_if_t<std::is_floating_point_v<T>>
542  ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
543 
545  void ReduceIntSum (int& rvar);
546  void ReduceIntSum (int* rvar, int cnt);
547  void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar);
549  void ReduceIntSum (int& rvar, int cpu);
550  void ReduceIntSum (int* rvar, int cnt, int cpu);
551  void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
552 
554  void ReduceIntMax (int& rvar);
555  void ReduceIntMax (int* rvar, int cnt);
556  void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar);
558  void ReduceIntMax (int& rvar, int cpu);
559  void ReduceIntMax (int* rvar, int cnt, int cpu);
560  void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
561 
563  void ReduceIntMin (int& rvar);
564  void ReduceIntMin (int* rvar, int cnt);
565  void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar);
567  void ReduceIntMin (int& rvar, int cpu);
568  void ReduceIntMin (int* rvar, int cnt, int cpu);
569  void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
570 
572  void ReduceLongSum (Long& rvar);
573  void ReduceLongSum (Long* rvar, int cnt);
574  void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar);
576  void ReduceLongSum (Long& rvar, int cpu);
577  void ReduceLongSum (Long* rvar, int cnt, int cpu);
578  void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
579 
581  void ReduceLongMax (Long& rvar);
582  void ReduceLongMax (Long* rvar, int cnt);
583  void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar);
585  void ReduceLongMax (Long& rvar, int cpu);
586  void ReduceLongMax (Long* rvar, int cnt, int cpu);
587  void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
588 
590  void ReduceLongMin (Long& rvar);
591  void ReduceLongMin (Long* rvar, int cnt);
592  void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar);
594  void ReduceLongMin (Long& rvar, int cpu);
595  void ReduceLongMin (Long* rvar, int cnt, int cpu);
596  void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
597 
599  void ReduceLongAnd (Long& rvar);
600  void ReduceLongAnd (Long* rvar, int cnt);
601  void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar);
603  void ReduceLongAnd (Long& rvar, int cpu);
604  void ReduceLongAnd (Long* rvar, int cnt, int cpu);
605  void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
606 
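A hedged sketch of the reduction wrappers declared above: the variants without a cpu argument behave like an all-reduce (every rank ends up with the result), while the variants taking a cpu argument reduce onto that rank only. The local values are illustrative.

#include <AMReX_ParallelDescriptor.H>

void reduce_example ()
{
    namespace PD = amrex::ParallelDescriptor;

    amrex::Real local_sum = 1.0;                         // illustrative per-rank value
    PD::ReduceRealSum(local_sum);                        // all ranks now hold the global sum

    int local_max = PD::MyProc();
    PD::ReduceIntMax(local_max);                         // global max (NProcs()-1) on all ranks

    amrex::Long cells = 42;                              // illustrative count
    PD::ReduceLongSum(cells, PD::IOProcessorNumber());   // result valid on the I/O rank only
}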
607  //! Parallel gather.
608  void Gather (Real const* sendbuf, int nsend, Real* recvbuf, int root);
612  //! Returns sequential message sequence numbers, usually used as tags for send/recv.
613  inline int SeqNum () noexcept { return ParallelContext::get_inc_mpi_tag(); }
614 
615  template <class T> Message Asend(const T*, size_t n, int pid, int tag);
616  template <class T> Message Asend(const T*, size_t n, int pid, int tag, MPI_Comm comm);
617  template <class T> Message Asend(const std::vector<T>& buf, int pid, int tag);
618 
619  template <class T> Message Arecv(T*, size_t n, int pid, int tag);
620  template <class T> Message Arecv(T*, size_t n, int pid, int tag, MPI_Comm comm);
621  template <class T> Message Arecv(std::vector<T>& buf, int pid, int tag);
622 
623  template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag);
624  template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
625  template <class T> Message Send(const std::vector<T>& buf, int dst_pid, int tag);
626 
627  template <class T> Message Recv(T*, size_t n, int pid, int tag);
628  template <class T> Message Recv(T*, size_t n, int pid, int tag, MPI_Comm comm);
629  template <class T> Message Recv(std::vector<T>& buf, int pid, int tag);
630 
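A hedged sketch of the point-to-point wrappers just declared, combined with SeqNum() for a matching tag: rank 0 sends a small buffer to rank 1, which receives it asynchronously and waits. Payload size and the rank pair are illustrative.

#include <AMReX_ParallelDescriptor.H>
#include <vector>

void ping_example ()
{
    namespace PD = amrex::ParallelDescriptor;
    if (PD::NProcs() < 2) { return; }

    const int tag = PD::SeqNum();                   // matching tag for sender and receiver
    std::vector<double> buf(8, 0.0);                // illustrative payload

    if (PD::MyProc() == 0) {
        PD::Send(buf.data(), buf.size(), 1, tag);                   // blocking send to rank 1
    } else if (PD::MyProc() == 1) {
        PD::Message m = PD::Arecv(buf.data(), buf.size(), 0, tag);  // nonblocking receive
        m.wait();                                                   // complete it
    }
}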
631  template <class T> void Bcast(T*, size_t n, int root = 0);
632  template <class T> void Bcast(T*, size_t n, int root, const MPI_Comm &comm);
633  void Bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm);
634 
635  template <class T, class T1> void Scatter(T*, size_t n, const T1*, size_t n1, int root);
636 
637  template <class T, class T1> void Gather(const T*, size_t n, T1*, size_t n1, int root);
638  template <class T> std::vector<T> Gather(const T&, int root);
639 
640  template <class T> void Gatherv (const T* send, int sc,
641  T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
642  int root);
643 
644  //! Gather LayoutData values to a vector on root.
645  template <class T> void GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
646  Vector<T>& recvbuf,
647  int root);
648 
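A hedged sketch of the collective wrappers above: broadcast a small parameter buffer from the I/O rank, then gather one value per rank back onto it. The buffer contents are illustrative.

#include <AMReX_ParallelDescriptor.H>
#include <vector>

void collective_example ()
{
    namespace PD = amrex::ParallelDescriptor;
    const int root = PD::IOProcessorNumber();

    int params[3] = {0, 0, 0};
    if (PD::MyProc() == root) { params[0] = 1; params[1] = 2; params[2] = 3; }
    PD::Bcast(params, 3, root);                              // every rank now has the root's values

    std::vector<int> all = PD::Gather(PD::MyProc(), root);   // one entry per rank, valid on root
    amrex::ignore_unused(all);
}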
649  void Wait (MPI_Request& req, MPI_Status& status);
650  void Waitall (Vector<MPI_Request>& reqs, Vector<MPI_Status>& status);
651  void Waitany (Vector<MPI_Request>& reqs, int &index, MPI_Status& status);
652  void Waitsome (Vector<MPI_Request>& reqs, int& completed, Vector<int>& indx, Vector<MPI_Status>& status);
653 
654  void ReadAndBcastFile(const std::string &filename, Vector<char> &charBuf,
655  bool bExitOnError = true,
656  const MPI_Comm &comm = Communicator() );
657  void IProbe(int src_pid, int tag, int &mflag, MPI_Status &status);
658  void IProbe(int src_pid, int tag, MPI_Comm comm, int &mflag, MPI_Status &status);
659 
665  std::string
666  mpi_level_to_string (int mtlev);
667 
668  // PMI = Process Management Interface, available on Crays. Provides API to
669  // query topology of the job.
670 #ifdef AMREX_PMI
671  void PMI_Initialize();
672  void PMI_PrintMeshcoords(const pmi_mesh_coord_t *pmi_mesh_coord);
673 #endif
674 
675 #ifdef BL_USE_MPI
676  int select_comm_data_type (std::size_t nbytes);
677  std::size_t sizeof_selected_comm_data_type (std::size_t nbytes);
678 #endif
679 }
680 }
681 
682 namespace amrex {
683 
684 #ifdef BL_USE_MPI
685 
686 template <class T>
687 ParallelDescriptor::Message
688 ParallelDescriptor::Asend (const T* buf, size_t n, int dst_pid, int tag)
689 {
690  return Asend(buf, n, dst_pid, tag, Communicator());
691 }
692 
693 namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
694 template <>
695 Message
696 Asend<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
697 
698 template <class T>
699 Message
700 Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
701 {
702  static_assert(!std::is_same_v<char,T>, "Asend: char version has been specialized");
703 
704  BL_PROFILE_T_S("ParallelDescriptor::Asend(TsiiM)", T);
705  BL_COMM_PROFILE(BLProfiler::AsendTsiiM, n * sizeof(T), dst_pid, tag);
706 
707  MPI_Request req;
708  BL_MPI_REQUIRE( MPI_Isend(const_cast<T*>(buf),
709  n,
710  Mpi_typemap<T>::type(),
711  dst_pid,
712  tag,
713  comm,
714  &req) );
715  BL_COMM_PROFILE(BLProfiler::AsendTsiiM, BLProfiler::AfterCall(), dst_pid, tag);
716  return Message(req, Mpi_typemap<T>::type());
717 }
718 }
719 
720 template <class T>
721 ParallelDescriptor::Message
722 ParallelDescriptor::Asend (const std::vector<T>& buf, int dst_pid, int tag)
723 {
724  return Asend(buf.data(), buf.size(), dst_pid, tag, Communicator());
725 }
726 
727 template <class T>
728 ParallelDescriptor::Message
729 ParallelDescriptor::Send (const T* buf, size_t n, int dst_pid, int tag)
730 {
731  return Send(buf, n, dst_pid, tag, Communicator());
732 }
733 
734 namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
735 template <>
736 Message
737 Send<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
738 
739 template <class T>
740 Message
741 Send (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
742 {
743  static_assert(!std::is_same_v<char,T>, "Send: char version has been specialized");
744 
745  BL_PROFILE_T_S("ParallelDescriptor::Send(Tsii)", T);
746 
747 #ifdef BL_COMM_PROFILING
748  int dst_pid_world(-1);
749  MPI_Group groupComm, groupWorld;
750  BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
751  BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
752  BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &dst_pid, groupWorld, &dst_pid_world) );
753 
754  BL_COMM_PROFILE(BLProfiler::SendTsii, n * sizeof(T), dst_pid_world, tag);
755 #endif
756 
757  BL_MPI_REQUIRE( MPI_Send(const_cast<T*>(buf),
758  n,
759  Mpi_typemap<T>::type(),
760  dst_pid,
761  tag,
762  comm) );
763  BL_COMM_PROFILE(BLProfiler::SendTsii, BLProfiler::AfterCall(), dst_pid, tag);
764  return Message();
765 }
766 }
767 
768 template <class T>
769 ParallelDescriptor::Message
770 ParallelDescriptor::Send (const std::vector<T>& buf, int dst_pid, int tag)
771 {
772  return Send(buf.data(), buf.size(), dst_pid, tag, Communicator());
773 }
774 
775 template <class T>
776 ParallelDescriptor::Message
777 ParallelDescriptor::Arecv (T* buf, size_t n, int src_pid, int tag)
778 {
779  return Arecv(buf, n, src_pid, tag, Communicator());
780 }
781 
782 namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
783 template <>
784 Message
785 Arecv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
786 
787 template <class T>
788 Message
789 Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
790 {
791  static_assert(!std::is_same_v<char,T>, "Arecv: char version has been specialized");
792 
793  BL_PROFILE_T_S("ParallelDescriptor::Arecv(TsiiM)", T);
794  BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, n * sizeof(T), src_pid, tag);
795 
796  MPI_Request req;
797  BL_MPI_REQUIRE( MPI_Irecv(buf,
798  n,
799  Mpi_typemap<T>::type(),
800  src_pid,
801  tag,
802  comm,
803  &req) );
804  BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, BLProfiler::AfterCall(), src_pid, tag);
805  return Message(req, Mpi_typemap<T>::type());
806 }
807 }
808 
809 template <class T>
810 ParallelDescriptor::Message
811 ParallelDescriptor::Arecv (std::vector<T>& buf, int src_pid, int tag)
812 {
813  return Arecv(buf.data(), buf.size(), src_pid, tag, Communicator());
814 }
815 
816 template <class T>
817 ParallelDescriptor::Message
818 ParallelDescriptor::Recv (T* buf, size_t n, int src_pid, int tag)
819 {
820  return Recv(buf, n, src_pid, tag, Communicator());
821 }
822 
823 namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
824 template <>
825 Message
826 Recv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
827 
828 template <class T>
829 Message
830 Recv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
831 {
832  static_assert(!std::is_same_v<char,T>, "Recv: char version has been specialized");
833 
834  BL_PROFILE_T_S("ParallelDescriptor::Recv(Tsii)", T);
835  BL_COMM_PROFILE(BLProfiler::RecvTsii, BLProfiler::BeforeCall(), src_pid, tag);
836 
837  MPI_Status stat;
838  BL_MPI_REQUIRE( MPI_Recv(buf,
839  n,
840  Mpi_typemap<T>::type(),
841  src_pid,
842  tag,
843  comm,
844  &stat) );
845 #ifdef BL_COMM_PROFILING
846  int src_pid_comm(stat.MPI_SOURCE);
847  int src_pid_world(stat.MPI_SOURCE);
848  if(src_pid_comm != MPI_ANY_SOURCE) {
849  MPI_Group groupComm, groupWorld;
850  BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
851  BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
852  BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &src_pid_comm, groupWorld, &src_pid_world) );
853  }
854 
855  BL_COMM_PROFILE(BLProfiler::RecvTsii, n * sizeof(T), src_pid_world, stat.MPI_TAG);
856 #endif
857  return Message(stat, Mpi_typemap<T>::type());
858 }
859 }
860 
861 template <class T>
862 ParallelDescriptor::Message
863 ParallelDescriptor::Recv (std::vector<T>& buf, int src_pid, int tag)
864 {
865  return Recv(buf.data(), buf.size(), src_pid, tag, Communicator());
866 }
867 
868 template <class T>
869 void
870 ParallelDescriptor::Bcast (T* t,
871  size_t n,
872  int root)
873 {
874 #ifdef BL_LAZY
875  Lazy::EvalReduction();
876 #endif
877 
878  BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
879 
880  BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
881  BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
882 
883  BL_MPI_REQUIRE( MPI_Bcast(t,
884  n,
885  Mpi_typemap<T>::type(),
886  root,
887  Communicator()) );
888  BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
889 }
890 
891 template <class T>
892 void
893 ParallelDescriptor::Bcast (T* t,
894  size_t n,
895  int root,
896  const MPI_Comm &comm)
897 {
898 #ifdef BL_LAZY
899  int r;
900  MPI_Comm_compare(comm, Communicator(), &r);
901  if (r == MPI_IDENT) { Lazy::EvalReduction(); }
902 #endif
903 
904  BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
905 
906  BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
907  BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
908 
909  BL_MPI_REQUIRE( MPI_Bcast(t,
910  n,
911  Mpi_typemap<T>::type(),
912  root,
913  comm) );
914  BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
915 }
916 
917 template <class T, class T1>
918 void
919 ParallelDescriptor::Gather (const T* t,
920  size_t n,
921  T1* t1,
922  size_t n1,
923  int root)
924 {
925  BL_PROFILE_T_S("ParallelDescriptor::Gather(TsT1si)", T);
926  BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
927 
928  BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
929  BL_ASSERT(n1 < static_cast<size_t>(std::numeric_limits<int>::max()));
930 
931  BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(t),
932  n,
933  Mpi_typemap<T>::type(),
934  t1,
935  n1,
936  Mpi_typemap<T1>::type(),
937  root,
938  Communicator()) );
939  BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, n * sizeof(T), root, BLProfiler::NoTag());
940 }
941 
942 template <class T>
943 std::vector<T>
944 ParallelDescriptor::Gather (const T& t, int root)
945 {
946  BL_PROFILE_T_S("ParallelDescriptor::Gather(Ti)", T);
947  BL_COMM_PROFILE(BLProfiler::GatherTi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
948 
949  std::vector<T> resl;
950  if ( root == MyProc() ) { resl.resize(NProcs()); }
951  BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(&t),
952  1,
953  Mpi_typemap<T>::type(),
954  resl.data(),
955  1,
956  Mpi_typemap<T>::type(),
957  root,
958  Communicator()) );
959  BL_COMM_PROFILE(BLProfiler::GatherTi, sizeof(T), root, BLProfiler::NoTag());
960  return resl;
961 }
962 
963 template <class T>
964 void
965 ParallelDescriptor::Gatherv (const T* send, int sc,
966  T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
967  int root)
968 {
969  BL_PROFILE_T_S("ParallelDescriptor::Gatherv(Ti)", T);
970  BL_COMM_PROFILE(BLProfiler::Gatherv, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
971 
972  MPI_Gatherv(send, sc, ParallelDescriptor::Mpi_typemap<T>::type(),
973  recv, rc.data(), disp.data(), ParallelDescriptor::Mpi_typemap<T>::type(),
974  root, Communicator());
975 
976  BL_COMM_PROFILE(BLProfiler::Gatherv, std::accumulate(rc.begin(),rc.end(),0)*sizeof(T), root, BLProfiler::NoTag());
977 }
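A hedged sketch of Gatherv with per-rank counts and displacements: each rank contributes MyProc()+1 values and the root builds the usual exclusive prefix-sum displacement array, mirroring what GatherLayoutDataToVector does below. The payload is illustrative.

#include <AMReX_ParallelDescriptor.H>
#include <numeric>
#include <vector>

void gatherv_example ()
{
    namespace PD = amrex::ParallelDescriptor;
    const int root   = PD::IOProcessorNumber();
    const int nprocs = PD::NProcs();

    const int sc = PD::MyProc() + 1;                 // this rank's contribution
    std::vector<int> send(sc, PD::MyProc());

    std::vector<int> rc(nprocs), disp(nprocs, 0);    // counts and displacements (used on root)
    for (int i = 0; i < nprocs; ++i) { rc[i] = i + 1; }
    std::partial_sum(rc.begin(), rc.end()-1, disp.begin()+1);

    std::vector<int> recv(std::accumulate(rc.begin(), rc.end(), 0));
    PD::Gatherv(send.data(), sc, recv.data(), rc, disp, root);
}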
978 
979 template <class T>
980 void
981 ParallelDescriptor::GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
982  Vector<T>& recvbuf, int root)
983 {
984  BL_PROFILE_T_S("ParallelDescriptor::GatherLayoutData(Ti)", T);
985 
986  // Gather prelims
987  Vector<T> T_to_send;
988  T_to_send.reserve(sendbuf.local_size());
989 
990  for (int i : sendbuf.IndexArray())
991  {
992  T_to_send.push_back(sendbuf[i]);
993  }
994 
995  int nprocs = ParallelContext::NProcsSub();
996  Vector<int> recvcount(nprocs, 0);
997  recvbuf.resize(sendbuf.size());
998  const Vector<int>& old_pmap = sendbuf.DistributionMap().ProcessorMap();
999  for (int i : old_pmap)
1000  {
1001  ++recvcount[i];
1002  }
1003 
1004  // Make a map from post-gather to pre-gather index
1005  Vector<Vector<int>> new_ind_to_old_ind(nprocs);
1006  for (int i=0; i<nprocs; ++i)
1007  {
1008  new_ind_to_old_ind[i].reserve(recvcount[i]);
1009  }
1010  for (int i=0; i<old_pmap.size(); ++i)
1011  {
1012  new_ind_to_old_ind[old_pmap[i]].push_back(i);
1013  }
1014 
1015  // Flatten
1016  Vector<int> new_index_to_old_index;
1017  new_index_to_old_index.reserve(old_pmap.size());
1018  for (const Vector<int>& v : new_ind_to_old_ind)
1019  {
1020  if (!v.empty())
1021  {
1022  for (int el : v)
1023  {
1024  new_index_to_old_index.push_back(el);
1025  }
1026  }
1027  }
1028 
1029  Vector<int> disp(nprocs);
1030  if (!disp.empty()) { disp[0] = 0; }
1031  std::partial_sum(recvcount.begin(), recvcount.end()-1, disp.begin()+1);
1032  Vector<T> new_index_to_T(sendbuf.size());
1033 
1034  MPI_Gatherv(T_to_send.data(), T_to_send.size(),
1035  Mpi_typemap<T>::type(),
1036  new_index_to_T.data(), recvcount.data(), disp.data(),
1037  Mpi_typemap<T>::type(),
1038  root, ParallelContext::CommunicatorSub());
1039 
1040  // Now work just on the root, which has global information on the collected
1041  // LayoutData values; with new_index_to_old_index and new_index_to_T,
1042  // sort the gathered vector in pre-gather index space
1043  if (ParallelContext::MyProcSub() == root)
1044  {
1045  // Invert the map (new_index) --> (old_index)
1046  Vector<int> old_index_to_new_index(sendbuf.size());
1047 
1048  for (int i=0; i<old_index_to_new_index.size(); ++i)
1049  {
1050  old_index_to_new_index[new_index_to_old_index[i]] = i;
1051  }
1052 
1053  for (int i=0; i<recvbuf.size(); ++i)
1054  {
1055  recvbuf[i] = new_index_to_T[old_index_to_new_index[i]];
1056  }
1057  }
1058 }
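A hedged usage sketch of GatherLayoutDataToVector: gather one value per box of an existing LayoutData onto the I/O rank, ordered by global box index. Constructing the LayoutData (BoxArray, DistributionMapping) is assumed to happen elsewhere.

#include <AMReX_LayoutData.H>
#include <AMReX_ParallelDescriptor.H>
#include <AMReX_Vector.H>

void gather_layout_example (const amrex::LayoutData<int>& counts)
{
    amrex::Vector<int> all_counts;   // resized inside; meaningful on the root afterwards
    amrex::ParallelDescriptor::GatherLayoutDataToVector(
        counts, all_counts, amrex::ParallelDescriptor::IOProcessorNumber());
}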
1059 
1060 template <class T, class T1>
1061 void
1062 ParallelDescriptor::Scatter (T* t,
1063  size_t n,
1064  const T1* t1,
1065  size_t n1,
1066  int root)
1067 {
1068  BL_PROFILE_T_S("ParallelDescriptor::Scatter(TsT1si)", T);
1069  BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1070 
1071  BL_MPI_REQUIRE( MPI_Scatter(const_cast<T1*>(t1),
1072  n1,
1073  Mpi_typemap<T1>::type(),
1074  t,
1075  n,
1076  Mpi_typemap<T>::type(),
1077  root,
1078  Communicator()) );
1079  BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, n * sizeof(T), root, BLProfiler::NoTag());
1080 }
1081 
1082 #else
1083 
1084 namespace ParallelDescriptor
1085 {
1086 template <class T>
1087 Message
1088 Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1089 {
1090  return Message();
1091 }
1092 
1093 template <class T>
1094 Message
1095 Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1096 {
1097  return Message();
1098 }
1099 
1100 template <class T>
1101 Message
1102 Asend(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1103 {
1104  return Message();
1105 }
1106 
1107 template <class T>
1108 Message
1109 Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1110 {
1111  return Message();
1112 }
1113 
1114 template <class T>
1115 Message
1116 Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1117 {
1118  return Message();
1119 }
1120 
1121 template <class T>
1122 Message
1123 Send(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1124 {
1125  return Message();
1126 }
1127 
1128 template <class T>
1129 Message
1130 Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1131 {
1132  return Message();
1133 }
1134 
1135 template <class T>
1136 Message
1137 Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1138 {
1139  return Message();
1140 }
1141 
1142 template <class T>
1143 Message
1144 Arecv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1145 {
1146  return Message();
1147 }
1148 
1149 template <class T>
1150 Message
1151 Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1152 {
1153  return Message();
1154 }
1155 
1156 template <class T>
1157 Message
1158 Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1159 {
1160  return Message();
1161 }
1162 
1163 template <class T>
1164 Message
1165 Recv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1166 {
1167  return Message();
1168 }
1169 
1170 template <class T>
1171 void
1172 Bcast(T* /*t*/, size_t /*n*/, int /*root*/)
1173 {}
1174 
1175 template <class T>
1176 void
1177 Bcast(T* /*t*/, size_t /*n*/, int /*root*/, const MPI_Comm & /*comm*/)
1178 {}
1179 
1180 template <class T, class T1>
1181 void
1182 Gather (const T* t, size_t n, T1* t1, size_t n1, int /*root*/)
1183 {
1184  BL_ASSERT(n == n1);
1186 
1187  int const sc = static_cast<int>(n);
1188  for (int j=0; j<sc; ++j) { t1[j] = t[j]; }
1189 }
1190 
1191 template <class T>
1192 std::vector<T>
1193 Gather(const T& t, int /*root*/)
1194 {
1195  std::vector<T> resl(1);
1196  resl[0] = t;
1197  return resl;
1198 }
1199 
1200 template <class T>
1201 void
1202 Gatherv (const T* send, int sc,
1203  T* recv, const std::vector<int>& /*rc*/,
1204  const std::vector<int>& /*disp*/, int /*root*/)
1205 {
1206  for (int j=0; j<sc; ++j) { recv[j] = send[j]; }
1207 }
1208 
1209 template <class T>
1210 void
1211 GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
1212  Vector<T>& recvbuf, int /*root*/)
1213 {
1214  recvbuf.resize(sendbuf.size());
1215 
1216  for (int i=0; i<sendbuf.size(); ++i)
1217  {
1218  recvbuf[i] = sendbuf[i];
1219  }
1220 }
1221 
1222 template <class T, class T1>
1223 void
1224 Scatter(T* /*t*/, size_t /*n*/, const T1* /*t1*/, size_t /*n1*/, int /*root*/)
1225 {}
1226 
1227 }
1228 #endif
1229 
1230 namespace ParallelDescriptor {
1231 
1232 #ifdef AMREX_USE_MPI
1233 
1234 namespace detail {
1235 
1236 template<typename T>
1237 void DoAllReduce (T* r, MPI_Op op, int cnt)
1238 {
1239 #ifdef BL_LAZY
1240  Lazy::EvalReduction();
1241 #endif
1242 
1243  BL_ASSERT(cnt > 0);
1244 
1245  BL_MPI_REQUIRE( MPI_Allreduce(MPI_IN_PLACE, r, cnt,
1246  Mpi_typemap<T>::type(), op,
1247  Communicator()) );
1248 }
1249 
1250 template<typename T>
1251 void DoReduce (T* r, MPI_Op op, int cnt, int cpu)
1252 {
1253 #ifdef BL_LAZY
1254  Lazy::EvalReduction();
1255 #endif
1256 
1257  BL_ASSERT(cnt > 0);
1258 
1259  if (MyProc() == cpu) {
1260  BL_MPI_REQUIRE( MPI_Reduce(MPI_IN_PLACE, r, cnt,
1261  Mpi_typemap<T>::type(), op,
1262  cpu, Communicator()) );
1263  } else {
1264  BL_MPI_REQUIRE( MPI_Reduce(r, r, cnt,
1265  Mpi_typemap<T>::type(), op,
1266  cpu, Communicator()) );
1267  }
1268 }
1269 
1270 }
1271 
1273  template <typename T>
1274  std::enable_if_t<std::is_floating_point_v<T>>
1275  ReduceRealSum (T& rvar) {
1276  detail::DoAllReduce<T>(&rvar,MPI_SUM,1);
1277  }
1278 
1279  template <typename T>
1280  std::enable_if_t<std::is_floating_point_v<T>>
1281  ReduceRealSum (T* rvar, int cnt) {
1282  detail::DoAllReduce<T>(rvar,MPI_SUM,cnt);
1283  }
1284 
1285  template <typename T>
1286  std::enable_if_t<std::is_floating_point_v<T>>
1287  ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar)
1288  {
1289  int cnt = rvar.size();
1290  Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1291  detail::DoAllReduce<T>(tmp.data(),MPI_SUM,cnt);
1292  for (int i = 0; i < cnt; ++i) {
1293  rvar[i].get() = tmp[i];
1294  }
1295  }
1296 
1298  template <typename T>
1299  std::enable_if_t<std::is_floating_point_v<T>>
1300  ReduceRealSum (T& rvar, int cpu) {
1301  detail::DoReduce<T>(&rvar,MPI_SUM,1,cpu);
1302  }
1303 
1304  template <typename T>
1305  std::enable_if_t<std::is_floating_point_v<T>>
1306  ReduceRealSum (T* rvar, int cnt, int cpu) {
1307  detail::DoReduce<T>(rvar,MPI_SUM,cnt,cpu);
1308  }
1309 
1310  template <typename T>
1311  std::enable_if_t<std::is_floating_point_v<T>>
1312  ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1313  {
1314  int cnt = rvar.size();
1315  Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1316  detail::DoReduce<T>(tmp.data(),MPI_SUM,cnt,cpu);
1317  for (int i = 0; i < cnt; ++i) {
1318  rvar[i].get() = tmp[i];
1319  }
1320  }
1321 
1323  template <typename T>
1324  std::enable_if_t<std::is_floating_point_v<T>>
1325  ReduceRealMax (T& rvar) {
1326  detail::DoAllReduce<T>(&rvar,MPI_MAX,1);
1327  }
1328 
1329  template <typename T>
1330  std::enable_if_t<std::is_floating_point_v<T>>
1331  ReduceRealMax (T* rvar, int cnt) {
1332  detail::DoAllReduce<T>(rvar,MPI_MAX,cnt);
1333  }
1334 
1335  template <typename T>
1336  std::enable_if_t<std::is_floating_point_v<T>>
1337  ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar)
1338  {
1339  int cnt = rvar.size();
1340  Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1341  detail::DoAllReduce<T>(tmp.data(),MPI_MAX,cnt);
1342  for (int i = 0; i < cnt; ++i) {
1343  rvar[i].get() = tmp[i];
1344  }
1345  }
1346 
1348  template <typename T>
1349  std::enable_if_t<std::is_floating_point_v<T>>
1350  ReduceRealMax (T& rvar, int cpu) {
1351  detail::DoReduce<T>(&rvar,MPI_MAX,1,cpu);
1352  }
1353 
1354  template <typename T>
1355  std::enable_if_t<std::is_floating_point_v<T>>
1356  ReduceRealMax (T* rvar, int cnt, int cpu) {
1357  detail::DoReduce<T>(rvar,MPI_MAX,cnt,cpu);
1358  }
1359 
1360  template <typename T>
1361  std::enable_if_t<std::is_floating_point_v<T>>
1362  ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1363  {
1364  int cnt = rvar.size();
1365  Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1366  detail::DoReduce<T>(tmp.data(),MPI_MAX,cnt,cpu);
1367  for (int i = 0; i < cnt; ++i) {
1368  rvar[i].get() = tmp[i];
1369  }
1370  }
1371 
1373  template <typename T>
1374  std::enable_if_t<std::is_floating_point_v<T>>
1375  ReduceRealMin (T& rvar) {
1376  detail::DoAllReduce<T>(&rvar,MPI_MIN,1);
1377  }
1378 
1379  template <typename T>
1380  std::enable_if_t<std::is_floating_point_v<T>>
1381  ReduceRealMin (T* rvar, int cnt) {
1382  detail::DoAllReduce<T>(rvar,MPI_MIN,cnt);
1383  }
1384 
1385  template <typename T>
1386  std::enable_if_t<std::is_floating_point_v<T>>
1387  ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar)
1388  {
1389  int cnt = rvar.size();
1390  Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1391  detail::DoAllReduce<T>(tmp.data(),MPI_MIN,cnt);
1392  for (int i = 0; i < cnt; ++i) {
1393  rvar[i].get() = tmp[i];
1394  }
1395  }
1396 
1398  template <typename T>
1399  std::enable_if_t<std::is_floating_point_v<T>>
1400  ReduceRealMin (T& rvar, int cpu) {
1401  detail::DoReduce<T>(&rvar,MPI_MIN,1,cpu);
1402  }
1403 
1404  template <typename T>
1405  std::enable_if_t<std::is_floating_point_v<T>>
1406  ReduceRealMin (T* rvar, int cnt, int cpu) {
1407  detail::DoReduce<T>(rvar,MPI_MIN,cnt,cpu);
1408  }
1409 
1410  template <typename T>
1411  std::enable_if_t<std::is_floating_point_v<T>>
1412  ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1413  {
1414  int cnt = rvar.size();
1415  Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1416  detail::DoReduce<T>(tmp.data(),MPI_MIN,cnt,cpu);
1417  for (int i = 0; i < cnt; ++i) {
1418  rvar[i].get() = tmp[i];
1419  }
1420  }
1421 
1422 #else
1423 
1425  template <typename T>
1426  std::enable_if_t<std::is_floating_point_v<T>>
1427  ReduceRealSum (T& ) {}
1428 
1429  template <typename T>
1430  std::enable_if_t<std::is_floating_point_v<T>>
1431  ReduceRealSum (T*, int) {}
1432 
1433  template <typename T>
1434  std::enable_if_t<std::is_floating_point_v<T>>
1435  ReduceRealSum (Vector<std::reference_wrapper<T> > const&) {}
1436 
1438  template <typename T>
1439  std::enable_if_t<std::is_floating_point_v<T>>
1440  ReduceRealSum (T&, int) {}
1441 
1442  template <typename T>
1443  std::enable_if_t<std::is_floating_point_v<T>>
1444  ReduceRealSum (T*, int, int) {}
1445 
1446  template <typename T>
1447  std::enable_if_t<std::is_floating_point_v<T>>
1448  ReduceRealSum (Vector<std::reference_wrapper<T> > const&, int) {}
1449 
1451  template <typename T>
1452  std::enable_if_t<std::is_floating_point_v<T>>
1453  ReduceRealMax (T&) {}
1454 
1455  template <typename T>
1456  std::enable_if_t<std::is_floating_point_v<T>>
1457  ReduceRealMax (T*, int) {}
1458 
1459  template <typename T>
1460  std::enable_if_t<std::is_floating_point_v<T>>
1461  ReduceRealMax (Vector<std::reference_wrapper<T> > const&) {}
1462 
1464  template <typename T>
1465  std::enable_if_t<std::is_floating_point_v<T>>
1466  ReduceRealMax (T&, int) {}
1467 
1468  template <typename T>
1469  std::enable_if_t<std::is_floating_point_v<T>>
1470  ReduceRealMax (T*, int, int) {}
1471 
1472  template <typename T>
1473  std::enable_if_t<std::is_floating_point_v<T>>
1474  ReduceRealMax (Vector<std::reference_wrapper<T> > const&, int) {}
1475 
1477  template <typename T>
1478  std::enable_if_t<std::is_floating_point_v<T>>
1479  ReduceRealMin (T&) {}
1480 
1481  template <typename T>
1482  std::enable_if_t<std::is_floating_point_v<T>>
1483  ReduceRealMin (T*, int) {}
1484 
1485  template <typename T>
1486  std::enable_if_t<std::is_floating_point_v<T>>
1487  ReduceRealMin (Vector<std::reference_wrapper<T> > const&) {}
1488 
1490  template <typename T>
1491  std::enable_if_t<std::is_floating_point_v<T>>
1492  ReduceRealMin (T&, int) {}
1493 
1494  template <typename T>
1495  std::enable_if_t<std::is_floating_point_v<T>>
1496  ReduceRealMin (T*, int, int) {}
1497 
1498  template <typename T>
1499  std::enable_if_t<std::is_floating_point_v<T>>
1500  ReduceRealMin (Vector<std::reference_wrapper<T> > const&, int) {}
1501 
1502 #endif
1503 }
1504 
1505 #ifdef AMREX_USE_MPI
1506 namespace ParallelDescriptor {
1507 
1508 template <class T>
1509 struct Mpi_typemap<GpuComplex<T>>
1510 {
1511  static MPI_Datatype type ()
1512  {
1513  static_assert(std::is_same<T,double>() ||
1514  std::is_same<T,float >(),
1515  "Unsupported type T for GpuComplex");
1516  if constexpr (std::is_same<T,double>()) {
1517  return MPI_C_DOUBLE_COMPLEX;
1518  } else {
1519  return MPI_C_FLOAT_COMPLEX;
1520  }
1521  }
1522 };
1523 
1524 template<typename TV, typename TI>
1525 struct Mpi_typemap<ValLocPair<TV,TI>>
1526 {
1527  static MPI_Datatype type ()
1528  {
1529  static MPI_Datatype mpi_type = MPI_DATATYPE_NULL;
1530  if (mpi_type == MPI_DATATYPE_NULL) {
1531  using T = ValLocPair<TV,TI>;
1532  static_assert(std::is_trivially_copyable_v<T>,
1533  "To communicate with MPI, ValLocPair must be trivially copyable.");
1534  static_assert(std::is_standard_layout_v<T>,
1535  "To communicate with MPI, ValLocPair must be standard layout");
1536 
1537  T vlp[2];
1538  MPI_Datatype types[] = {
1539  Mpi_typemap<TV>::type(),
1540  Mpi_typemap<TI>::type()
1541  };
1542  int blocklens[] = { 1, 1 };
1543  MPI_Aint disp[2];
1544  BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].value, &disp[0]) );
1545  BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].index, &disp[1]) );
1546  disp[1] -= disp[0];
1547  disp[0] = 0;
1548  BL_MPI_REQUIRE( MPI_Type_create_struct(2, blocklens, disp, types,
1549  &mpi_type) );
1550  MPI_Aint lb, extent;
1551  BL_MPI_REQUIRE( MPI_Type_get_extent(mpi_type, &lb, &extent) );
1552  if (extent != sizeof(T)) {
1553  MPI_Datatype tmp = mpi_type;
1554  BL_MPI_REQUIRE( MPI_Type_create_resized(tmp, 0, sizeof(vlp[0]), &mpi_type) );
1555  BL_MPI_REQUIRE( MPI_Type_free(&tmp) );
1556  }
1557  BL_MPI_REQUIRE( MPI_Type_commit( &mpi_type ) );
1558 
1559  m_mpi_types.push_back(&mpi_type);
1560  }
1561  return mpi_type;
1562  }
1563 };
1564 
1565 template <typename T, typename F>
1566 MPI_Op Mpi_op ()
1567 {
1568  static MPI_Op mpi_op = MPI_OP_NULL;
1569  if (mpi_op == MPI_OP_NULL) {
1570  static auto user_fn = [] (void *invec, void *inoutvec, int* len, // NOLINT
1571  MPI_Datatype * /*datatype*/)
1572  {
1573  auto in = static_cast<T const*>(invec);
1574  auto out = static_cast<T*>(inoutvec);
1575  for (int i = 0; i < *len; ++i) {
1576  out[i] = F()(in[i],out[i]);
1577  }
1578  };
1579  BL_MPI_REQUIRE( MPI_Op_create(user_fn, 1, &mpi_op) );
1580  m_mpi_ops.push_back(&mpi_op);
1581  }
1582  return mpi_op;
1583 }
1584 
1585 }
1586 #endif
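A hedged, editorial sketch of what these traits provide when MPI is enabled: Mpi_typemap<T>::type() yields an MPI datatype usable in raw MPI calls, and Mpi_op<T,F>() builds (and caches) a user-defined MPI_Op from a binary functor. MinByValue below is a hypothetical functor introduced only for this sketch.

#include <AMReX_ParallelDescriptor.H>
#include <AMReX_ValLocPair.H>

void min_by_value_example ()
{
#ifdef BL_USE_MPI
    using namespace amrex;
    using VLP = ValLocPair<Real,int>;

    struct MinByValue {                       // hypothetical: keep the pair with the smaller value
        VLP operator() (VLP const& a, VLP const& b) const {
            return (a.value < b.value) ? a : b;
        }
    };

    MPI_Datatype vlp_type = ParallelDescriptor::Mpi_typemap<VLP>::type();
    MPI_Op       min_op   = ParallelDescriptor::Mpi_op<VLP,MinByValue>();

    VLP local;
    local.value = Real(ParallelDescriptor::MyProc());   // illustrative data
    local.index = ParallelDescriptor::MyProc();
    VLP global  = local;
    BL_MPI_REQUIRE( MPI_Allreduce(&local, &global, 1, vlp_type, min_op,
                                  ParallelDescriptor::Communicator()) );
#endif
}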
1587 
1588 }
1589 
1590 #endif /*BL_PARALLELDESCRIPTOR_H*/