Block-Structured AMR Software Framework
 
AMReX_ParallelDescriptor.H
1#ifndef BL_PARALLELDESCRIPTOR_H
2#define BL_PARALLELDESCRIPTOR_H
3#include <AMReX_Config.H>
4
5#include <AMReX_ccse-mpi.H>
7#include <AMReX_BLBackTrace.H>
8#include <AMReX_BLProfiler.H>
9#include <AMReX_BLassert.H>
10#include <AMReX_Extension.H>
11#include <AMReX_INT.H>
12#include <AMReX_REAL.H>
13#include <AMReX_GpuComplex.H>
14#include <AMReX_Array.H>
15#include <AMReX_Vector.H>
16#include <AMReX_ValLocPair.H>
17
18#ifndef BL_AMRPROF
19#include <AMReX_Box.H>
20#endif
21
22#ifdef BL_LAZY
23#include <AMReX_Lazy.H>
24#endif
25
26#ifdef AMREX_PMI
27#include <pmi.h>
28#endif
29
30#ifdef AMREX_USE_OMP
31#include <omp.h>
32#endif
33
34#include <algorithm>
35#include <atomic>
36#include <csignal>
37#include <functional>
38#include <limits>
39#include <numeric>
40#include <string>
41#include <typeinfo>
42#include <vector>
43
44namespace amrex {
45
46template <typename T> class LayoutData;
47
49namespace ParallelDescriptor
50{
51 //
52 //! Used as default argument to ParallelDescriptor::Barrier().
53 const std::string Unnamed("Unnamed");
54
55 //! Hold the description and status of communication data.
56 class Message
57 {
58 public:
59
60 Message () = default;
61 Message (MPI_Request req_, MPI_Datatype type_) :
62 m_finished(false),
63 m_type(type_),
64 m_req(req_) {}
65 Message (MPI_Status stat_, MPI_Datatype type_) :
66 m_type(type_),
67 m_stat(stat_) {}
68 void wait ();
69 bool test ();
70 size_t count () const;
71 int tag () const;
72 int pid () const;
73 MPI_Datatype type () const { return m_type; }
74 MPI_Request req () const { return m_req; }
75 MPI_Status stat () const { return m_stat; }
76
77 private:
78
79 bool m_finished = true;
80 MPI_Datatype m_type = MPI_DATATYPE_NULL;
81 MPI_Request m_req = MPI_REQUEST_NULL;
82 mutable MPI_Status m_stat{};
83 };
84
85#ifdef BL_USE_MPI
86 void MPI_Error(const char* file, int line, const char* str, int rc);
87
88#define BL_MPI_REQUIRE(x) \
89do \
90{ \
91 if ( int l_status_ = (x) ) \
92 { \
93 amrex::ParallelDescriptor::MPI_Error(__FILE__,__LINE__,#x, l_status_); \
94 } \
95} \
96while ( false )
97#endif
98
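When AMReX is built with MPI, BL_MPI_REQUIRE wraps an MPI call and forwards any non-zero return code to amrex::ParallelDescriptor::MPI_Error together with the file, line, and stringized expression. A minimal usage sketch for a raw MPI call that is not already wrapped by this header (buffer names are illustrative):

    // Sketch: check the return code of a hand-written MPI call.
    int nlocal = 1, nglobal = 0;
    BL_MPI_REQUIRE( MPI_Allreduce(&nlocal, &nglobal, 1, MPI_INT, MPI_SUM,
                                  amrex::ParallelDescriptor::Communicator()) );
    // A failing call aborts via MPI_Error(__FILE__, __LINE__, "MPI_Allreduce(...)", rc).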
103 void StartParallel (int* argc = nullptr,
104 char*** argv = nullptr,
105 MPI_Comm mpi_comm = MPI_COMM_WORLD);
106
107 void Initialize ();
108 void Finalize ();
109
111 extern AMREX_EXPORT bool use_gpu_aware_mpi;
113 inline bool UseGpuAwareMpi () { return use_gpu_aware_mpi; }
114
116 void StartTeams ();
117 void EndTeams ();
118
123 void EndParallel ();
124
127 inline int
128 MyProc () noexcept
129 {
130 return ParallelContext::MyProcAll();
131 }
133 inline int
134 MyProc (MPI_Comm comm) noexcept
135 {
136#ifdef BL_USE_MPI
137 int r;
138 MPI_Comm_rank(comm,&r);
139 return r;
140#else
141 amrex::ignore_unused(comm);
142 return 0;
143#endif
144 }
145
146 //! Provide functionalities needed to construct a team of processes to perform a particular job.
147 struct ProcessTeam
148 {
149 using team_t = MPI_Comm;
150
152 void Barrier () const {
153 if (m_size > 1) {
154#if defined(BL_USE_MPI3)
155 MPI_Barrier(m_team_comm);
156#endif
157 }
158 }
159
161 void MemoryBarrier () const {
162 if (m_size > 1) {
163#ifdef AMREX_USE_OMP
164 if (omp_in_parallel()) {
165 #pragma omp barrier
166 }
167 #pragma omp master
168#endif
169 {
170#if defined(BL_USE_MPI3)
171 std::atomic_thread_fence(std::memory_order_release);
172 MPI_Barrier(m_team_comm);
173 std::atomic_thread_fence(std::memory_order_acquire);
174#endif
175 }
176 }
177 }
178
180 void clear () {
181#if defined(BL_USE_MPI3)
182 if (m_size > 1) {
183 MPI_Comm_free(&m_team_comm);
184 if (m_rankInTeam==0) { MPI_Comm_free(&m_lead_comm); }
185 }
186#endif
187 }
188
189 [[nodiscard]] const team_t& get() const {
190 return m_team_comm;
191 }
193 [[nodiscard]] const MPI_Comm& get_team_comm() const { return m_team_comm; }
194 [[nodiscard]] const MPI_Comm& get_lead_comm() const { return m_lead_comm; }
195
197 int m_numTeams;
198 int m_size;
199 int m_color;
200 int m_lead;
201 int m_rankInTeam;
202 int m_do_team_reduce;
203
204 MPI_Comm m_team_comm;
205 MPI_Comm m_lead_comm;
207 };
208
210 extern AMREX_EXPORT ProcessTeam m_Team;
211 extern AMREX_EXPORT int m_MinTag, m_MaxTag;
212 extern AMREX_EXPORT MPI_Comm m_comm;
213 extern AMREX_EXPORT int m_nprocs_per_node;
214 extern AMREX_EXPORT int m_rank_in_node;
215 extern AMREX_EXPORT int m_nprocs_per_processor;
216 extern AMREX_EXPORT int m_rank_in_processor;
217 extern AMREX_EXPORT const int ioProcessor;
219
220 inline int MinTag () noexcept { return m_MinTag; }
221 inline int MaxTag () noexcept { return m_MaxTag; }
222
223 inline MPI_Comm Communicator () noexcept { return m_comm; }
224
228 inline int NProcsPerNode () noexcept { return m_nprocs_per_node; }
229
233 inline int MyRankInNode () noexcept { return m_rank_in_node; }
234
238 inline int NProcsPerProcessor () noexcept { return m_nprocs_per_processor; }
239
243 inline int MyRankInProcessor () noexcept { return m_rank_in_processor; }
244
245#ifdef AMREX_USE_MPI
247 extern AMREX_EXPORT Vector<MPI_Datatype*> m_mpi_types;
248 extern AMREX_EXPORT Vector<MPI_Op*> m_mpi_ops;
250#endif
251
254 inline int
255 NProcs () noexcept
256 {
257 return ParallelContext::NProcsAll();
258 }
259
261 inline int
262 NProcs (MPI_Comm comm) noexcept
263 {
264#ifdef BL_USE_MPI
265 int s;
266 BL_MPI_REQUIRE(MPI_Comm_size(comm, &s));
267 return s;
268#else
269 amrex::ignore_unused(comm);
270 return 1;
271#endif
272 }
278 inline int
279 IOProcessorNumber () noexcept
280 {
281 return ioProcessor;
282 }
288 inline bool
289 IOProcessor () noexcept
290 {
291 return MyProc() == IOProcessorNumber();
292 }
293
295 inline int
296 IOProcessorNumber (MPI_Comm comm) noexcept
297 {
298 return (comm == ParallelDescriptor::Communicator()) ? ioProcessor : 0;
299 }
300
302 inline bool
303 IOProcessor (MPI_Comm comm) noexcept
304 {
305 return MyProc(comm) == IOProcessorNumber(comm);
306 }
307
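IOProcessor() and IOProcessorNumber() are the usual way to restrict console or file output to one rank and to choose the root of a rooted reduction. A short sketch, assuming <iostream> is available (the printed text is illustrative):

    // Sketch: rank-0-only output plus a reduction rooted on the I/O rank.
    if (amrex::ParallelDescriptor::IOProcessor()) {
        std::cout << "Running on " << amrex::ParallelDescriptor::NProcs()
                  << " MPI ranks\n";
    }
    double t = amrex::ParallelDescriptor::second();
    // Only the I/O rank ends up holding the global maximum of t.
    amrex::ParallelDescriptor::ReduceRealMax(t,
        amrex::ParallelDescriptor::IOProcessorNumber());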
308 //
309 inline int
310 TeamSize () noexcept
311 {
312 return m_Team.m_size;
313 }
314 inline int
315 NTeams () noexcept
316 {
317 return m_Team.m_numTeams;
318 }
319 inline int
320 MyTeamColor () noexcept
321 {
322 return m_Team.m_color;
323 }
324 inline int
325 MyTeamLead () noexcept
326 {
327 return m_Team.m_lead;
328 }
329 inline int
330 MyRankInTeam () noexcept
331 {
332 return m_Team.m_rankInTeam;
333 }
334 inline int
335 TeamLead (int rank) noexcept
336 {
337 return (rank >= 0) ? (rank - rank % m_Team.m_size) : MPI_PROC_NULL;
338 }
339 inline bool
340 isTeamLead () noexcept
341 {
342 return MyRankInTeam() == 0;
343 }
344 inline bool
345 sameTeam (int rank) noexcept
346 {
347 return MyTeamLead() == TeamLead(rank);
348 }
349 inline bool
350 sameTeam (int rankA, int rankB) noexcept
351 {
352 return TeamLead(rankA) == TeamLead(rankB);
353 }
354 inline int
355 RankInLeadComm (int rank) noexcept
356 {
357 return (rank >= 0) ? (rank / m_Team.m_size) : MPI_PROC_NULL;
358 }
359 inline bool
360 doTeamReduce () noexcept
361 {
362 return m_Team.m_do_team_reduce;
363 }
364 inline const ProcessTeam&
365 MyTeam () noexcept
366 {
367 return m_Team;
368 }
369 inline std::pair<int,int>
370 team_range (int begin, int end, int rit = -1, int nworkers = 0) noexcept
371 {
372 int rb, re;
373 {
374 if (rit < 0) { rit = ParallelDescriptor::MyRankInTeam(); }
375 if (nworkers == 0) { nworkers = ParallelDescriptor::TeamSize(); }
376 BL_ASSERT(rit<nworkers);
377 if (nworkers == 1) {
378 rb = begin;
379 re = end;
380 } else {
381 int ntot = end - begin;
382 int nr = ntot / nworkers;
383 int nlft = ntot - nr * nworkers;
384 if (rit < nlft) { // get nr+1 items
385 rb = begin + rit * (nr + 1);
386 re = rb + nr + 1;
387 } else { // get nr items
388 rb = begin + rit * nr + nlft;
389 re = rb + nr;
390 }
391 }
392 }
393
394#ifdef AMREX_USE_OMP
395 int nthreads = omp_get_num_threads();
396 if (nthreads > 1) {
397 int tid = omp_get_thread_num();
398 int ntot = re - rb;
399 int nr = ntot / nthreads;
400 int nlft = ntot - nr * nthreads;
401 if (tid < nlft) { // get nr+1 items
402 rb += tid * (nr + 1);
403 re = rb + nr + 1;
404 } else { // get nr items
405 rb += tid * nr + nlft;
406 re = rb + nr;
407 }
408 }
409#endif
410
411 return std::make_pair(rb,re);
412 }
413 template <typename F>
414 void team_for (int begin, int end, const F& f)
415 {
416 const auto& range = team_range(begin, end);
417 for (int i = range.first; i < range.second; ++i) {
418 f(i);
419 }
420 }
421 template <typename F> // rit: rank in team
422 void team_for (int begin, int end, int rit, const F& f)
423 {
424 const auto& range = team_range(begin, end, rit);
425 for (int i = range.first; i < range.second; ++i) {
426 f(i);
427 }
428 }
429 template <typename F> // rit: rank in team
430 void team_for (int begin, int end, int rit, int nworkers, const F& f)
431 {
432 const auto& range = team_range(begin, end, rit, nworkers);
433 for (int i = range.first; i < range.second; ++i) {
434 f(i);
435 }
436 }
437
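team_range() carves [begin,end) into nearly equal pieces, one per team worker (and, inside an OpenMP parallel region, further per thread), and team_for() simply loops the callable over the piece owned by the caller. A hedged sketch of dividing a loop this way (the vector name is illustrative):

    // Sketch: each member of my team fills only its own slice of [0, n).
    int n = 1000;
    std::vector<double> data(n, 0.0);
    amrex::ParallelDescriptor::team_for(0, n, [&] (int i) {
        data[i] = 2.0 * i;
    });
    // Equivalent by hand via team_range():
    auto r = amrex::ParallelDescriptor::team_range(0, n);
    for (int i = r.first; i < r.second; ++i) { data[i] = 2.0 * i; }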
439 void Barrier (const std::string& message = Unnamed);
441 void Barrier (const MPI_Comm &comm, const std::string& message = Unnamed);
442 Message Abarrier ();
443 Message Abarrier (const MPI_Comm &comm);
444
445 void Test (MPI_Request& request, int& flag, MPI_Status& status);
446 void Test (Vector<MPI_Request>& request, int& flag, Vector<MPI_Status>& status);
447
448 void Comm_dup (MPI_Comm comm, MPI_Comm& newcomm);
450 void Abort (int errorcode = SIGABRT, bool backtrace = true);
452 const char* ErrorString (int errorcode);
454 double second () noexcept;
455
458 void ReduceBoolAnd (bool& rvar);
461 void ReduceBoolAnd (bool& rvar, int cpu);
462
465 void ReduceBoolOr (bool& rvar);
468 void ReduceBoolOr (bool& rvar, int cpu);
469
472 template <typename T>
473 std::enable_if_t<std::is_floating_point_v<T>>
474 ReduceRealSum (T& rvar);
475
477 template <typename T>
478 std::enable_if_t<std::is_floating_point_v<T>>
479 ReduceRealSum (T* rvar, int cnt);
480
481 // Having this for backward compatibility
482 void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar);
483 //
485 template <typename T>
486 std::enable_if_t<std::is_floating_point_v<T>>
487 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar);
488
491 template <typename T>
492 std::enable_if_t<std::is_floating_point_v<T>>
493 ReduceRealSum (T& rvar, int cpu);
494
496 template <typename T>
497 std::enable_if_t<std::is_floating_point_v<T>>
498 ReduceRealSum (T* rvar, int cnt, int cpu);
499
500 // Having this for backward compatibility
501 void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
502 //
504 template <typename T>
505 std::enable_if_t<std::is_floating_point_v<T>>
506 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
507
510 template <typename T>
511 std::enable_if_t<std::is_floating_point_v<T>>
512 ReduceRealMax (T& rvar);
513
515 template <typename T>
516 std::enable_if_t<std::is_floating_point_v<T>>
517 ReduceRealMax (T* rvar, int cnt);
518
519 // Having this for backward compatibility
520 void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar);
521 //
523 template <typename T>
524 std::enable_if_t<std::is_floating_point_v<T>>
525 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar);
526
529 template <typename T>
530 std::enable_if_t<std::is_floating_point_v<T>>
531 ReduceRealMax (T& rvar, int cpu);
532
534 template <typename T>
535 std::enable_if_t<std::is_floating_point_v<T>>
536 ReduceRealMax (T* rvar, int cnt, int cpu);
537
538 // Having this for backward compatibility
539 void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
540 //
542 template <typename T>
543 std::enable_if_t<std::is_floating_point_v<T>>
544 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
545
548 template <typename T>
549 std::enable_if_t<std::is_floating_point_v<T>>
550 ReduceRealMin (T& rvar);
551
553 template <typename T>
554 std::enable_if_t<std::is_floating_point_v<T>>
555 ReduceRealMin (T* rvar, int cnt);
556
557 // Having this for backward compatibility
558 void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar);
559 //
561 template <typename T>
562 std::enable_if_t<std::is_floating_point_v<T>>
563 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar);
564
567 template <typename T>
568 std::enable_if_t<std::is_floating_point_v<T>>
569 ReduceRealMin (T& rvar, int cpu);
570
572 template <typename T>
573 std::enable_if_t<std::is_floating_point_v<T>>
574 ReduceRealMin (T* rvar, int cnt, int cpu);
575
576 // Having this for backward compatibility
577 void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
578 //
580 template <typename T>
581 std::enable_if_t<std::is_floating_point_v<T>>
582 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
583
586 void ReduceIntSum (int& rvar);
588 void ReduceIntSum (int* rvar, int cnt);
590 void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar);
593 void ReduceIntSum (int& rvar, int cpu);
595 void ReduceIntSum (int* rvar, int cnt, int cpu);
597 void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
598
601 void ReduceIntMax (int& rvar);
603 void ReduceIntMax (int* rvar, int cnt);
605 void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar);
608 void ReduceIntMax (int& rvar, int cpu);
610 void ReduceIntMax (int* rvar, int cnt, int cpu);
612 void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
613
616 void ReduceIntMin (int& rvar);
618 void ReduceIntMin (int* rvar, int cnt);
620 void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar);
623 void ReduceIntMin (int& rvar, int cpu);
625 void ReduceIntMin (int* rvar, int cnt, int cpu);
627 void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
628
631 void ReduceLongSum (Long& rvar);
633 void ReduceLongSum (Long* rvar, int cnt);
635 void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar);
638 void ReduceLongSum (Long& rvar, int cpu);
640 void ReduceLongSum (Long* rvar, int cnt, int cpu);
642 void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
643
646 void ReduceLongMax (Long& rvar);
648 void ReduceLongMax (Long* rvar, int cnt);
650 void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar);
653 void ReduceLongMax (Long& rvar, int cpu);
655 void ReduceLongMax (Long* rvar, int cnt, int cpu);
657 void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
658
661 void ReduceLongMin (Long& rvar);
663 void ReduceLongMin (Long* rvar, int cnt);
665 void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar);
668 void ReduceLongMin (Long& rvar, int cpu);
670 void ReduceLongMin (Long* rvar, int cnt, int cpu);
672 void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
673
676 void ReduceLongAnd (Long& rvar);
678 void ReduceLongAnd (Long* rvar, int cnt);
680 void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar);
683 void ReduceLongAnd (Long& rvar, int cpu);
685 void ReduceLongAnd (Long* rvar, int cnt, int cpu);
687 void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
688
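The Reduce* family performs in-place reductions over Communicator(): without a cpu argument every rank receives the result (an all-reduce); with a cpu argument only that rank does. A short sketch with illustrative variable names:

    // Sketch: in-place global reductions.
    amrex::Real local_sum = 1.0;                         // per-rank contribution
    amrex::ParallelDescriptor::ReduceRealSum(local_sum); // now the global sum on every rank

    amrex::Long ncells = 42;
    amrex::ParallelDescriptor::ReduceLongMax(ncells,
        amrex::ParallelDescriptor::IOProcessorNumber()); // global max on the I/O rank only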
691 void Gather (Real const* sendbuf, int nsend, Real* recvbuf, int root);
696 inline int SeqNum () noexcept { return ParallelContext::get_inc_mpi_tag(); }
697
698 template <class T> [[nodiscard]] Message Asend(const T*, size_t n, int pid, int tag);
699 template <class T> [[nodiscard]] Message Asend(const T*, size_t n, int pid, int tag, MPI_Comm comm);
700 template <class T> [[nodiscard]] Message Asend(const std::vector<T>& buf, int pid, int tag);
701
702 template <class T> Message Arecv(T*, size_t n, int pid, int tag);
703 template <class T> Message Arecv(T*, size_t n, int pid, int tag, MPI_Comm comm);
704 template <class T> Message Arecv(std::vector<T>& buf, int pid, int tag);
705
706 template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag);
707 template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
708 template <class T> Message Send(const std::vector<T>& buf, int dst_pid, int tag);
709
710 template <class T> Message Recv(T*, size_t n, int pid, int tag);
711 template <class T> Message Recv(T*, size_t n, int pid, int tag, MPI_Comm comm);
712 template <class T> Message Recv(std::vector<T>& buf, int pid, int tag);
713
714 template <class T> void Bcast(T*, size_t n, int root = 0);
715 template <class T> void Bcast(T*, size_t n, int root, const MPI_Comm &comm);
716 void Bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm);
717
718 template <class T, class T1> void Scatter(T*, size_t n, const T1*, size_t n1, int root);
719
720 template <class T, class T1> void Gather(const T*, size_t n, T1*, size_t n1, int root);
721 template <class T> std::vector<T> Gather(const T&, int root);
722
723 template <class T> void Gatherv (const T* send, int sc,
724 T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
725 int root);
726
728 template <class T> void GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
729 Vector<T>& recvbuf,
730 int root);
731
732 void Wait (MPI_Request& req, MPI_Status& status);
733 void Waitall (Vector<MPI_Request>& reqs, Vector<MPI_Status>& status);
734 void Waitany (Vector<MPI_Request>& reqs, int &index, MPI_Status& status);
736
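Asend/Arecv post nonblocking MPI operations and return a Message that carries the MPI_Request; SeqNum() supplies a fresh tag, and completion goes through Message::wait()/test() or Wait/Waitall/Waitany on raw requests. A hedged sketch of a simple exchange between two ranks (buffer sizes and the partner rank are illustrative, and at least two ranks are assumed):

    // Sketch: nonblocking exchange with a partner rank.
    int me    = amrex::ParallelDescriptor::MyProc();
    int other = (me == 0) ? 1 : 0;
    int tag   = amrex::ParallelDescriptor::SeqNum();
    std::vector<amrex::Real> sendbuf(8, amrex::Real(me));
    std::vector<amrex::Real> recvbuf(8);

    auto rreq = amrex::ParallelDescriptor::Arecv(recvbuf.data(), recvbuf.size(), other, tag);
    auto sreq = amrex::ParallelDescriptor::Asend(sendbuf.data(), sendbuf.size(), other, tag);
    rreq.wait();   // blocks until the receive has completed
    sreq.wait();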
738 void ReadAndBcastFile(const std::string &filename, Vector<char> &charBuf,
739 bool bExitOnError = true,
740 const MPI_Comm &comm = Communicator() );
741 void IProbe(int src_pid, int tag, int &mflag, MPI_Status &status);
742 void IProbe(int src_pid, int tag, MPI_Comm comm, int &mflag, MPI_Status &status);
743
749 std::string
750 mpi_level_to_string (int mtlev);
751
752 // PMI = Process Management Interface, available on Crays. Provides API to
753 // query topology of the job.
754#ifdef AMREX_PMI
755 void PMI_Initialize();
756 void PMI_PrintMeshcoords(const pmi_mesh_coord_t *pmi_mesh_coord);
757#endif
758
759#ifdef BL_USE_MPI
760 int select_comm_data_type (std::size_t nbytes);
761 std::size_t sizeof_selected_comm_data_type (std::size_t nbytes);
762#endif
763}
764}
765
766namespace amrex {
767
768#ifdef BL_USE_MPI
769
770template <class T>
771ParallelDescriptor::Message
772ParallelDescriptor::Asend (const T* buf, size_t n, int dst_pid, int tag)
773{
774 return Asend(buf, n, dst_pid, tag, Communicator());
775}
776
777namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
778template <>
779Message
780Asend<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
781
782template <class T>
783Message
784Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
785{
786 static_assert(!std::is_same_v<char,T>, "Asend: char version has been specialized");
787
788 BL_PROFILE_T_S("ParallelDescriptor::Asend(TsiiM)", T);
789 BL_COMM_PROFILE(BLProfiler::AsendTsiiM, n * sizeof(T), dst_pid, tag);
790
791 MPI_Request req;
792 BL_MPI_REQUIRE( MPI_Isend(const_cast<T*>(buf),
793 n,
794 Mpi_typemap<T>::type(),
795 dst_pid,
796 tag,
797 comm,
798 &req) );
799 BL_COMM_PROFILE(BLProfiler::AsendTsiiM, BLProfiler::AfterCall(), dst_pid, tag);
800 return Message(req, Mpi_typemap<T>::type());
801}
802}
803
804template <class T>
805ParallelDescriptor::Message
806ParallelDescriptor::Asend (const std::vector<T>& buf, int dst_pid, int tag)
807{
808 return Asend(buf.data(), buf.size(), dst_pid, tag, Communicator());
809}
810
811template <class T>
812ParallelDescriptor::Message
813ParallelDescriptor::Send (const T* buf, size_t n, int dst_pid, int tag)
814{
815 return Send(buf, n, dst_pid, tag, Communicator());
816}
817
818namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
819template <>
820Message
821Send<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
822
823template <class T>
824Message
825Send (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
826{
827 static_assert(!std::is_same_v<char,T>, "Send: char version has been specialized");
828
829 BL_PROFILE_T_S("ParallelDescriptor::Send(Tsii)", T);
830
831#ifdef BL_COMM_PROFILING
832 int dst_pid_world(-1);
833 MPI_Group groupComm, groupWorld;
834 BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
835 BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
836 BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &dst_pid, groupWorld, &dst_pid_world) );
837
838 BL_COMM_PROFILE(BLProfiler::SendTsii, n * sizeof(T), dst_pid_world, tag);
839#endif
840
841 BL_MPI_REQUIRE( MPI_Send(const_cast<T*>(buf),
842 n,
843 Mpi_typemap<T>::type(),
844 dst_pid,
845 tag,
846 comm) );
847 BL_COMM_PROFILE(BLProfiler::SendTsii, BLProfiler::AfterCall(), dst_pid, tag);
848 return Message();
849}
850}
851
852template <class T>
853ParallelDescriptor::Message
854ParallelDescriptor::Send (const std::vector<T>& buf, int dst_pid, int tag)
855{
856 return Send(buf.data(), buf.size(), dst_pid, tag, Communicator());
857}
858
859template <class T>
860ParallelDescriptor::Message
861ParallelDescriptor::Arecv (T* buf, size_t n, int src_pid, int tag)
862{
863 return Arecv(buf, n, src_pid, tag, Communicator());
864}
865
866namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
867template <>
868Message
869Arecv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
870
871template <class T>
872Message
873Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
874{
875 static_assert(!std::is_same_v<char,T>, "Arecv: char version has been specialized");
876
877 BL_PROFILE_T_S("ParallelDescriptor::Arecv(TsiiM)", T);
878 BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, n * sizeof(T), src_pid, tag);
879
880 MPI_Request req;
881 BL_MPI_REQUIRE( MPI_Irecv(buf,
882 n,
883 Mpi_typemap<T>::type(),
884 src_pid,
885 tag,
886 comm,
887 &req) );
888 BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, BLProfiler::AfterCall(), src_pid, tag);
889 return Message(req, Mpi_typemap<T>::type());
890}
891}
892
893template <class T>
894ParallelDescriptor::Message
895ParallelDescriptor::Arecv (std::vector<T>& buf, int src_pid, int tag)
896{
897 return Arecv(buf.data(), buf.size(), src_pid, tag, Communicator());
898}
899
900template <class T>
901ParallelDescriptor::Message
902ParallelDescriptor::Recv (T* buf, size_t n, int src_pid, int tag)
903{
904 return Recv(buf, n, src_pid, tag, Communicator());
905}
906
907namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
908template <>
909Message
910Recv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
911
912template <class T>
913Message
914Recv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
915{
916 static_assert(!std::is_same_v<char,T>, "Recv: char version has been specialized");
917
918 BL_PROFILE_T_S("ParallelDescriptor::Recv(Tsii)", T);
919 BL_COMM_PROFILE(BLProfiler::RecvTsii, BLProfiler::BeforeCall(), src_pid, tag);
920
921 MPI_Status stat;
922 BL_MPI_REQUIRE( MPI_Recv(buf,
923 n,
924 Mpi_typemap<T>::type(),
925 src_pid,
926 tag,
927 comm,
928 &stat) );
929#ifdef BL_COMM_PROFILING
930 int src_pid_comm(stat.MPI_SOURCE);
931 int src_pid_world(stat.MPI_SOURCE);
932 if(src_pid_comm != MPI_ANY_SOURCE) {
933 MPI_Group groupComm, groupWorld;
934 BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
935 BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
936 BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &src_pid_comm, groupWorld, &src_pid_world) );
937 }
938
939 BL_COMM_PROFILE(BLProfiler::RecvTsii, n * sizeof(T), src_pid_world, stat.MPI_TAG);
940#endif
941 return Message(stat, Mpi_typemap<T>::type());
942}
943}
944
945template <class T>
946ParallelDescriptor::Message
947ParallelDescriptor::Recv (std::vector<T>& buf, int src_pid, int tag)
948{
949 return Recv(buf.data(), buf.size(), src_pid, tag, Communicator());
950}
951
952template <class T>
953void
954ParallelDescriptor::Bcast (T* t,
955 size_t n,
956 int root)
957{
958#ifdef BL_LAZY
959 Lazy::EvalReduction();
960#endif
961
962 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
963
964 BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
965 BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
966
967 BL_MPI_REQUIRE( MPI_Bcast(t,
968 n,
969 Mpi_typemap<T>::type(),
970 root,
971 Communicator()) );
972 BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
973}
974
975template <class T>
976void
977ParallelDescriptor::Bcast (T* t,
978 size_t n,
979 int root,
980 const MPI_Comm &comm)
981{
982#ifdef BL_LAZY
983 int r;
984 MPI_Comm_compare(comm, Communicator(), &r);
985 if (r == MPI_IDENT) { Lazy::EvalReduction(); }
986#endif
987
988 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
989
990 BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
991 BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
992
993 BL_MPI_REQUIRE( MPI_Bcast(t,
994 n,
995 Mpi_typemap<T>::type(),
996 root,
997 comm) );
998 BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
999}
1000
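Bcast replicates a buffer from root onto every rank of the communicator; a common pattern is to produce a value on the I/O rank only and then broadcast it (ReadAndBcastFile does the same for whole files). A small sketch with an illustrative parameter:

    // Sketch: decide a value on the I/O rank, then broadcast it to all ranks.
    int nsteps = 0;
    if (amrex::ParallelDescriptor::IOProcessor()) {
        nsteps = 10;   // e.g. parsed from an inputs file on the I/O rank only
    }
    amrex::ParallelDescriptor::Bcast(&nsteps, 1,
        amrex::ParallelDescriptor::IOProcessorNumber());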
1001template <class T, class T1>
1002void
1003ParallelDescriptor::Gather (const T* t,
1004 size_t n,
1005 T1* t1,
1006 size_t n1,
1007 int root)
1008{
1009 BL_PROFILE_T_S("ParallelDescriptor::Gather(TsT1si)", T);
1010 BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1011
1012 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
1013 BL_ASSERT(n1 < static_cast<size_t>(std::numeric_limits<int>::max()));
1014
1015 BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(t),
1016 n,
1017 Mpi_typemap<T>::type(),
1018 t1,
1019 n1,
1020 Mpi_typemap<T1>::type(),
1021 root,
1022 Communicator()) );
1023 BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, n * sizeof(T), root, BLProfiler::NoTag());
1024}
1025
1026template <class T>
1027std::vector<T>
1028ParallelDescriptor::Gather (const T& t, int root)
1029{
1030 BL_PROFILE_T_S("ParallelDescriptor::Gather(Ti)", T);
1031 BL_COMM_PROFILE(BLProfiler::GatherTi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1032
1033 std::vector<T> resl;
1034 if ( root == MyProc() ) { resl.resize(NProcs()); }
1035 BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(&t),
1036 1,
1037 Mpi_typemap<T>::type(),
1038 resl.data(),
1039 1,
1040 Mpi_typemap<T>::type(),
1041 root,
1042 Communicator()) );
1043 BL_COMM_PROFILE(BLProfiler::GatherTi, sizeof(T), root, BLProfiler::NoTag());
1044 return resl;
1045}
1046
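Gather(const T&, root) collects one value from every rank into a std::vector that is sized and filled only on root; the vector stays empty elsewhere. A short sketch:

    // Sketch: gather one value per rank onto the I/O rank.
    amrex::Real mine = amrex::Real(amrex::ParallelDescriptor::MyProc());
    std::vector<amrex::Real> all = amrex::ParallelDescriptor::Gather(
        mine, amrex::ParallelDescriptor::IOProcessorNumber());
    if (amrex::ParallelDescriptor::IOProcessor()) {
        // all.size() == NProcs() here; "all" is empty on the other ranks.
    }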
1047template <class T>
1048void
1049ParallelDescriptor::Gatherv (const T* send, int sc,
1050 T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
1051 int root)
1052{
1053 BL_PROFILE_T_S("ParallelDescriptor::Gatherv(Ti)", T);
1054 BL_COMM_PROFILE(BLProfiler::Gatherv, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1055
1056 MPI_Gatherv(send, sc, ParallelDescriptor::Mpi_typemap<T>::type(),
1057 recv, rc.data(), disp.data(), ParallelDescriptor::Mpi_typemap<T>::type(),
1058 root, Communicator());
1059
1060 BL_COMM_PROFILE(BLProfiler::Gatherv, std::accumulate(rc.begin(),rc.end(),0)*sizeof(T), root, BLProfiler::NoTag());
1061}
1062
1063template <class T>
1064void
1065ParallelDescriptor::GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
1066 Vector<T>& recvbuf, int root)
1067{
1068 BL_PROFILE_T_S("ParallelDescriptor::GatherLayoutData(Ti)", T);
1069
1070 // Gather prelims
1071 Vector<T> T_to_send;
1072 T_to_send.reserve(sendbuf.local_size());
1073
1074 for (int i : sendbuf.IndexArray())
1075 {
1076 T_to_send.push_back(sendbuf[i]);
1077 }
1078
1079 int nprocs = ParallelContext::NProcsSub();
1080 Vector<int> recvcount(nprocs, 0);
1081 recvbuf.resize(sendbuf.size());
1082 const Vector<int>& old_pmap = sendbuf.DistributionMap().ProcessorMap();
1083 for (int i : old_pmap)
1084 {
1085 ++recvcount[i];
1086 }
1087
1088 // Make a map from post-gather to pre-gather index
1089 Vector<Vector<int>> new_ind_to_old_ind(nprocs);
1090 for (int i=0; i<nprocs; ++i)
1091 {
1092 new_ind_to_old_ind[i].reserve(recvcount[i]);
1093 }
1094 for (int i=0; i<old_pmap.size(); ++i)
1095 {
1096 new_ind_to_old_ind[old_pmap[i]].push_back(i);
1097 }
1098
1099 // Flatten
1100 Vector<int> new_index_to_old_index;
1101 new_index_to_old_index.reserve(old_pmap.size());
1102 for (const Vector<int>& v : new_ind_to_old_ind)
1103 {
1104 if (!v.empty())
1105 {
1106 for (int el : v)
1107 {
1108 new_index_to_old_index.push_back(el);
1109 }
1110 }
1111 }
1112
1113 Vector<int> disp(nprocs);
1114 if (!disp.empty()) { disp[0] = 0; }
1115 std::partial_sum(recvcount.begin(), recvcount.end()-1, disp.begin()+1);
1116 Vector<T> new_index_to_T(sendbuf.size());
1117
1118 MPI_Gatherv(T_to_send.data(), T_to_send.size(),
1119 ParallelDescriptor::Mpi_typemap<T>::type(),
1120 new_index_to_T.data(), recvcount.data(), disp.data(),
1121 ParallelDescriptor::Mpi_typemap<T>::type(), root,
1122 ParallelContext::CommunicatorSub());
1123
1124 // Now work just on the root, which now has global information on the collected;
1125 // LayoutData information; with new_index_to_old_index and new_index_to_T,
1126 // sort the gathered vector in pre-gather index space
1127 if (ParallelContext::MyProcSub() == root)
1128 {
1129 // Invert the map (new_index) --> (old_index)
1130 Vector<int> old_index_to_new_index(sendbuf.size());
1131
1132 for (int i=0; i<old_index_to_new_index.size(); ++i)
1133 {
1134 old_index_to_new_index[new_index_to_old_index[i]] = i;
1135 }
1136
1137 for (int i=0; i<recvbuf.size(); ++i)
1138 {
1139 recvbuf[i] = new_index_to_T[old_index_to_new_index[i]];
1140 }
1141 }
1142}
1143
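GatherLayoutDataToVector gathers the per-box values of a LayoutData onto root and then uses the DistributionMapping to reorder them, so that on root recvbuf[i] corresponds to global box index i rather than to the order in which ranks contributed. A hedged sketch, assuming an existing BoxArray ba and DistributionMapping dm:

    // Sketch: gather one int per box onto rank 0 of the current frame
    // (ba and dm are assumed to exist already).
    amrex::LayoutData<int> counts(ba, dm);
    for (int i : counts.IndexArray()) {
        counts[i] = 1;                 // some locally computed per-box value
    }
    amrex::Vector<int> global_counts;  // meaningful only on the root afterwards
    amrex::ParallelDescriptor::GatherLayoutDataToVector(counts, global_counts, 0);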
1144template <class T, class T1>
1145void
1146ParallelDescriptor::Scatter (T* t,
1147 size_t n,
1148 const T1* t1,
1149 size_t n1,
1150 int root)
1151{
1152 BL_PROFILE_T_S("ParallelDescriptor::Scatter(TsT1si)", T);
1153 BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1154
1155 BL_MPI_REQUIRE( MPI_Scatter(const_cast<T1*>(t1),
1156 n1,
1157 Mpi_typemap<T1>::type(),
1158 t,
1159 n,
1160 Mpi_typemap<T>::type(),
1161 root,
1162 Communicator()) );
1163 BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, n * sizeof(T), root, BLProfiler::NoTag());
1164}
1165
1166#else
1167
1168namespace ParallelDescriptor
1169{
1170template <class T>
1171Message
1172Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1173{
1174 return Message();
1175}
1176
1177template <class T>
1178Message
1179Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1180{
1181 return Message();
1182}
1183
1184template <class T>
1185Message
1186Asend(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1187{
1188 return Message();
1189}
1190
1191template <class T>
1192Message
1193Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1194{
1195 return Message();
1196}
1197
1198template <class T>
1199Message
1200Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1201{
1202 return Message();
1203}
1204
1205template <class T>
1206Message
1207Send(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1208{
1209 return Message();
1210}
1211
1212template <class T>
1213Message
1214Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1215{
1216 return Message();
1217}
1218
1219template <class T>
1220Message
1221Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1222{
1223 return Message();
1224}
1225
1226template <class T>
1227Message
1228Arecv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1229{
1230 return Message();
1231}
1232
1233template <class T>
1234Message
1235Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1236{
1237 return Message();
1238}
1239
1240template <class T>
1241Message
1242Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1243{
1244 return Message();
1245}
1246
1247template <class T>
1248Message
1249Recv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1250{
1251 return Message();
1252}
1253
1254template <class T>
1255void
1256Bcast(T* /*t*/, size_t /*n*/, int /*root*/)
1257{}
1258
1259template <class T>
1260void
1261Bcast(T* /*t*/, size_t /*n*/, int /*root*/, const MPI_Comm & /*comm*/)
1262{}
1263
1264template <class T, class T1>
1265void
1266Gather (const T* t, size_t n, T1* t1, size_t n1, int /*root*/)
1267{
1268 BL_ASSERT(n == n1);
1269 amrex::ignore_unused(n1);
1270
1271 int const sc = static_cast<int>(n);
1272 for (int j=0; j<sc; ++j) { t1[j] = t[j]; }
1273}
1274
1275template <class T>
1276std::vector<T>
1277Gather(const T& t, int /*root*/)
1278{
1279 std::vector<T> resl(1);
1280 resl[0] = t;
1281 return resl;
1282}
1283
1284template <class T>
1285void
1286Gatherv (const T* send, int sc,
1287 T* recv, const std::vector<int>& /*rc*/,
1288 const std::vector<int>& /*disp*/, int /*root*/)
1289{
1290 for (int j=0; j<sc; ++j) { recv[j] = send[j]; }
1291}
1292
1293template <class T>
1294void
1295GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
1296 Vector<T>& recvbuf, int /*root*/)
1297{
1298 recvbuf.resize(sendbuf.size());
1299
1300 for (int i=0; i<sendbuf.size(); ++i)
1301 {
1302 recvbuf[i] = sendbuf[i];
1303 }
1304}
1305
1306template <class T, class T1>
1307void
1308Scatter(T* /*t*/, size_t /*n*/, const T1* /*t1*/, size_t /*n1*/, int /*root*/)
1309{}
1310
1311}
1312#endif
1313
1314namespace ParallelDescriptor {
1315
1316#ifdef AMREX_USE_MPI
1317
1319namespace detail {
1320
1321template<typename T>
1322void DoAllReduce (T* r, MPI_Op op, int cnt)
1323{
1324#ifdef BL_LAZY
1325 Lazy::EvalReduction();
1326#endif
1327
1328 BL_ASSERT(cnt > 0);
1329
1330 BL_MPI_REQUIRE( MPI_Allreduce(MPI_IN_PLACE, r, cnt,
1331 Mpi_typemap<T>::type(), op,
1332 Communicator()) );
1333}
1334
1335template<typename T>
1336void DoReduce (T* r, MPI_Op op, int cnt, int cpu)
1337{
1338#ifdef BL_LAZY
1339 Lazy::EvalReduction();
1340#endif
1341
1342 BL_ASSERT(cnt > 0);
1343
1344 if (MyProc() == cpu) {
1345 BL_MPI_REQUIRE( MPI_Reduce(MPI_IN_PLACE, r, cnt,
1346 Mpi_typemap<T>::type(), op,
1347 cpu, Communicator()) );
1348 } else {
1349 BL_MPI_REQUIRE( MPI_Reduce(r, r, cnt,
1350 Mpi_typemap<T>::type(), op,
1351 cpu, Communicator()) );
1352 }
1353}
1354
1355}
1357
1359 template <typename T>
1360 std::enable_if_t<std::is_floating_point_v<T>>
1361 ReduceRealSum (T& rvar) {
1362 detail::DoAllReduce<T>(&rvar,MPI_SUM,1);
1363 }
1364
1365 template <typename T>
1366 std::enable_if_t<std::is_floating_point_v<T>>
1367 ReduceRealSum (T* rvar, int cnt) {
1368 detail::DoAllReduce<T>(rvar,MPI_SUM,cnt);
1369 }
1370
1371 template <typename T>
1372 std::enable_if_t<std::is_floating_point_v<T>>
1373 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar)
1374 {
1375 int cnt = rvar.size();
1376 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1377 detail::DoAllReduce<T>(tmp.data(),MPI_SUM,cnt);
1378 for (int i = 0; i < cnt; ++i) {
1379 rvar[i].get() = tmp[i];
1380 }
1381 }
1382
1384 template <typename T>
1385 std::enable_if_t<std::is_floating_point_v<T>>
1386 ReduceRealSum (T& rvar, int cpu) {
1387 detail::DoReduce<T>(&rvar,MPI_SUM,1,cpu);
1388 }
1389
1390 template <typename T>
1391 std::enable_if_t<std::is_floating_point_v<T>>
1392 ReduceRealSum (T* rvar, int cnt, int cpu) {
1393 detail::DoReduce<T>(rvar,MPI_SUM,cnt,cpu);
1394 }
1395
1396 template <typename T>
1397 std::enable_if_t<std::is_floating_point_v<T>>
1398 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1399 {
1400 int cnt = rvar.size();
1401 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1402 detail::DoReduce<T>(tmp.data(),MPI_SUM,cnt,cpu);
1403 for (int i = 0; i < cnt; ++i) {
1404 rvar[i].get() = tmp[i];
1405 }
1406 }
1407
1409 template <typename T>
1410 std::enable_if_t<std::is_floating_point_v<T>>
1411 ReduceRealMax (T& rvar) {
1412 detail::DoAllReduce<T>(&rvar,MPI_MAX,1);
1413 }
1414
1415 template <typename T>
1416 std::enable_if_t<std::is_floating_point_v<T>>
1417 ReduceRealMax (T* rvar, int cnt) {
1418 detail::DoAllReduce<T>(rvar,MPI_MAX,cnt);
1419 }
1420
1421 template <typename T>
1422 std::enable_if_t<std::is_floating_point_v<T>>
1423 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar)
1424 {
1425 int cnt = rvar.size();
1426 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1427 detail::DoAllReduce<T>(tmp.data(),MPI_MAX,cnt);
1428 for (int i = 0; i < cnt; ++i) {
1429 rvar[i].get() = tmp[i];
1430 }
1431 }
1432
1434 template <typename T>
1435 std::enable_if_t<std::is_floating_point_v<T>>
1436 ReduceRealMax (T& rvar, int cpu) {
1437 detail::DoReduce<T>(&rvar,MPI_MAX,1,cpu);
1438 }
1439
1440 template <typename T>
1441 std::enable_if_t<std::is_floating_point_v<T>>
1442 ReduceRealMax (T* rvar, int cnt, int cpu) {
1443 detail::DoReduce<T>(rvar,MPI_MAX,cnt,cpu);
1444 }
1445
1446 template <typename T>
1447 std::enable_if_t<std::is_floating_point_v<T>>
1448 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1449 {
1450 int cnt = rvar.size();
1451 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1452 detail::DoReduce<T>(tmp.data(),MPI_MAX,cnt,cpu);
1453 for (int i = 0; i < cnt; ++i) {
1454 rvar[i].get() = tmp[i];
1455 }
1456 }
1457
1459 template <typename T>
1460 std::enable_if_t<std::is_floating_point_v<T>>
1461 ReduceRealMin (T& rvar) {
1462 detail::DoAllReduce<T>(&rvar,MPI_MIN,1);
1463 }
1464
1465 template <typename T>
1466 std::enable_if_t<std::is_floating_point_v<T>>
1467 ReduceRealMin (T* rvar, int cnt) {
1468 detail::DoAllReduce<T>(rvar,MPI_MIN,cnt);
1469 }
1470
1471 template <typename T>
1472 std::enable_if_t<std::is_floating_point_v<T>>
1473 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar)
1474 {
1475 int cnt = rvar.size();
1476 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1477 detail::DoAllReduce<T>(tmp.data(),MPI_MIN,cnt);
1478 for (int i = 0; i < cnt; ++i) {
1479 rvar[i].get() = tmp[i];
1480 }
1481 }
1482
1484 template <typename T>
1485 std::enable_if_t<std::is_floating_point_v<T>>
1486 ReduceRealMin (T& rvar, int cpu) {
1487 detail::DoReduce<T>(&rvar,MPI_MIN,1,cpu);
1488 }
1489
1490 template <typename T>
1491 std::enable_if_t<std::is_floating_point_v<T>>
1492 ReduceRealMin (T* rvar, int cnt, int cpu) {
1493 detail::DoReduce<T>(rvar,MPI_MIN,cnt,cpu);
1494 }
1495
1496 template <typename T>
1497 std::enable_if_t<std::is_floating_point_v<T>>
1498 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1499 {
1500 int cnt = rvar.size();
1501 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1502 detail::DoReduce<T>(tmp.data(),MPI_MIN,cnt,cpu);
1503 for (int i = 0; i < cnt; ++i) {
1504 rvar[i].get() = tmp[i];
1505 }
1506 }
1507
1508#else
1509
1511 template <typename T>
1512 std::enable_if_t<std::is_floating_point_v<T>>
1513 ReduceRealSum (T& ) {}
1514
1515 template <typename T>
1516 std::enable_if_t<std::is_floating_point_v<T>>
1517 ReduceRealSum (T*, int) {}
1518
1519 template <typename T>
1520 std::enable_if_t<std::is_floating_point_v<T>>
1521 ReduceRealSum (Vector<std::reference_wrapper<T> > const&) {}
1522
1524 template <typename T>
1525 std::enable_if_t<std::is_floating_point_v<T>>
1526 ReduceRealSum (T&, int) {}
1527
1528 template <typename T>
1529 std::enable_if_t<std::is_floating_point_v<T>>
1530 ReduceRealSum (T*, int, int) {}
1531
1532 template <typename T>
1533 std::enable_if_t<std::is_floating_point_v<T>>
1534 ReduceRealSum (Vector<std::reference_wrapper<T> > const&, int) {}
1535
1537 template <typename T>
1538 std::enable_if_t<std::is_floating_point_v<T>>
1539 ReduceRealMax (T&) {}
1540
1541 template <typename T>
1542 std::enable_if_t<std::is_floating_point_v<T>>
1543 ReduceRealMax (T*, int) {}
1544
1545 template <typename T>
1546 std::enable_if_t<std::is_floating_point_v<T>>
1547 ReduceRealMax (Vector<std::reference_wrapper<T> > const&) {}
1548
1550 template <typename T>
1551 std::enable_if_t<std::is_floating_point_v<T>>
1552 ReduceRealMax (T&, int) {}
1553
1554 template <typename T>
1555 std::enable_if_t<std::is_floating_point_v<T>>
1556 ReduceRealMax (T*, int, int) {}
1557
1558 template <typename T>
1559 std::enable_if_t<std::is_floating_point_v<T>>
1560 ReduceRealMax (Vector<std::reference_wrapper<T> > const&, int) {}
1561
1563 template <typename T>
1564 std::enable_if_t<std::is_floating_point_v<T>>
1565 ReduceRealMin (T&) {}
1566
1567 template <typename T>
1568 std::enable_if_t<std::is_floating_point_v<T>>
1569 ReduceRealMin (T*, int) {}
1570
1571 template <typename T>
1572 std::enable_if_t<std::is_floating_point_v<T>>
1573 ReduceRealMin (Vector<std::reference_wrapper<T> > const&) {}
1574
1576 template <typename T>
1577 std::enable_if_t<std::is_floating_point_v<T>>
1578 ReduceRealMin (T&, int) {}
1579
1580 template <typename T>
1581 std::enable_if_t<std::is_floating_point_v<T>>
1582 ReduceRealMin (T*, int, int) {}
1583
1584 template <typename T>
1585 std::enable_if_t<std::is_floating_point_v<T>>
1586 ReduceRealMin (Vector<std::reference_wrapper<T> > const&, int) {}
1587
1588#endif
1589}
1590
1591#ifdef AMREX_USE_MPI
1592namespace ParallelDescriptor {
1593
1594template <class T>
1595struct Mpi_typemap<GpuComplex<T>>
1596{
1597 static MPI_Datatype type ()
1598 {
1599 static_assert(std::is_same<T,double>() ||
1600 std::is_same<T,float >(),
1601 "Unsupported type T for GpuComplex");
1602 if constexpr (std::is_same<T,double>()) {
1603 return MPI_C_DOUBLE_COMPLEX;
1604 } else {
1605 return MPI_C_FLOAT_COMPLEX;
1606 }
1607 }
1608};
1609
1610template<typename TV, typename TI>
1611struct Mpi_typemap<ValLocPair<TV,TI>>
1612{
1613 static MPI_Datatype type ()
1614 {
1615 static MPI_Datatype mpi_type = MPI_DATATYPE_NULL;
1616 if (mpi_type == MPI_DATATYPE_NULL) {
1617 using T = ValLocPair<TV,TI>;
1618 static_assert(std::is_trivially_copyable_v<T>,
1619 "To communicate with MPI, ValLocPair must be trivially copyable.");
1620 static_assert(std::is_standard_layout_v<T>,
1621 "To communicate with MPI, ValLocPair must be standard layout");
1622
1623 T vlp[2];
1624 MPI_Datatype types[] = {
1625 Mpi_typemap<TV>::type(),
1626 Mpi_typemap<TI>::type()
1627 };
1628 int blocklens[] = { 1, 1 };
1629 MPI_Aint disp[2];
1630 BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].value, &disp[0]) );
1631 BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].index, &disp[1]) );
1632 disp[1] -= disp[0];
1633 disp[0] = 0;
1634 BL_MPI_REQUIRE( MPI_Type_create_struct(2, blocklens, disp, types,
1635 &mpi_type) );
1636 MPI_Aint lb, extent;
1637 BL_MPI_REQUIRE( MPI_Type_get_extent(mpi_type, &lb, &extent) );
1638 if (extent != sizeof(T)) {
1639 MPI_Datatype tmp = mpi_type;
1640 BL_MPI_REQUIRE( MPI_Type_create_resized(tmp, 0, sizeof(vlp[0]), &mpi_type) );
1641 BL_MPI_REQUIRE( MPI_Type_free(&tmp) );
1642 }
1643 BL_MPI_REQUIRE( MPI_Type_commit( &mpi_type ) );
1644
1645 m_mpi_types.push_back(&mpi_type);
1646 }
1647 return mpi_type;
1648 }
1649};
1650
1651template <typename T, typename F>
1652MPI_Op Mpi_op ()
1653{
1654 static MPI_Op mpi_op = MPI_OP_NULL;
1655 if (mpi_op == MPI_OP_NULL) {
1656 static auto user_fn = [] (void *invec, void *inoutvec, int* len, // NOLINT
1657 MPI_Datatype * /*datatype*/)
1658 {
1659 auto in = static_cast<T const*>(invec);
1660 auto out = static_cast<T*>(inoutvec);
1661 for (int i = 0; i < *len; ++i) {
1662 out[i] = F()(in[i],out[i]);
1663 }
1664 };
1665 BL_MPI_REQUIRE( MPI_Op_create(user_fn, 1, &mpi_op) );
1666 m_mpi_ops.push_back(&mpi_op);
1667 }
1668 return mpi_op;
1669}
1670
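Mpi_typemap<ValLocPair<TV,TI>> lazily builds and caches an MPI struct datatype for value/index pairs, and Mpi_op<T,F> lazily creates a user-defined MPI_Op from a binary functor; both register their handles in m_mpi_types / m_mpi_ops so they can be released at finalization. A hedged sketch combining the two for a minloc-style all-reduce (the functor and values are illustrative, not part of this header):

    // Sketch: all-reduce a (value, rank) pair to find the global minimum and
    // the rank that owns it (the MinLoc functor is illustrative).
    struct MinLoc {
        amrex::ValLocPair<amrex::Real,int>
        operator() (amrex::ValLocPair<amrex::Real,int> const& a,
                    amrex::ValLocPair<amrex::Real,int> const& b) const {
            return (a.value <= b.value) ? a : b;
        }
    };
    amrex::Real local_value = 3.14;   // per-rank value (illustrative)
    amrex::ValLocPair<amrex::Real,int> vlp{local_value,
                                           amrex::ParallelDescriptor::MyProc()};
    BL_MPI_REQUIRE( MPI_Allreduce(MPI_IN_PLACE, &vlp, 1,
        amrex::ParallelDescriptor::Mpi_typemap<amrex::ValLocPair<amrex::Real,int>>::type(),
        amrex::ParallelDescriptor::Mpi_op<amrex::ValLocPair<amrex::Real,int>, MinLoc>(),
        amrex::ParallelDescriptor::Communicator()) );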
1671}
1672#endif
1673
1674}
1675
1676#endif /*BL_PARALLELDESCRIPTOR_H*/