Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_ParallelDescriptor.H
Go to the documentation of this file.
1#ifndef BL_PARALLELDESCRIPTOR_H
2#define BL_PARALLELDESCRIPTOR_H
3#include <AMReX_Config.H>
4
5#include <AMReX_ccse-mpi.H>
7#include <AMReX_BLBackTrace.H>
8#include <AMReX_BLProfiler.H>
9#include <AMReX_BLassert.H>
10#include <AMReX_Extension.H>
11#include <AMReX_INT.H>
12#include <AMReX_REAL.H>
13#include <AMReX_GpuComplex.H>
14#include <AMReX_Array.H>
15#include <AMReX_Vector.H>
16#include <AMReX_ValLocPair.H>
17
18#ifndef BL_AMRPROF
19#include <AMReX_Box.H>
20#endif
21
22#ifdef BL_LAZY
23#include <AMReX_Lazy.H>
24#endif
25
26#ifdef AMREX_PMI
27#include <pmi.h>
28#endif
29
30#ifdef AMREX_USE_OMP
31#include <omp.h>
32#endif
33
34#include <algorithm>
35#include <atomic>
36#include <csignal>
37#include <functional>
38#include <limits>
39#include <numeric>
40#include <string>
41#include <typeinfo>
42#include <vector>
43
44namespace amrex {
45
46template <typename T> class LayoutData;
47
49namespace ParallelDescriptor
50{
51 //
53 const std::string Unnamed("Unnamed");
54
56 class Message
57 {
58 public:
59
60 Message () = default;
62 m_finished(false),
63 m_type(type_),
64 m_req(req_) {}
66 m_type(type_),
67 m_stat(stat_) {}
68 void wait ();
69 bool test ();
70 size_t count () const;
71 int tag () const;
72 int pid () const;
73 MPI_Datatype type () const { return m_type; }
74 MPI_Request req () const { return m_req; }
75 MPI_Status stat () const { return m_stat; }
76
77 private:
78
79 bool m_finished = true;
82 mutable MPI_Status m_stat{};
83 };
84
85#ifdef BL_USE_MPI
86 void MPI_Error(const char* file, int line, const char* str, int rc);
87
88#define BL_MPI_REQUIRE(x) \
89do \
90{ \
91 if ( int l_status_ = (x) ) \
92 { \
93 amrex::ParallelDescriptor::MPI_Error(__FILE__,__LINE__,#x, l_status_); \
94 } \
95} \
96while ( false )
97#endif
98
103 void StartParallel (int* argc = nullptr,
104 char*** argv = nullptr,
105 MPI_Comm mpi_comm = MPI_COMM_WORLD);
106
107 void Initialize ();
108 void Finalize ();
109
111 extern AMREX_EXPORT bool use_gpu_aware_mpi;
113 inline bool UseGpuAwareMpi () { return use_gpu_aware_mpi; }
114
116 void StartTeams ();
117 void EndTeams ();
118
123 void EndParallel ();
124
127 inline int
128 MyProc () noexcept
129 {
131 }
133 inline int
134 MyProc (MPI_Comm comm) noexcept
135 {
136#ifdef BL_USE_MPI
137 int r;
138 MPI_Comm_rank(comm,&r);
139 return r;
140#else
142 return 0;
143#endif
144 }
145
148 {
150
152 void Barrier () const {
153 if (m_size > 1) {
154#if defined(BL_USE_MPI3)
155 MPI_Barrier(m_team_comm);
156#endif
157 }
158 }
159
161 void MemoryBarrier () const {
162 if (m_size > 1) {
163#ifdef AMREX_USE_OMP
164 if (omp_in_parallel()) {
165 #pragma omp barrier
166 }
167 #pragma omp master
168#endif
169 {
170#if defined(BL_USE_MPI3)
171 std::atomic_thread_fence(std::memory_order_release);
172 MPI_Barrier(m_team_comm);
173 std::atomic_thread_fence(std::memory_order_acquire);
174#endif
175 }
176 }
177 }
178
180 void clear () {
181#if defined(BL_USE_MPI3)
182 if (m_size > 1) {
183 MPI_Comm_free(&m_team_comm);
184 if (m_rankInTeam==0) { MPI_Comm_free(&m_lead_comm); }
185 }
186#endif
187 }
188
189 [[nodiscard]] const team_t& get() const {
190 return m_team_comm;
191 }
193 [[nodiscard]] const MPI_Comm& get_team_comm() const { return m_team_comm; }
194 [[nodiscard]] const MPI_Comm& get_lead_comm() const { return m_lead_comm; }
195
197 int m_numTeams;
198 int m_size;
199 int m_color;
200 int m_lead;
201 int m_rankInTeam;
202 int m_do_team_reduce;
203
204 MPI_Comm m_team_comm;
205 MPI_Comm m_lead_comm;
207 };
208
210 extern AMREX_EXPORT ProcessTeam m_Team;
211 extern AMREX_EXPORT int m_MinTag, m_MaxTag;
212 extern AMREX_EXPORT MPI_Comm m_comm;
213 extern AMREX_EXPORT int m_nprocs_per_node;
214 extern AMREX_EXPORT int m_rank_in_node;
215 extern AMREX_EXPORT int m_nprocs_per_processor;
216 extern AMREX_EXPORT int m_rank_in_processor;
217 extern AMREX_EXPORT const int ioProcessor;
219
220 inline int MinTag () noexcept { return m_MinTag; }
221 inline int MaxTag () noexcept { return m_MaxTag; }
222
223 inline MPI_Comm Communicator () noexcept { return m_comm; }
224
228 inline int NProcsPerNode () noexcept { return m_nprocs_per_node; }
229
233 inline int MyRankInNode () noexcept { return m_rank_in_node; }
234
238 inline int NProcsPerProcessor () noexcept { return m_nprocs_per_processor; }
239
243 inline int MyRankInProcessor () noexcept { return m_rank_in_processor; }
244
245#ifdef AMREX_USE_MPI
247 extern AMREX_EXPORT Vector<MPI_Datatype*> m_mpi_types;
248 extern AMREX_EXPORT Vector<MPI_Op*> m_mpi_ops;
250#endif
251
254 inline int
255 NProcs () noexcept
256 {
258 }
259
261 inline int
262 NProcs (MPI_Comm comm) noexcept
263 {
264#ifdef BL_USE_MPI
265 int s;
266 BL_MPI_REQUIRE(MPI_Comm_size(comm, &s));
267 return s;
268#else
270 return 1;
271#endif
272 }
278 inline int
280 {
281 return ioProcessor;
282 }
288 inline bool
289 IOProcessor () noexcept
290 {
291 return MyProc() == IOProcessorNumber();
292 }
293
295 inline int
297 {
298 return (comm == ParallelDescriptor::Communicator()) ? ioProcessor : 0;
299 }
300
302 inline bool
303 IOProcessor (MPI_Comm comm) noexcept
304 {
305 return MyProc(comm) == IOProcessorNumber(comm);
306 }
307
308 //
309 inline int
310 TeamSize () noexcept
311 {
312 return m_Team.m_size;
313 }
314 inline int
315 NTeams () noexcept
316 {
317 return m_Team.m_numTeams;
318 }
319 inline int
320 MyTeamColor () noexcept
321 {
322 return m_Team.m_color;
323 }
324 inline int
325 MyTeamLead () noexcept
326 {
327 return m_Team.m_lead;
328 }
329 inline int
330 MyRankInTeam () noexcept
331 {
332 return m_Team.m_rankInTeam;
333 }
334 inline int
335 TeamLead (int rank) noexcept
336 {
337 return (rank >= 0) ? (rank - rank % m_Team.m_size) : MPI_PROC_NULL;
338 }
339 inline bool
340 isTeamLead () noexcept
341 {
342 return MyRankInTeam() == 0;
343 }
344 inline bool
345 sameTeam (int rank) noexcept
346 {
347 return MyTeamLead() == TeamLead(rank);
348 }
349 inline bool
350 sameTeam (int rankA, int rankB) noexcept
351 {
352 return TeamLead(rankA) == TeamLead(rankB);
353 }
354 inline int
355 RankInLeadComm (int rank) noexcept
356 {
357 return (rank >= 0) ? (rank / m_Team.m_size) : MPI_PROC_NULL;
358 }
359 inline bool
360 doTeamReduce () noexcept
361 {
362 return m_Team.m_do_team_reduce;
363 }
364 inline const ProcessTeam&
365 MyTeam () noexcept
366 {
367 return m_Team;
368 }
369 inline std::pair<int,int>
370 team_range (int begin, int end, int rit = -1, int nworkers = 0) noexcept
371 {
372 int rb, re;
373 {
374 if (rit < 0) { rit = ParallelDescriptor::MyRankInTeam(); }
375 if (nworkers == 0) { nworkers = ParallelDescriptor::TeamSize(); }
376 BL_ASSERT(rit<nworkers);
377 if (nworkers == 1) {
378 rb = begin;
379 re = end;
380 } else {
381 int ntot = end - begin;
382 int nr = ntot / nworkers;
383 int nlft = ntot - nr * nworkers;
384 if (rit < nlft) { // get nr+1 items
385 rb = begin + rit * (nr + 1);
386 re = rb + nr + 1;
387 } else { // get nr items
388 rb = begin + rit * nr + nlft;
389 re = rb + nr;
390 }
391 }
392 }
393
394#ifdef AMREX_USE_OMP
395 int nthreads = omp_get_num_threads();
396 if (nthreads > 1) {
397 int tid = omp_get_thread_num();
398 int ntot = re - rb;
399 int nr = ntot / nthreads;
400 int nlft = ntot - nr * nthreads;
401 if (tid < nlft) { // get nr+1 items
402 rb += tid * (nr + 1);
403 re = rb + nr + 1;
404 } else { // get nr items
405 rb += tid * nr + nlft;
406 re = rb + nr;
407 }
408 }
409#endif
410
411 return std::make_pair(rb,re);
412 }
413 template <typename F>
414 void team_for (int begin, int end, const F& f)
415 {
416 const auto& range = team_range(begin, end);
417 for (int i = range.first; i < range.second; ++i) {
418 f(i);
419 }
420 }
421 template <typename F> // rit: rank in team
422 void team_for (int begin, int end, int rit, const F& f)
423 {
424 const auto& range = team_range(begin, end, rit);
425 for (int i = range.first; i < range.second; ++i) {
426 f(i);
427 }
428 }
429 template <typename F> // rit: rank in team
430 void team_for (int begin, int end, int rit, int nworkers, const F& f)
431 {
432 const auto& range = team_range(begin, end, rit, nworkers);
433 for (int i = range.first; i < range.second; ++i) {
434 f(i);
435 }
436 }
437
439 void Barrier (const std::string& message = Unnamed);
441 void Barrier (const MPI_Comm &comm, const std::string& message = Unnamed);
442 Message Abarrier ();
443 Message Abarrier (const MPI_Comm &comm);
444
445 void Test (MPI_Request& request, int& flag, MPI_Status& status);
446 void Test (Vector<MPI_Request>& request, int& flag, Vector<MPI_Status>& status);
447
448 void Comm_dup (MPI_Comm comm, MPI_Comm& newcomm);
450 void Abort (int errorcode = SIGABRT, bool backtrace = true);
452 const char* ErrorString (int errorcode);
454 double second () noexcept;
455
458 void ReduceBoolAnd (bool& rvar);
461 void ReduceBoolAnd (bool& rvar, int cpu);
462
465 void ReduceBoolOr (bool& rvar);
468 void ReduceBoolOr (bool& rvar, int cpu);
469
472 template <std::floating_point T>
473 void ReduceRealSum (T& rvar);
474
476 template <std::floating_point T>
477 void ReduceRealSum (T* rvar, int cnt);
478
479 // Having this for backward compatibility
480 void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar);
481 //
483 template <std::floating_point T>
484 void ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar);
485
488 template <std::floating_point T>
489 void ReduceRealSum (T& rvar, int cpu);
490
492 template <std::floating_point T>
493 void ReduceRealSum (T* rvar, int cnt, int cpu);
494
495 // Having this for backward compatibility
496 void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
497 //
499 template <std::floating_point T>
500 void ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
501
504 template <std::floating_point T>
505 void ReduceRealMax (T& rvar);
506
508 template <std::floating_point T>
509 void ReduceRealMax (T* rvar, int cnt);
510
511 // Having this for backward compatibility
512 void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar);
513 //
515 template <std::floating_point T>
516 void ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar);
517
520 template <std::floating_point T>
521 void ReduceRealMax (T& rvar, int cpu);
522
524 template <std::floating_point T>
525 void ReduceRealMax (T* rvar, int cnt, int cpu);
526
527 // Having this for backward compatibility
528 void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
529 //
531 template <std::floating_point T>
532 void ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
533
536 template <std::floating_point T>
537 void ReduceRealMin (T& rvar);
538
540 template <std::floating_point T>
541 void ReduceRealMin (T* rvar, int cnt);
542
543 // Having this for backward compatibility
544 void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar);
545 //
547 template <std::floating_point T>
548 void ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar);
549
552 template <std::floating_point T>
553 void ReduceRealMin (T& rvar, int cpu);
554
556 template <std::floating_point T>
557 void ReduceRealMin (T* rvar, int cnt, int cpu);
558
559 // Having this for backward compatibility
560 void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
561 //
563 template <std::floating_point T>
564 void ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
565
568 void ReduceIntSum (int& rvar);
570 void ReduceIntSum (int* rvar, int cnt);
572 void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar);
575 void ReduceIntSum (int& rvar, int cpu);
577 void ReduceIntSum (int* rvar, int cnt, int cpu);
579 void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
580
583 void ReduceIntMax (int& rvar);
585 void ReduceIntMax (int* rvar, int cnt);
587 void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar);
590 void ReduceIntMax (int& rvar, int cpu);
592 void ReduceIntMax (int* rvar, int cnt, int cpu);
594 void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
595
598 void ReduceIntMin (int& rvar);
600 void ReduceIntMin (int* rvar, int cnt);
602 void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar);
605 void ReduceIntMin (int& rvar, int cpu);
607 void ReduceIntMin (int* rvar, int cnt, int cpu);
609 void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
610
613 void ReduceLongSum (Long& rvar);
615 void ReduceLongSum (Long* rvar, int cnt);
617 void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar);
620 void ReduceLongSum (Long& rvar, int cpu);
622 void ReduceLongSum (Long* rvar, int cnt, int cpu);
624 void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
625
628 void ReduceLongMax (Long& rvar);
630 void ReduceLongMax (Long* rvar, int cnt);
632 void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar);
635 void ReduceLongMax (Long& rvar, int cpu);
637 void ReduceLongMax (Long* rvar, int cnt, int cpu);
639 void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
640
643 void ReduceLongMin (Long& rvar);
645 void ReduceLongMin (Long* rvar, int cnt);
647 void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar);
650 void ReduceLongMin (Long& rvar, int cpu);
652 void ReduceLongMin (Long* rvar, int cnt, int cpu);
654 void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
655
658 void ReduceLongAnd (Long& rvar);
660 void ReduceLongAnd (Long* rvar, int cnt);
662 void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar);
665 void ReduceLongAnd (Long& rvar, int cpu);
667 void ReduceLongAnd (Long* rvar, int cnt, int cpu);
669 void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
670
673 void Gather (Real const* sendbuf, int nsend, Real* recvbuf, int root);
678 inline int SeqNum () noexcept { return ParallelContext::get_inc_mpi_tag(); }
679
680 template <class T> [[nodiscard]] Message Asend(const T*, size_t n, int pid, int tag);
681 template <class T> [[nodiscard]] Message Asend(const T*, size_t n, int pid, int tag, MPI_Comm comm);
682 template <class T> [[nodiscard]] Message Asend(const std::vector<T>& buf, int pid, int tag);
683
684 template <class T> Message Arecv(T*, size_t n, int pid, int tag);
685 template <class T> Message Arecv(T*, size_t n, int pid, int tag, MPI_Comm comm);
686 template <class T> Message Arecv(std::vector<T>& buf, int pid, int tag);
687
688 template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag);
689 template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
690 template <class T> Message Send(const std::vector<T>& buf, int dst_pid, int tag);
691
692 template <class T> Message Recv(T*, size_t n, int pid, int tag);
693 template <class T> Message Recv(T*, size_t n, int pid, int tag, MPI_Comm comm);
694 template <class T> Message Recv(std::vector<T>& buf, int pid, int tag);
695
696 template <class T> void Bcast(T*, size_t n, int root = 0);
697 template <class T> void Bcast(T*, size_t n, int root, const MPI_Comm &comm);
698 void Bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm);
699
700 template <class T, class T1> void Scatter(T*, size_t n, const T1*, size_t n1, int root);
701
702 template <class T, class T1> void Gather(const T*, size_t n, T1*, size_t n1, int root);
703 template <class T> std::vector<T> Gather(const T&, int root);
704
705 template <class T> void Gatherv (const T* send, int sc,
706 T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
707 int root);
708
710 template <class T> void GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
711 Vector<T>& recvbuf,
712 int root);
713
714 void Wait (MPI_Request& req, MPI_Status& status);
715 void Waitall (Vector<MPI_Request>& reqs, Vector<MPI_Status>& status);
716 void Waitany (Vector<MPI_Request>& reqs, int &index, MPI_Status& status);
718
720 void ReadAndBcastFile(const std::string &filename, Vector<char> &charBuf,
721 bool bExitOnError = true,
722 const MPI_Comm &comm = Communicator() );
723 void IProbe(int src_pid, int tag, int &mflag, MPI_Status &status);
724 void IProbe(int src_pid, int tag, MPI_Comm comm, int &mflag, MPI_Status &status);
725
731 std::string
732 mpi_level_to_string (int mtlev);
733
734 // PMI = Process Management Interface, available on Crays. Provides API to
735 // query topology of the job.
736#ifdef AMREX_PMI
737 void PMI_Initialize();
738 void PMI_PrintMeshcoords(const pmi_mesh_coord_t *pmi_mesh_coord);
739#endif
740
741#ifdef BL_USE_MPI
742 int select_comm_data_type (std::size_t nbytes);
743 std::size_t sizeof_selected_comm_data_type (std::size_t nbytes);
744#endif
745}
746}
747
748namespace amrex {
749
750#ifdef BL_USE_MPI
751
752template <class T>
753ParallelDescriptor::Message
754ParallelDescriptor::Asend (const T* buf, size_t n, int dst_pid, int tag)
755{
756 return Asend(buf, n, dst_pid, tag, Communicator());
757}
758
759namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
760template <>
761Message
762Asend<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
763
764template <class T>
765Message
766Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
767{
768 static_assert(!std::is_same_v<char,T>, "Asend: char version has been specialized");
769
770 BL_PROFILE_T_S("ParallelDescriptor::Asend(TsiiM)", T);
771 BL_COMM_PROFILE(BLProfiler::AsendTsiiM, n * sizeof(T), dst_pid, tag);
772
773 MPI_Request req;
774 BL_MPI_REQUIRE( MPI_Isend(const_cast<T*>(buf),
775 n,
777 dst_pid,
778 tag,
779 comm,
780 &req) );
781 BL_COMM_PROFILE(BLProfiler::AsendTsiiM, BLProfiler::AfterCall(), dst_pid, tag);
782 return Message(req, Mpi_typemap<T>::type());
783}
784}
785
786template <class T>
787ParallelDescriptor::Message
788ParallelDescriptor::Asend (const std::vector<T>& buf, int dst_pid, int tag)
789{
790 return Asend(buf.data(), buf.size(), dst_pid, tag, Communicator());
791}
792
793template <class T>
794ParallelDescriptor::Message
795ParallelDescriptor::Send (const T* buf, size_t n, int dst_pid, int tag)
796{
797 return Send(buf, n, dst_pid, tag, Communicator());
798}
799
800namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
801template <>
802Message
803Send<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
804
805template <class T>
806Message
807Send (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
808{
809 static_assert(!std::is_same_v<char,T>, "Send: char version has been specialized");
810
811 BL_PROFILE_T_S("ParallelDescriptor::Send(Tsii)", T);
812
813#ifdef BL_COMM_PROFILING
814 int dst_pid_world(-1);
815 MPI_Group groupComm, groupWorld;
816 BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
817 BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
818 BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &dst_pid, groupWorld, &dst_pid_world) );
819
820 BL_COMM_PROFILE(BLProfiler::SendTsii, n * sizeof(T), dst_pid_world, tag);
821#endif
822
823 BL_MPI_REQUIRE( MPI_Send(const_cast<T*>(buf),
824 n,
826 dst_pid,
827 tag,
828 comm) );
829 BL_COMM_PROFILE(BLProfiler::SendTsii, BLProfiler::AfterCall(), dst_pid, tag);
830 return Message();
831}
832}
833
834template <class T>
835ParallelDescriptor::Message
836ParallelDescriptor::Send (const std::vector<T>& buf, int dst_pid, int tag)
837{
838 return Send(buf.data(), buf.size(), dst_pid, tag, Communicator());
839}
840
841template <class T>
842ParallelDescriptor::Message
843ParallelDescriptor::Arecv (T* buf, size_t n, int src_pid, int tag)
844{
845 return Arecv(buf, n, src_pid, tag, Communicator());
846}
847
848namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
849template <>
850Message
851Arecv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
852
853template <class T>
854Message
855Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
856{
857 static_assert(!std::is_same_v<char,T>, "Arecv: char version has been specialized");
858
859 BL_PROFILE_T_S("ParallelDescriptor::Arecv(TsiiM)", T);
860 BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, n * sizeof(T), src_pid, tag);
861
862 MPI_Request req;
863 BL_MPI_REQUIRE( MPI_Irecv(buf,
864 n,
866 src_pid,
867 tag,
868 comm,
869 &req) );
870 BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, BLProfiler::AfterCall(), src_pid, tag);
871 return Message(req, Mpi_typemap<T>::type());
872}
873}
874
875template <class T>
876ParallelDescriptor::Message
877ParallelDescriptor::Arecv (std::vector<T>& buf, int src_pid, int tag)
878{
879 return Arecv(buf.data(), buf.size(), src_pid, tag, Communicator());
880}
881
882template <class T>
883ParallelDescriptor::Message
884ParallelDescriptor::Recv (T* buf, size_t n, int src_pid, int tag)
885{
886 return Recv(buf, n, src_pid, tag, Communicator());
887}
888
889namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
890template <>
891Message
892Recv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
893
894template <class T>
895Message
896Recv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
897{
898 static_assert(!std::is_same_v<char,T>, "Recv: char version has been specialized");
899
900 BL_PROFILE_T_S("ParallelDescriptor::Recv(Tsii)", T);
901 BL_COMM_PROFILE(BLProfiler::RecvTsii, BLProfiler::BeforeCall(), src_pid, tag);
902
903 MPI_Status stat;
904 BL_MPI_REQUIRE( MPI_Recv(buf,
905 n,
907 src_pid,
908 tag,
909 comm,
910 &stat) );
911#ifdef BL_COMM_PROFILING
912 int src_pid_comm(stat.MPI_SOURCE);
913 int src_pid_world(stat.MPI_SOURCE);
914 if(src_pid_comm != MPI_ANY_SOURCE) {
915 MPI_Group groupComm, groupWorld;
916 BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
917 BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
918 BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &src_pid_comm, groupWorld, &src_pid_world) );
919 }
920
921 BL_COMM_PROFILE(BLProfiler::RecvTsii, n * sizeof(T), src_pid_world, stat.MPI_TAG);
922#endif
923 return Message(stat, Mpi_typemap<T>::type());
924}
925}
926
927template <class T>
928ParallelDescriptor::Message
929ParallelDescriptor::Recv (std::vector<T>& buf, int src_pid, int tag)
930{
931 return Recv(buf.data(), buf.size(), src_pid, tag, Communicator());
932}
933
934template <class T>
935void
937 size_t n,
938 int root)
939{
940#ifdef BL_LAZY
942#endif
943
944 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
945
946 BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
947 BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
948
949 BL_MPI_REQUIRE( MPI_Bcast(t,
950 n,
951 Mpi_typemap<T>::type(),
952 root,
953 Communicator()) );
954 BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
955}
956
957template <class T>
958void
960 size_t n,
961 int root,
962 const MPI_Comm &comm)
963{
964#ifdef BL_LAZY
965 int r;
966 MPI_Comm_compare(comm, Communicator(), &r);
967 if (r == MPI_IDENT) { Lazy::EvalReduction(); }
968#endif
969
970 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
971
972 BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
973 BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
974
975 BL_MPI_REQUIRE( MPI_Bcast(t,
976 n,
977 Mpi_typemap<T>::type(),
978 root,
979 comm) );
980 BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
981}
982
983template <class T, class T1>
984void
986 size_t n,
987 T1* t1,
988 size_t n1,
989 int root)
990{
991 BL_PROFILE_T_S("ParallelDescriptor::Gather(TsT1si)", T);
992 BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
993
994 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
995 BL_ASSERT(n1 < static_cast<size_t>(std::numeric_limits<int>::max()));
996
997 BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(t),
998 n,
999 Mpi_typemap<T>::type(),
1000 t1,
1001 n1,
1002 Mpi_typemap<T1>::type(),
1003 root,
1004 Communicator()) );
1005 BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, n * sizeof(T), root, BLProfiler::NoTag());
1006}
1007
1008template <class T>
1009std::vector<T>
1010ParallelDescriptor::Gather (const T& t, int root)
1011{
1012 BL_PROFILE_T_S("ParallelDescriptor::Gather(Ti)", T);
1013 BL_COMM_PROFILE(BLProfiler::GatherTi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1014
1015 std::vector<T> resl;
1016 if ( root == MyProc() ) { resl.resize(NProcs()); }
1017 BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(&t),
1018 1,
1019 Mpi_typemap<T>::type(),
1020 resl.data(),
1021 1,
1022 Mpi_typemap<T>::type(),
1023 root,
1024 Communicator()) );
1025 BL_COMM_PROFILE(BLProfiler::GatherTi, sizeof(T), root, BLProfiler::NoTag());
1026 return resl;
1027}
1028
1029template <class T>
1030void
1031ParallelDescriptor::Gatherv (const T* send, int sc,
1032 T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
1033 int root)
1034{
1035 BL_PROFILE_T_S("ParallelDescriptor::Gatherv(Ti)", T);
1036 BL_COMM_PROFILE(BLProfiler::Gatherv, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1037
1038 MPI_Gatherv(send, sc, ParallelDescriptor::Mpi_typemap<T>::type(),
1039 recv, rc.data(), disp.data(), ParallelDescriptor::Mpi_typemap<T>::type(),
1040 root, Communicator());
1041
1042 BL_COMM_PROFILE(BLProfiler::Gatherv, std::accumulate(rc.begin(),rc.end(),0)*sizeof(T), root, BLProfiler::NoTag());
1043}
1044
1045template <class T>
1046void
1047ParallelDescriptor::GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
1048 Vector<T>& recvbuf, int root)
1049{
1050 BL_PROFILE_T_S("ParallelDescriptor::GatherLayoutData(Ti)", T);
1051
1052 // Gather prelims
1053 Vector<T> T_to_send;
1054 T_to_send.reserve(sendbuf.local_size());
1055
1056 for (int i : sendbuf.IndexArray())
1057 {
1058 T_to_send.push_back(sendbuf[i]);
1059 }
1060
1061 int nprocs = ParallelContext::NProcsSub();
1062 Vector<int> recvcount(nprocs, 0);
1063 recvbuf.resize(sendbuf.size());
1064 const Vector<int>& old_pmap = sendbuf.DistributionMap().ProcessorMap();
1065 for (int i : old_pmap)
1066 {
1067 ++recvcount[i];
1068 }
1069
1070 // Make a map from post-gather to pre-gather index
1071 Vector<Vector<int>> new_ind_to_old_ind(nprocs);
1072 for (int i=0; i<nprocs; ++i)
1073 {
1074 new_ind_to_old_ind[i].reserve(recvcount[i]);
1075 }
1076 for (int i=0; i<old_pmap.size(); ++i)
1077 {
1078 new_ind_to_old_ind[old_pmap[i]].push_back(i);
1079 }
1080
1081 // Flatten
1082 Vector<int> new_index_to_old_index;
1083 new_index_to_old_index.reserve(old_pmap.size());
1084 for (const Vector<int>& v : new_ind_to_old_ind)
1085 {
1086 if (!v.empty())
1087 {
1088 for (int el : v)
1089 {
1090 new_index_to_old_index.push_back(el);
1091 }
1092 }
1093 }
1094
1095 Vector<int> disp(nprocs);
1096 if (!disp.empty()) { disp[0] = 0; }
1097 std::partial_sum(recvcount.begin(), recvcount.end()-1, disp.begin()+1);
1098 Vector<T> new_index_to_T(sendbuf.size());
1099
1100 MPI_Gatherv(T_to_send.data(), T_to_send.size(),
1102 new_index_to_T.data(), recvcount.data(), disp.data(),
1105
1106 // Now work just on the root, which now has global information on the collected;
1107 // LayoutData information; with new_index_to_old_index and new_index_to_T,
1108 // sort the gathered vector in pre-gather index space
1109 if (ParallelContext::MyProcSub() == root)
1110 {
1111 // Invert the map (new_index) --> (old_index)
1112 Vector<int> old_index_to_new_index(sendbuf.size());
1113
1114 for (int i=0; i<old_index_to_new_index.size(); ++i)
1115 {
1116 old_index_to_new_index[new_index_to_old_index[i]] = i;
1117 }
1118
1119 for (int i=0; i<recvbuf.size(); ++i)
1120 {
1121 recvbuf[i] = new_index_to_T[old_index_to_new_index[i]];
1122 }
1123 }
1124}
1125
1126template <class T, class T1>
1127void
1129 size_t n,
1130 const T1* t1,
1131 size_t n1,
1132 int root)
1133{
1134 BL_PROFILE_T_S("ParallelDescriptor::Scatter(TsT1si)", T);
1135 BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1136
1137 BL_MPI_REQUIRE( MPI_Scatter(const_cast<T1*>(t1),
1138 n1,
1139 Mpi_typemap<T1>::type(),
1140 t,
1141 n,
1142 Mpi_typemap<T>::type(),
1143 root,
1144 Communicator()) );
1145 BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, n * sizeof(T), root, BLProfiler::NoTag());
1146}
1147
1148#else
1149
1150namespace ParallelDescriptor
1151{
1152template <class T>
1153Message
1154Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1155{
1156 return Message();
1157}
1158
1159template <class T>
1160Message
1161Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1162{
1163 return Message();
1164}
1165
1166template <class T>
1167Message
1168Asend(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1169{
1170 return Message();
1171}
1172
1173template <class T>
1174Message
1175Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1176{
1177 return Message();
1178}
1179
1180template <class T>
1181Message
1182Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1183{
1184 return Message();
1185}
1186
1187template <class T>
1188Message
1189Send(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1190{
1191 return Message();
1192}
1193
1194template <class T>
1195Message
1196Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1197{
1198 return Message();
1199}
1200
1201template <class T>
1202Message
1203Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1204{
1205 return Message();
1206}
1207
1208template <class T>
1209Message
1210Arecv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1211{
1212 return Message();
1213}
1214
1215template <class T>
1216Message
1217Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1218{
1219 return Message();
1220}
1221
1222template <class T>
1223Message
1224Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1225{
1226 return Message();
1227}
1228
1229template <class T>
1230Message
1231Recv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1232{
1233 return Message();
1234}
1235
1236template <class T>
1237void
1238Bcast(T* /*t*/, size_t /*n*/, int /*root*/)
1239{}
1240
1241template <class T>
1242void
1243Bcast(T* /*t*/, size_t /*n*/, int /*root*/, const MPI_Comm & /*comm*/)
1244{}
1245
1246template <class T, class T1>
1247void
1248Gather (const T* t, size_t n, T1* t1, size_t n1, int /*root*/)
1249{
1250 BL_ASSERT(n == n1);
1252
1253 int const sc = static_cast<int>(n);
1254 for (int j=0; j<sc; ++j) { t1[j] = t[j]; }
1255}
1256
1257template <class T>
1258std::vector<T>
1259Gather(const T& t, int /*root*/)
1260{
1261 std::vector<T> resl(1);
1262 resl[0] = t;
1263 return resl;
1264}
1265
1266template <class T>
1267void
1268Gatherv (const T* send, int sc,
1269 T* recv, const std::vector<int>& /*rc*/,
1270 const std::vector<int>& /*disp*/, int /*root*/)
1271{
1272 for (int j=0; j<sc; ++j) { recv[j] = send[j]; }
1273}
1274
1275template <class T>
1276void
1278 Vector<T>& recvbuf, int /*root*/)
1279{
1280 recvbuf.resize(sendbuf.size());
1281
1282 for (int i=0; i<sendbuf.size(); ++i)
1283 {
1284 recvbuf[i] = sendbuf[i];
1285 }
1286}
1287
1288template <class T, class T1>
1289void
1290Scatter(T* /*t*/, size_t /*n*/, const T1* /*t1*/, size_t /*n1*/, int /*root*/)
1291{}
1292
1293}
1294#endif
1295
1296namespace ParallelDescriptor {
1297
1298#ifdef AMREX_USE_MPI
1299
1301namespace detail {
1302
1303template<typename T>
1304void DoAllReduce (T* r, MPI_Op op, int cnt)
1305{
1306#ifdef BL_LAZY
1307 Lazy::EvalReduction();
1308#endif
1309
1310 BL_ASSERT(cnt > 0);
1311
1312 BL_MPI_REQUIRE( MPI_Allreduce(MPI_IN_PLACE, r, cnt,
1313 Mpi_typemap<T>::type(), op,
1314 Communicator()) );
1315}
1316
1317template<typename T>
1318void DoReduce (T* r, MPI_Op op, int cnt, int cpu)
1319{
1320#ifdef BL_LAZY
1321 Lazy::EvalReduction();
1322#endif
1323
1324 BL_ASSERT(cnt > 0);
1325
1326 if (MyProc() == cpu) {
1327 BL_MPI_REQUIRE( MPI_Reduce(MPI_IN_PLACE, r, cnt,
1328 Mpi_typemap<T>::type(), op,
1329 cpu, Communicator()) );
1330 } else {
1331 BL_MPI_REQUIRE( MPI_Reduce(r, r, cnt,
1332 Mpi_typemap<T>::type(), op,
1333 cpu, Communicator()) );
1334 }
1335}
1336
1337}
1339
1341 template <std::floating_point T>
1342 void ReduceRealSum (T& rvar) {
1343 detail::DoAllReduce<T>(&rvar,MPI_SUM,1);
1344 }
1345
1346 template <std::floating_point T>
1347 void ReduceRealSum (T* rvar, int cnt) {
1348 detail::DoAllReduce<T>(rvar,MPI_SUM,cnt);
1349 }
1350
1351 template <std::floating_point T>
1352 void ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar)
1353 {
1354 int cnt = rvar.size();
1355 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1356 detail::DoAllReduce<T>(tmp.data(),MPI_SUM,cnt);
1357 for (int i = 0; i < cnt; ++i) {
1358 rvar[i].get() = tmp[i];
1359 }
1360 }
1361
1363 template <std::floating_point T>
1364 void ReduceRealSum (T& rvar, int cpu) {
1365 detail::DoReduce<T>(&rvar,MPI_SUM,1,cpu);
1366 }
1367
1368 template <std::floating_point T>
1369 void ReduceRealSum (T* rvar, int cnt, int cpu) {
1370 detail::DoReduce<T>(rvar,MPI_SUM,cnt,cpu);
1371 }
1372
1373 template <std::floating_point T>
1374 void ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1375 {
1376 int cnt = rvar.size();
1377 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1378 detail::DoReduce<T>(tmp.data(),MPI_SUM,cnt,cpu);
1379 for (int i = 0; i < cnt; ++i) {
1380 rvar[i].get() = tmp[i];
1381 }
1382 }
1383
1385 template <std::floating_point T>
1386 void ReduceRealMax (T& rvar) {
1387 detail::DoAllReduce<T>(&rvar,MPI_MAX,1);
1388 }
1389
1390 template <std::floating_point T>
1391 void ReduceRealMax (T* rvar, int cnt) {
1392 detail::DoAllReduce<T>(rvar,MPI_MAX,cnt);
1393 }
1394
1395 template <std::floating_point T>
1396 void ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar)
1397 {
1398 int cnt = rvar.size();
1399 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1400 detail::DoAllReduce<T>(tmp.data(),MPI_MAX,cnt);
1401 for (int i = 0; i < cnt; ++i) {
1402 rvar[i].get() = tmp[i];
1403 }
1404 }
1405
1407 template <std::floating_point T>
1408 void ReduceRealMax (T& rvar, int cpu) {
1409 detail::DoReduce<T>(&rvar,MPI_MAX,1,cpu);
1410 }
1411
1412 template <std::floating_point T>
1413 void ReduceRealMax (T* rvar, int cnt, int cpu) {
1414 detail::DoReduce<T>(rvar,MPI_MAX,cnt,cpu);
1415 }
1416
1417 template <std::floating_point T>
1418 void ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1419 {
1420 int cnt = rvar.size();
1421 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1422 detail::DoReduce<T>(tmp.data(),MPI_MAX,cnt,cpu);
1423 for (int i = 0; i < cnt; ++i) {
1424 rvar[i].get() = tmp[i];
1425 }
1426 }
1427
1429 template <std::floating_point T>
1430 void ReduceRealMin (T& rvar) {
1431 detail::DoAllReduce<T>(&rvar,MPI_MIN,1);
1432 }
1433
1434 template <std::floating_point T>
1435 void ReduceRealMin (T* rvar, int cnt) {
1436 detail::DoAllReduce<T>(rvar,MPI_MIN,cnt);
1437 }
1438
1439 template <std::floating_point T>
1440 void ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar)
1441 {
1442 int cnt = rvar.size();
1443 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1444 detail::DoAllReduce<T>(tmp.data(),MPI_MIN,cnt);
1445 for (int i = 0; i < cnt; ++i) {
1446 rvar[i].get() = tmp[i];
1447 }
1448 }
1449
1451 template <std::floating_point T>
1452 void ReduceRealMin (T& rvar, int cpu) {
1453 detail::DoReduce<T>(&rvar,MPI_MIN,1,cpu);
1454 }
1455
1456 template <std::floating_point T>
1457 void ReduceRealMin (T* rvar, int cnt, int cpu) {
1458 detail::DoReduce<T>(rvar,MPI_MIN,cnt,cpu);
1459 }
1460
1461 template <std::floating_point T>
1462 void ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1463 {
1464 int cnt = rvar.size();
1465 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1466 detail::DoReduce<T>(tmp.data(),MPI_MIN,cnt,cpu);
1467 for (int i = 0; i < cnt; ++i) {
1468 rvar[i].get() = tmp[i];
1469 }
1470 }
1471
1472#else
1473
1475 template <std::floating_point T>
1476 void ReduceRealSum (T& ) {}
1477
1478 template <std::floating_point T>
1479 void ReduceRealSum (T*, int) {}
1480
1481 template <std::floating_point T>
1482 void ReduceRealSum (Vector<std::reference_wrapper<T> > const&) {}
1483
1485 template <std::floating_point T>
1486 void ReduceRealSum (T&, int) {}
1487
1488 template <std::floating_point T>
1489 void ReduceRealSum (T*, int, int) {}
1490
1491 template <std::floating_point T>
1492 void ReduceRealSum (Vector<std::reference_wrapper<T> > const&, int) {}
1493
1495 template <std::floating_point T>
1496 void ReduceRealMax (T&) {}
1497
1498 template <std::floating_point T>
1499 void ReduceRealMax (T*, int) {}
1500
1501 template <std::floating_point T>
1502 void ReduceRealMax (Vector<std::reference_wrapper<T> > const&) {}
1503
1505 template <std::floating_point T>
1506 void ReduceRealMax (T&, int) {}
1507
1508 template <std::floating_point T>
1509 void ReduceRealMax (T*, int, int) {}
1510
1511 template <std::floating_point T>
1512 void ReduceRealMax (Vector<std::reference_wrapper<T> > const&, int) {}
1513
1515 template <std::floating_point T>
1516 void ReduceRealMin (T&) {}
1517
1518 template <std::floating_point T>
1519 void ReduceRealMin (T*, int) {}
1520
1521 template <std::floating_point T>
1522 void ReduceRealMin (Vector<std::reference_wrapper<T> > const&) {}
1523
1525 template <std::floating_point T>
1526 void ReduceRealMin (T&, int) {}
1527
1528 template <std::floating_point T>
1529 void ReduceRealMin (T*, int, int) {}
1530
1531 template <std::floating_point T>
1532 void ReduceRealMin (Vector<std::reference_wrapper<T> > const&, int) {}
1533
1534#endif
1535}
1536
1537#ifdef AMREX_USE_MPI
1538namespace ParallelDescriptor {
1539
1540template <class T>
1542{
1544 {
1545 static_assert(std::is_same<T,double>() ||
1546 std::is_same<T,float >(),
1547 "Unsupported type T for GpuComplex");
1548 if constexpr (std::is_same<T,double>()) {
1549 return MPI_C_DOUBLE_COMPLEX;
1550 } else {
1551 return MPI_C_FLOAT_COMPLEX;
1552 }
1553 }
1554};
1555
1556template<typename TV, typename TI>
1558{
1560 {
1561 static MPI_Datatype mpi_type = MPI_DATATYPE_NULL;
1562 if (mpi_type == MPI_DATATYPE_NULL) {
1563 using T = ValLocPair<TV,TI>;
1564 static_assert(std::is_trivially_copyable_v<T>,
1565 "To communicate with MPI, ValLocPair must be trivially copyable.");
1566 static_assert(std::is_standard_layout_v<T>,
1567 "To communicate with MPI, ValLocPair must be standard layout");
1568
1569 T vlp[2];
1570 MPI_Datatype types[] = {
1573 };
1574 int blocklens[] = { 1, 1 };
1575 MPI_Aint disp[2];
1576 BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].value, &disp[0]) );
1577 BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].index, &disp[1]) );
1578 disp[1] -= disp[0];
1579 disp[0] = 0;
1580 BL_MPI_REQUIRE( MPI_Type_create_struct(2, blocklens, disp, types,
1581 &mpi_type) );
1582 MPI_Aint lb, extent;
1583 BL_MPI_REQUIRE( MPI_Type_get_extent(mpi_type, &lb, &extent) );
1584 if (extent != sizeof(T)) {
1585 MPI_Datatype tmp = mpi_type;
1586 BL_MPI_REQUIRE( MPI_Type_create_resized(tmp, 0, sizeof(vlp[0]), &mpi_type) );
1587 BL_MPI_REQUIRE( MPI_Type_free(&tmp) );
1588 }
1589 BL_MPI_REQUIRE( MPI_Type_commit( &mpi_type ) );
1590
1591 m_mpi_types.push_back(&mpi_type);
1592 }
1593 return mpi_type;
1594 }
1595};
1596
1597template <typename T, typename F>
1599{
1600 static MPI_Op mpi_op = MPI_OP_NULL;
1601 if (mpi_op == MPI_OP_NULL) {
1602 static auto user_fn = [] (void *invec, void *inoutvec, int* len, // NOLINT
1603 MPI_Datatype * /*datatype*/)
1604 {
1605 auto in = static_cast<T const*>(invec);
1606 auto out = static_cast<T*>(inoutvec);
1607 for (int i = 0; i < *len; ++i) {
1608 out[i] = F()(in[i],out[i]);
1609 }
1610 };
1611 BL_MPI_REQUIRE( MPI_Op_create(user_fn, 1, &mpi_op) );
1612 m_mpi_ops.push_back(&mpi_op);
1613 }
1614 return mpi_op;
1615}
1616
1617}
1618#endif
1619
1620}
1621
1622#endif /*BL_PARALLELDESCRIPTOR_H*/
#define BL_COMM_PROFILE(cft, size, pid, tag)
Definition AMReX_BLProfiler.H:587
#define BL_PROFILE_T_S(fname, T)
Definition AMReX_BLProfiler.H:554
#define BL_ASSERT(EX)
Definition AMReX_BLassert.H:39
#define AMREX_EXPORT
Definition AMReX_Extension.H:191
int size() const noexcept
Return the number of FABs in the FabArray.
Definition AMReX_FabArrayBase.H:110
a one-thingy-per-box distributed object
Definition AMReX_LayoutData.H:13
Hold the description and status of communication data.
Definition AMReX_ParallelDescriptor.H:57
Message(MPI_Request req_, MPI_Datatype type_)
Definition AMReX_ParallelDescriptor.H:61
MPI_Datatype type() const
Definition AMReX_ParallelDescriptor.H:73
MPI_Request req() const
Definition AMReX_ParallelDescriptor.H:74
bool test()
Definition AMReX_ParallelDescriptor.cpp:1191
MPI_Status stat() const
Definition AMReX_ParallelDescriptor.H:75
void wait()
Definition AMReX_ParallelDescriptor.cpp:1187
Message(MPI_Status stat_, MPI_Datatype type_)
Definition AMReX_ParallelDescriptor.H:65
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:29
amrex_real Real
Floating Point Type for Fields.
Definition AMReX_REAL.H:79
amrex_long Long
Definition AMReX_INT.H:30
void ReduceIntMax(int &)
Definition AMReX_ParallelDescriptor.cpp:1266
void ReduceLongAnd(Long &)
Definition AMReX_ParallelDescriptor.cpp:1235
void ReduceBoolAnd(bool &)
Definition AMReX_ParallelDescriptor.cpp:1289
int MyProc() noexcept
Definition AMReX_ParallelDescriptor.H:128
void Gather(Real const *sendbuf, int nsend, Real *recvbuf, int root)
Definition AMReX_ParallelDescriptor.cpp:1173
void Barrier(const std::string &)
Definition AMReX_ParallelDescriptor.cpp:1215
void ReduceIntSum(int &)
Definition AMReX_ParallelDescriptor.cpp:1265
void ReduceBoolOr(bool &)
Definition AMReX_ParallelDescriptor.cpp:1290
void ReadAndBcastFile(const std::string &filename, Vector< char > &charBuf, bool bExitOnError, const MPI_Comm &comm)
Definition AMReX_ParallelDescriptor.cpp:1495
int NProcs() noexcept
Definition AMReX_ParallelDescriptor.H:255
void ReduceLongSum(Long &)
Definition AMReX_ParallelDescriptor.cpp:1236
int IOProcessorNumber() noexcept
The MPI rank number of the I/O Processor (probably rank 0). This rank is usually used to write to std...
Definition AMReX_ParallelDescriptor.H:279
void ReduceLongMax(Long &)
Definition AMReX_ParallelDescriptor.cpp:1237
bool IOProcessor() noexcept
Is this CPU the I/O Processor? To get the rank number, call IOProcessorNumber()
Definition AMReX_ParallelDescriptor.H:289
void ReduceLongMin(Long &)
Definition AMReX_ParallelDescriptor.cpp:1238
void ReduceIntMin(int &)
Definition AMReX_ParallelDescriptor.cpp:1267
void EvalReduction()
Definition AMReX_Lazy.cpp:20
int NProcs()
Process ID in MPI_COMM_WORLD.
Definition AMReX_MPMD.cpp:122
int MyProc()
Definition AMReX_MPMD.cpp:117
int MyProcAll() noexcept
my rank in world communicator
Definition AMReX_ParallelContext.H:61
MPI_Comm CommunicatorSub() noexcept
sub-communicator for current frame
Definition AMReX_ParallelContext.H:70
int get_inc_mpi_tag() noexcept
get and increment mpi tag in current frame
Definition AMReX_ParallelContext.H:93
int MyProcSub() noexcept
my sub-rank in current frame
Definition AMReX_ParallelContext.H:76
int NProcsAll() noexcept
number of ranks in world communicator
Definition AMReX_ParallelContext.H:59
int NProcsSub() noexcept
number of ranks in current frame
Definition AMReX_ParallelContext.H:74
bool sameTeam(int rank) noexcept
Definition AMReX_ParallelDescriptor.H:345
int RankInLeadComm(int rank) noexcept
Definition AMReX_ParallelDescriptor.H:355
void Test(MPI_Request &, int &, MPI_Status &)
Definition AMReX_ParallelDescriptor.cpp:1220
const char * ErrorString(int)
ErrorString return string associated with error internal error condition.
Definition AMReX_ParallelDescriptor.cpp:1213
Message Asend(const T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1154
MPI_Comm Communicator() noexcept
Definition AMReX_ParallelDescriptor.H:223
void EndTeams()
Definition AMReX_ParallelDescriptor.cpp:1656
void Waitany(Vector< MPI_Request > &, int &, MPI_Status &)
Definition AMReX_ParallelDescriptor.cpp:1312
void Gatherv(const T *send, int sc, T *recv, const std::vector< int > &rc, const std::vector< int > &disp, int root)
Definition AMReX_ParallelDescriptor.H:1268
Message Abarrier()
Definition AMReX_ParallelDescriptor.cpp:1217
void ReduceRealMin(Vector< std::reference_wrapper< Real > > const &)
Definition AMReX_ParallelDescriptor.cpp:1229
void StartParallel(int *, char ***, MPI_Comm)
Perform any needed parallel initialization. This MUST be the first routine in this class called from ...
Definition AMReX_ParallelDescriptor.cpp:1165
bool isTeamLead() noexcept
Definition AMReX_ParallelDescriptor.H:340
void Wait(MPI_Request &, MPI_Status &)
Definition AMReX_ParallelDescriptor.cpp:1304
std::string mpi_level_to_string(int mtlev)
Definition AMReX_ParallelDescriptor.cpp:1662
void ReduceRealSum(Vector< std::reference_wrapper< Real > > const &)
Definition AMReX_ParallelDescriptor.cpp:1227
void Waitsome(Vector< MPI_Request > &, int &, Vector< int > &, Vector< MPI_Status > &)
Definition AMReX_ParallelDescriptor.cpp:1316
bool UseGpuAwareMpi()
Definition AMReX_ParallelDescriptor.H:113
std::pair< int, int > team_range(int begin, int end, int rit=-1, int nworkers=0) noexcept
Definition AMReX_ParallelDescriptor.H:370
int MyTeamLead() noexcept
Definition AMReX_ParallelDescriptor.H:325
void Waitall(Vector< MPI_Request > &, Vector< MPI_Status > &)
Definition AMReX_ParallelDescriptor.cpp:1308
int MyTeamColor() noexcept
Definition AMReX_ParallelDescriptor.H:320
bool doTeamReduce() noexcept
Definition AMReX_ParallelDescriptor.H:360
int MinTag() noexcept
Definition AMReX_ParallelDescriptor.H:220
int TeamLead(int rank) noexcept
Definition AMReX_ParallelDescriptor.H:335
int MyRankInProcessor() noexcept
Definition AMReX_ParallelDescriptor.H:243
Message Send(const T *buf, size_t n, int dst_pid, int tag)
Definition AMReX_ParallelDescriptor.H:1175
void Initialize()
Definition AMReX_ParallelDescriptor.cpp:1546
void Finalize()
Definition AMReX_ParallelDescriptor.cpp:1590
int TeamSize() noexcept
Definition AMReX_ParallelDescriptor.H:310
void EndParallel()
Perform any needed parallel finalization. This MUST be the last routine in this class called from wit...
Definition AMReX_ParallelDescriptor.cpp:1197
void Bcast(void *, int, MPI_Datatype, int, MPI_Comm)
Definition AMReX_ParallelDescriptor.cpp:1295
int SeqNum() noexcept
Returns sequential message sequence numbers, usually used as tags for send/recv.
Definition AMReX_ParallelDescriptor.H:678
void IProbe(int, int, int &, MPI_Status &)
Definition AMReX_ParallelDescriptor.cpp:1222
int MyRankInNode() noexcept
Definition AMReX_ParallelDescriptor.H:233
void GatherLayoutDataToVector(const LayoutData< T > &sendbuf, Vector< T > &recvbuf, int root)
Gather LayoutData values to a vector on root.
Definition AMReX_ParallelDescriptor.H:1277
int MaxTag() noexcept
Definition AMReX_ParallelDescriptor.H:221
void Comm_dup(MPI_Comm, MPI_Comm &)
Definition AMReX_ParallelDescriptor.cpp:1225
int NProcsPerProcessor() noexcept
Definition AMReX_ParallelDescriptor.H:238
double second() noexcept
Returns wall-clock seconds since start of execution.
Definition AMReX_ParallelDescriptor.cpp:1298
const std::string Unnamed("Unnamed")
Used as default argument to ParallelDescriptor::Barrier().
int MyRankInTeam() noexcept
Definition AMReX_ParallelDescriptor.H:330
void Scatter(T *, size_t n, const T1 *, size_t n1, int root)
Definition AMReX_ParallelDescriptor.H:1290
int NProcsPerNode() noexcept
Definition AMReX_ParallelDescriptor.H:228
void Abort(int errorcode, bool backtrace)
Abort with specified error code.
Definition AMReX_ParallelDescriptor.cpp:1204
void team_for(int begin, int end, const F &f)
Definition AMReX_ParallelDescriptor.H:414
int NTeams() noexcept
Definition AMReX_ParallelDescriptor.H:315
Message Recv(T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1217
void StartTeams()
Split the process pool into teams.
Definition AMReX_ParallelDescriptor.cpp:1599
Message Arecv(T *, size_t n, int pid, int tag)
Definition AMReX_ParallelDescriptor.H:1196
MPI_Op Mpi_op()
Definition AMReX_ParallelDescriptor.H:1598
void ReduceRealMax(Vector< std::reference_wrapper< Real > > const &)
Definition AMReX_ParallelDescriptor.cpp:1228
const ProcessTeam & MyTeam() noexcept
Definition AMReX_ParallelDescriptor.H:365
static constexpr int MPI_DATATYPE_NULL
Definition AMReX_ccse-mpi.H:56
static constexpr int MPI_PROC_NULL
Definition AMReX_ccse-mpi.H:61
int MPI_Comm
Definition AMReX_ccse-mpi.H:51
static constexpr int MPI_COMM_WORLD
Definition AMReX_ccse-mpi.H:58
int MPI_Group
Definition AMReX_ccse-mpi.H:52
int MPI_Op
Definition AMReX_ccse-mpi.H:50
int MPI_Request
Definition AMReX_ccse-mpi.H:54
static constexpr int MPI_REQUEST_NULL
Definition AMReX_ccse-mpi.H:57
int MPI_Datatype
Definition AMReX_ccse-mpi.H:53
Definition AMReX_Amr.cpp:50
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
__host__ __device__ Dim3 begin(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2018
__host__ __device__ Dim3 end(BoxND< dim > const &box) noexcept
Definition AMReX_Box.H:2028
A host / device complex number type, because std::complex doesn't work in device code with Cuda yet.
Definition AMReX_GpuComplex.H:30
static MPI_Datatype type()
Definition AMReX_ParallelDescriptor.H:1543
static MPI_Datatype type()
Definition AMReX_ParallelDescriptor.H:1559
Communication datatype (note: this structure also works without MPI)
Definition AMReX_ccse-mpi.H:78
Provide functionalities needed to construct a team of processes to perform a particular job.
Definition AMReX_ParallelDescriptor.H:148
MPI_Comm team_t
Definition AMReX_ParallelDescriptor.H:149
void clear()
free a communicator
Definition AMReX_ParallelDescriptor.H:180
void Barrier() const
synchronize processes within the team
Definition AMReX_ParallelDescriptor.H:152
const MPI_Comm & get_team_comm() const
return the communicator
Definition AMReX_ParallelDescriptor.H:193
const MPI_Comm & get_lead_comm() const
Definition AMReX_ParallelDescriptor.H:194
void MemoryBarrier() const
memory fence
Definition AMReX_ParallelDescriptor.H:161
const team_t & get() const
Definition AMReX_ParallelDescriptor.H:189
Definition AMReX_ValLocPair.H:10
Definition AMReX_ccse-mpi.H:55