Block-Structured AMR Software Framework
 
AMReX_ParallelDescriptor.H
Go to the documentation of this file.
1#ifndef BL_PARALLELDESCRIPTOR_H
2#define BL_PARALLELDESCRIPTOR_H
3#include <AMReX_Config.H>
4
5#include <AMReX_ccse-mpi.H>
7#include <AMReX_BLBackTrace.H>
8#include <AMReX_BLProfiler.H>
9#include <AMReX_BLassert.H>
10#include <AMReX_Extension.H>
11#include <AMReX_INT.H>
12#include <AMReX_REAL.H>
13#include <AMReX_GpuComplex.H>
14#include <AMReX_Array.H>
15#include <AMReX_Vector.H>
16#include <AMReX_ValLocPair.H>
17
18#ifndef BL_AMRPROF
19#include <AMReX_Box.H>
20#endif
21
22#ifdef BL_LAZY
23#include <AMReX_Lazy.H>
24#endif
25
26#ifdef AMREX_PMI
27#include <pmi.h>
28#endif
29
30#ifdef AMREX_USE_OMP
31#include <omp.h>
32#endif
33
34#include <algorithm>
35#include <atomic>
36#include <csignal>
37#include <functional>
38#include <limits>
39#include <numeric>
40#include <string>
41#include <typeinfo>
42#include <vector>
43
44namespace amrex {
45
46template <typename T> class LayoutData;
47
49namespace ParallelDescriptor
50{
51 //
53 const std::string Unnamed("Unnamed");
54
56 class Message
57 {
58 public:
59
60 Message () = default;
61 Message (MPI_Request req_, MPI_Datatype type_) :
62 m_finished(false),
63 m_type(type_),
64 m_req(req_) {}
65 Message (MPI_Status stat_, MPI_Datatype type_) :
66 m_type(type_),
67 m_stat(stat_) {}
68 void wait ();
69 bool test ();
70 size_t count () const;
71 int tag () const;
72 int pid () const;
73 MPI_Datatype type () const { return m_type; }
74 MPI_Request req () const { return m_req; }
75 MPI_Status stat () const { return m_stat; }
76
77 private:
78
79 bool m_finished = true;
80 MPI_Datatype m_type = MPI_DATATYPE_NULL;
81 MPI_Request m_req = MPI_REQUEST_NULL;
82 mutable MPI_Status m_stat{};
83 };
84
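// Editorial example (not part of the original header): a minimal sketch of a
// non-blocking exchange with the Message handles returned by Asend/Arecv,
// which are declared later in this file. Ranks, tag, and buffer size are
// illustrative placeholders.
//
//   std::vector<double> sendbuf(100, 1.0), recvbuf(100);
//   auto rmsg = amrex::ParallelDescriptor::Arecv(recvbuf.data(), recvbuf.size(), src_rank, tag);
//   auto smsg = amrex::ParallelDescriptor::Asend(sendbuf.data(), sendbuf.size(), dst_rank, tag);
//   smsg.wait();  // block until the send completes
//   rmsg.wait();  // block until the receive completes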
85#ifdef BL_USE_MPI
86 void MPI_Error(const char* file, int line, const char* str, int rc);
87
88#define BL_MPI_REQUIRE(x) \
89do \
90{ \
91 if ( int l_status_ = (x) ) \
92 { \
93 amrex::ParallelDescriptor::MPI_Error(__FILE__,__LINE__,#x, l_status_); \
94 } \
95} \
96while ( false )
97#endif
98
103 void StartParallel (int* argc = nullptr,
104 char*** argv = nullptr,
105 MPI_Comm mpi_comm = MPI_COMM_WORLD);
106
107 void Initialize ();
108 void Finalize ();
109
110 extern AMREX_EXPORT bool use_gpu_aware_mpi;
111 inline bool UseGpuAwareMpi () { return use_gpu_aware_mpi; }
112
114 void StartTeams ();
115 void EndTeams ();
116
121 void EndParallel ();
122
124 inline int
125 MyProc () noexcept
126 {
127 return ParallelContext::MyProcAll();
128 }
129 inline int
130 MyProc (MPI_Comm comm) noexcept
131 {
132#ifdef BL_USE_MPI
133 int r;
134 MPI_Comm_rank(comm,&r);
135 return r;
136#else
137 amrex::ignore_unused(comm);
138 return 0;
139#endif
140 }
141
143 struct ProcessTeam
144 {
145 using team_t = MPI_Comm;
146
148 void Barrier () const {
149 if (m_size > 1) {
150#if defined(BL_USE_MPI3)
151 MPI_Barrier(m_team_comm);
152#endif
153 }
154 }
155
157 void MemoryBarrier () const {
158 if (m_size > 1) {
159#ifdef AMREX_USE_OMP
160 if (omp_in_parallel()) {
161 #pragma omp barrier
162 }
163 #pragma omp master
164#endif
165 {
166#if defined(BL_USE_MPI3)
167 std::atomic_thread_fence(std::memory_order_release);
168 MPI_Barrier(m_team_comm);
169 std::atomic_thread_fence(std::memory_order_acquire);
170#endif
171 }
172 }
173 }
174
176 void clear () {
177#if defined(BL_USE_MPI3)
178 if (m_size > 1) {
179 MPI_Comm_free(&m_team_comm);
180 if (m_rankInTeam==0) { MPI_Comm_free(&m_lead_comm); }
181 }
182#endif
183 }
184
185 [[nodiscard]] const team_t& get() const {
186 return m_team_comm;
187 }
189 [[nodiscard]] const MPI_Comm& get_team_comm() const { return m_team_comm; }
190 [[nodiscard]] const MPI_Comm& get_lead_comm() const { return m_lead_comm; }
191
192 int m_numTeams;
193 int m_size;
194 int m_color;
195 int m_lead;
196 int m_rankInTeam;
197 int m_do_team_reduce;
198
199 MPI_Comm m_team_comm;
200 MPI_Comm m_lead_comm;
201 };
202
203 extern AMREX_EXPORT ProcessTeam m_Team;
204
205 extern AMREX_EXPORT int m_MinTag, m_MaxTag;
206 inline int MinTag () noexcept { return m_MinTag; }
207 inline int MaxTag () noexcept { return m_MaxTag; }
208
209 extern AMREX_EXPORT MPI_Comm m_comm;
210 inline MPI_Comm Communicator () noexcept { return m_comm; }
211
212 extern AMREX_EXPORT int m_nprocs_per_node;
216 inline int NProcsPerNode () noexcept { return m_nprocs_per_node; }
217
218 extern AMREX_EXPORT int m_rank_in_node;
222 inline int MyRankInNode () noexcept { return m_rank_in_node; }
223
224 extern AMREX_EXPORT int m_nprocs_per_processor;
228 inline int NProcsPerProcessor () noexcept { return m_nprocs_per_processor; }
229
230 extern AMREX_EXPORT int m_rank_in_processor;
234 inline int MyRankInProcessor () noexcept { return m_rank_in_processor; }
235
236#ifdef AMREX_USE_MPI
237 extern Vector<MPI_Datatype*> m_mpi_types;
238 extern Vector<MPI_Op*> m_mpi_ops;
239#endif
240
242 inline int
243 NProcs () noexcept
244 {
245 return ParallelContext::NProcsAll();
246 }
247
248 inline int
249 NProcs (MPI_Comm comm) noexcept
250 {
251#ifdef BL_USE_MPI
252 int s;
253 BL_MPI_REQUIRE(MPI_Comm_size(comm, &s));
254 return s;
255#else
256 amrex::ignore_unused(comm);
257 return 1;
258#endif
259 }
264 extern AMREX_EXPORT const int ioProcessor;
265 inline int
266 IOProcessorNumber () noexcept
267 {
268 return ioProcessor;
269 }
274 inline bool
275 IOProcessor () noexcept
276 {
277 return MyProc() == IOProcessorNumber();
278 }
279
280 inline int
281 IOProcessorNumber (MPI_Comm comm) noexcept
282 {
283 return (comm == ParallelDescriptor::Communicator()) ? ioProcessor : 0;
284 }
285
286 inline bool
287 IOProcessor (MPI_Comm comm) noexcept
288 {
289 return MyProc(comm) == IOProcessorNumber(comm);
290 }
291
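// Editorial example (not part of the original header): the usual guard for
// rank-0-only output; IOProcessor() simply compares MyProc() against
// IOProcessorNumber(). <iostream> is assumed to be included by the caller.
//
//   if (amrex::ParallelDescriptor::IOProcessor()) {
//       std::cout << "Running on " << amrex::ParallelDescriptor::NProcs() << " ranks\n";
//   }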
292 //
293 inline int
294 TeamSize () noexcept
295 {
296 return m_Team.m_size;
297 }
298 inline int
299 NTeams () noexcept
300 {
301 return m_Team.m_numTeams;
302 }
303 inline int
304 MyTeamColor () noexcept
305 {
306 return m_Team.m_color;
307 }
308 inline int
309 MyTeamLead () noexcept
310 {
311 return m_Team.m_lead;
312 }
313 inline int
314 MyRankInTeam () noexcept
315 {
316 return m_Team.m_rankInTeam;
317 }
318 inline int
319 TeamLead (int rank) noexcept
320 {
321 return (rank >= 0) ? (rank - rank % m_Team.m_size) : MPI_PROC_NULL;
322 }
323 inline bool
324 isTeamLead () noexcept
325 {
326 return MyRankInTeam() == 0;
327 }
328 inline bool
329 sameTeam (int rank) noexcept
330 {
331 return MyTeamLead() == TeamLead(rank);
332 }
333 inline bool
334 sameTeam (int rankA, int rankB) noexcept
335 {
336 return TeamLead(rankA) == TeamLead(rankB);
337 }
338 inline int
339 RankInLeadComm (int rank) noexcept
340 {
341 return (rank >= 0) ? (rank / m_Team.m_size) : MPI_PROC_NULL;
342 }
343 inline bool
344 doTeamReduce () noexcept
345 {
346 return m_Team.m_do_team_reduce;
347 }
348 inline const ProcessTeam&
349 MyTeam () noexcept
350 {
351 return m_Team;
352 }
353 inline std::pair<int,int>
354 team_range (int begin, int end, int rit = -1, int nworkers = 0) noexcept
355 {
356 int rb, re;
357 {
358 if (rit < 0) { rit = ParallelDescriptor::MyRankInTeam(); }
359 if (nworkers == 0) { nworkers = ParallelDescriptor::TeamSize(); }
360 BL_ASSERT(rit<nworkers);
361 if (nworkers == 1) {
362 rb = begin;
363 re = end;
364 } else {
365 int ntot = end - begin;
366 int nr = ntot / nworkers;
367 int nlft = ntot - nr * nworkers;
368 if (rit < nlft) { // get nr+1 items
369 rb = begin + rit * (nr + 1);
370 re = rb + nr + 1;
371 } else { // get nr items
372 rb = begin + rit * nr + nlft;
373 re = rb + nr;
374 }
375 }
376 }
377
378#ifdef AMREX_USE_OMP
379 int nthreads = omp_get_num_threads();
380 if (nthreads > 1) {
381 int tid = omp_get_thread_num();
382 int ntot = re - rb;
383 int nr = ntot / nthreads;
384 int nlft = ntot - nr * nthreads;
385 if (tid < nlft) { // get nr+1 items
386 rb += tid * (nr + 1);
387 re = rb + nr + 1;
388 } else { // get nr items
389 rb += tid * nr + nlft;
390 re = rb + nr;
391 }
392 }
393#endif
394
395 return std::make_pair(rb,re);
396 }
397 template <typename F>
398 void team_for (int begin, int end, const F& f)
399 {
400 const auto& range = team_range(begin, end);
401 for (int i = range.first; i < range.second; ++i) {
402 f(i);
403 }
404 }
405 template <typename F> // rit: rank in team
406 void team_for (int begin, int end, int rit, const F& f)
407 {
408 const auto& range = team_range(begin, end, rit);
409 for (int i = range.first; i < range.second; ++i) {
410 f(i);
411 }
412 }
413 template <typename F> // rit: rank in team
414 void team_for (int begin, int end, int rit, int nworkers, const F& f)
415 {
416 const auto& range = team_range(begin, end, rit, nworkers);
417 for (int i = range.first; i < range.second; ++i) {
418 f(i);
419 }
420 }
421
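// Editorial example (not part of the original header): distributing a loop
// across the members of a process team with team_for, which uses team_range()
// to compute the [begin,end) sub-range owned by this rank (and OpenMP thread,
// if active). 'nitems' and 'work_on' are illustrative placeholders.
//
//   amrex::ParallelDescriptor::team_for(0, nitems, [&] (int i) {
//       work_on(i);
//   });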
422 void Barrier (const std::string& message = Unnamed);
423 void Barrier (const MPI_Comm &comm, const std::string& message = Unnamed);
424 Message Abarrier ();
425 Message Abarrier (const MPI_Comm &comm);
426
427 void Test (MPI_Request& request, int& flag, MPI_Status& status);
428 void Test (Vector<MPI_Request>& request, int& flag, Vector<MPI_Status>& status);
429
430 void Comm_dup (MPI_Comm comm, MPI_Comm& newcomm);
432 void Abort (int errorcode = SIGABRT, bool backtrace = true);
434 const char* ErrorString (int errorcode);
436 double second () noexcept;
437
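// Editorial example (not part of the original header): wall-clock timing of a
// code region using second(), synchronized with Barrier() so all ranks time
// the same interval; do_work() is an illustrative placeholder.
//
//   amrex::ParallelDescriptor::Barrier("before_work");
//   double t0 = amrex::ParallelDescriptor::second();
//   do_work();
//   double dt = amrex::ParallelDescriptor::second() - t0;
//   amrex::ParallelDescriptor::ReduceRealMax(dt);  // slowest rank's time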
439 void ReduceBoolAnd (bool& rvar);
441 void ReduceBoolAnd (bool& rvar, int cpu);
442
444 void ReduceBoolOr (bool& rvar);
446 void ReduceBoolOr (bool& rvar, int cpu);
447
449 template <typename T>
450 std::enable_if_t<std::is_floating_point_v<T>>
451 ReduceRealSum (T& rvar);
452
453 template <typename T>
454 std::enable_if_t<std::is_floating_point_v<T>>
455 ReduceRealSum (T* rvar, int cnt);
456
457 // Having this for backward compatibility
458 void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar);
459 //
460 template <typename T>
461 std::enable_if_t<std::is_floating_point_v<T>>
462 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar);
463
465 template <typename T>
466 std::enable_if_t<std::is_floating_point_v<T>>
467 ReduceRealSum (T& rvar, int cpu);
468
469 template <typename T>
470 std::enable_if_t<std::is_floating_point_v<T>>
471 ReduceRealSum (T* rvar, int cnt, int cpu);
472
473 // Having this for backward compatibility
474 void ReduceRealSum (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
475 //
476 template <typename T>
477 std::enable_if_t<std::is_floating_point_v<T>>
478 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
479
481 template <typename T>
482 std::enable_if_t<std::is_floating_point_v<T>>
483 ReduceRealMax (T& rvar);
484
485 template <typename T>
486 std::enable_if_t<std::is_floating_point_v<T>>
487 ReduceRealMax (T* rvar, int cnt);
488
489 // Having this for backward compatibility
490 void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar);
491 //
492 template <typename T>
493 std::enable_if_t<std::is_floating_point_v<T>>
494 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar);
495
497 template <typename T>
498 std::enable_if_t<std::is_floating_point_v<T>>
499 ReduceRealMax (T& rvar, int cpu);
500
501 template <typename T>
502 std::enable_if_t<std::is_floating_point_v<T>>
503 ReduceRealMax (T* rvar, int cnt, int cpu);
504
505 // Having this for backward compatibility
506 void ReduceRealMax (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
507 //
508 template <typename T>
509 std::enable_if_t<std::is_floating_point_v<T>>
510 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
511
513 template <typename T>
514 std::enable_if_t<std::is_floating_point_v<T>>
515 ReduceRealMin (T& rvar);
516
517 template <typename T>
518 std::enable_if_t<std::is_floating_point_v<T>>
519 ReduceRealMin (T* rvar, int cnt);
520
521 // Having this for backward compatibility
522 void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar);
523 //
524 template <typename T>
525 std::enable_if_t<std::is_floating_point_v<T>>
526 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar);
527
529 template <typename T>
530 std::enable_if_t<std::is_floating_point_v<T>>
531 ReduceRealMin (T& rvar, int cpu);
532
533 template <typename T>
534 std::enable_if_t<std::is_floating_point_v<T>>
535 ReduceRealMin (T* rvar, int cnt, int cpu);
536
537 // Having this for backward compatibility
538 void ReduceRealMin (Vector<std::reference_wrapper<Real> > const& rvar, int cpu);
539 //
540 template <typename T>
541 std::enable_if_t<std::is_floating_point_v<T>>
542 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu);
543
545 void ReduceIntSum (int& rvar);
546 void ReduceIntSum (int* rvar, int cnt);
547 void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar);
549 void ReduceIntSum (int& rvar, int cpu);
550 void ReduceIntSum (int* rvar, int cnt, int cpu);
551 void ReduceIntSum (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
552
554 void ReduceIntMax (int& rvar);
555 void ReduceIntMax (int* rvar, int cnt);
556 void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar);
558 void ReduceIntMax (int& rvar, int cpu);
559 void ReduceIntMax (int* rvar, int cnt, int cpu);
560 void ReduceIntMax (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
561
563 void ReduceIntMin (int& rvar);
564 void ReduceIntMin (int* rvar, int cnt);
565 void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar);
567 void ReduceIntMin (int& rvar, int cpu);
568 void ReduceIntMin (int* rvar, int cnt, int cpu);
569 void ReduceIntMin (Vector<std::reference_wrapper<int> > const& rvar, int cpu);
570
572 void ReduceLongSum (Long& rvar);
573 void ReduceLongSum (Long* rvar, int cnt);
574 void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar);
576 void ReduceLongSum (Long& rvar, int cpu);
577 void ReduceLongSum (Long* rvar, int cnt, int cpu);
578 void ReduceLongSum (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
579
581 void ReduceLongMax (Long& rvar);
582 void ReduceLongMax (Long* rvar, int cnt);
583 void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar);
585 void ReduceLongMax (Long& rvar, int cpu);
586 void ReduceLongMax (Long* rvar, int cnt, int cpu);
587 void ReduceLongMax (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
588
590 void ReduceLongMin (Long& rvar);
591 void ReduceLongMin (Long* rvar, int cnt);
592 void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar);
594 void ReduceLongMin (Long& rvar, int cpu);
595 void ReduceLongMin (Long* rvar, int cnt, int cpu);
596 void ReduceLongMin (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
597
599 void ReduceLongAnd (Long& rvar);
600 void ReduceLongAnd (Long* rvar, int cnt);
601 void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar);
603 void ReduceLongAnd (Long& rvar, int cpu);
604 void ReduceLongAnd (Long* rvar, int cnt, int cpu);
605 void ReduceLongAnd (Vector<std::reference_wrapper<Long> > const& rvar, int cpu);
606
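// Editorial example (not part of the original header): typical use of the
// reduction wrappers declared above. Without a 'cpu' argument the call is an
// all-reduce; with one, only that rank receives the result. The helper
// functions below are illustrative placeholders.
//
//   double local_sum = compute_local_sum();
//   amrex::ParallelDescriptor::ReduceRealSum(local_sum);   // every rank gets the global sum
//   amrex::Long ncells = count_local_cells();
//   amrex::ParallelDescriptor::ReduceLongSum(ncells,
//       amrex::ParallelDescriptor::IOProcessorNumber());   // only the I/O rank gets the total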
608 void Gather (Real const* sendbuf, int nsend, Real* recvbuf, int root);
613 inline int SeqNum () noexcept { return ParallelContext::get_inc_mpi_tag(); }
614
615 template <class T> [[nodiscard]] Message Asend(const T*, size_t n, int pid, int tag);
616 template <class T> [[nodiscard]] Message Asend(const T*, size_t n, int pid, int tag, MPI_Comm comm);
617 template <class T> [[nodiscard]] Message Asend(const std::vector<T>& buf, int pid, int tag);
618
619 template <class T> Message Arecv(T*, size_t n, int pid, int tag);
620 template <class T> Message Arecv(T*, size_t n, int pid, int tag, MPI_Comm comm);
621 template <class T> Message Arecv(std::vector<T>& buf, int pid, int tag);
622
623 template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag);
624 template <class T> Message Send(const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
625 template <class T> Message Send(const std::vector<T>& buf, int dst_pid, int tag);
626
627 template <class T> Message Recv(T*, size_t n, int pid, int tag);
628 template <class T> Message Recv(T*, size_t n, int pid, int tag, MPI_Comm comm);
629 template <class T> Message Recv(std::vector<T>& buf, int pid, int tag);
630
631 template <class T> void Bcast(T*, size_t n, int root = 0);
632 template <class T> void Bcast(T*, size_t n, int root, const MPI_Comm &comm);
633 void Bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm);
634
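// Editorial example (not part of the original header): broadcasting a value
// read on the I/O rank to all ranks with the templated Bcast above;
// read_input() is an illustrative placeholder.
//
//   int nsteps = 0;
//   if (amrex::ParallelDescriptor::IOProcessor()) { nsteps = read_input(); }
//   amrex::ParallelDescriptor::Bcast(&nsteps, 1,
//                                    amrex::ParallelDescriptor::IOProcessorNumber());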
635 template <class T, class T1> void Scatter(T*, size_t n, const T1*, size_t n1, int root);
636
637 template <class T, class T1> void Gather(const T*, size_t n, T1*, size_t n1, int root);
638 template <class T> std::vector<T> Gather(const T&, int root);
639
640 template <class T> void Gatherv (const T* send, int sc,
641 T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
642 int root);
643
645 template <class T> void GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
646 Vector<T>& recvbuf,
647 int root);
648
649 void Wait (MPI_Request& req, MPI_Status& status);
650 void Waitall (Vector<MPI_Request>& reqs, Vector<MPI_Status>& status);
651 void Waitany (Vector<MPI_Request>& reqs, int &index, MPI_Status& status);
652 void Waitsome (Vector<MPI_Request>& reqs, int& completed, Vector<int>& indx, Vector<MPI_Status>& status);
653
654 void ReadAndBcastFile(const std::string &filename, Vector<char> &charBuf,
655 bool bExitOnError = true,
656 const MPI_Comm &comm = Communicator() );
657 void IProbe(int src_pid, int tag, int &mflag, MPI_Status &status);
658 void IProbe(int src_pid, int tag, MPI_Comm comm, int &mflag, MPI_Status &status);
659
665 std::string
666 mpi_level_to_string (int mtlev);
667
668 // PMI = Process Management Interface, available on Crays. Provides API to
669 // query topology of the job.
670#ifdef AMREX_PMI
671 void PMI_Initialize();
672 void PMI_PrintMeshcoords(const pmi_mesh_coord_t *pmi_mesh_coord);
673#endif
674
675#ifdef BL_USE_MPI
676 int select_comm_data_type (std::size_t nbytes);
677 std::size_t sizeof_selected_comm_data_type (std::size_t nbytes);
678#endif
679}
680}
681
682namespace amrex {
683
684#ifdef BL_USE_MPI
685
686template <class T>
687ParallelDescriptor::Message
688ParallelDescriptor::Asend (const T* buf, size_t n, int dst_pid, int tag)
689{
690 return Asend(buf, n, dst_pid, tag, Communicator());
691}
692
693namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
694template <>
695Message
696Asend<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
697
698template <class T>
699Message
700Asend (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
701{
702 static_assert(!std::is_same_v<char,T>, "Asend: char version has been specialized");
703
704 BL_PROFILE_T_S("ParallelDescriptor::Asend(TsiiM)", T);
705 BL_COMM_PROFILE(BLProfiler::AsendTsiiM, n * sizeof(T), dst_pid, tag);
706
707 MPI_Request req;
708 BL_MPI_REQUIRE( MPI_Isend(const_cast<T*>(buf),
709 n,
710 Mpi_typemap<T>::type(),
711 dst_pid,
712 tag,
713 comm,
714 &req) );
715 BL_COMM_PROFILE(BLProfiler::AsendTsiiM, BLProfiler::AfterCall(), dst_pid, tag);
716 return Message(req, Mpi_typemap<T>::type());
717}
718}
719
720template <class T>
721ParallelDescriptor::Message
722ParallelDescriptor::Asend (const std::vector<T>& buf, int dst_pid, int tag)
723{
724 return Asend(buf.data(), buf.size(), dst_pid, tag, Communicator());
725}
726
727template <class T>
728ParallelDescriptor::Message
729ParallelDescriptor::Send (const T* buf, size_t n, int dst_pid, int tag)
730{
731 return Send(buf, n, dst_pid, tag, Communicator());
732}
733
734namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
735template <>
736Message
737Send<char> (const char* buf, size_t n, int dst_pid, int tag, MPI_Comm comm);
738
739template <class T>
740Message
741Send (const T* buf, size_t n, int dst_pid, int tag, MPI_Comm comm)
742{
743 static_assert(!std::is_same_v<char,T>, "Send: char version has been specialized");
744
745 BL_PROFILE_T_S("ParallelDescriptor::Send(Tsii)", T);
746
747#ifdef BL_COMM_PROFILING
748 int dst_pid_world(-1);
749 MPI_Group groupComm, groupWorld;
750 BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
751 BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
752 BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &dst_pid, groupWorld, &dst_pid_world) );
753
754 BL_COMM_PROFILE(BLProfiler::SendTsii, n * sizeof(T), dst_pid_world, tag);
755#endif
756
757 BL_MPI_REQUIRE( MPI_Send(const_cast<T*>(buf),
758 n,
759 Mpi_typemap<T>::type(),
760 dst_pid,
761 tag,
762 comm) );
763 BL_COMM_PROFILE(BLProfiler::SendTsii, BLProfiler::AfterCall(), dst_pid, tag);
764 return Message();
765}
766}
767
768template <class T>
769ParallelDescriptor::Message
770ParallelDescriptor::Send (const std::vector<T>& buf, int dst_pid, int tag)
771{
772 return Send(buf.data(), buf.size(), dst_pid, tag, Communicator());
773}
774
775template <class T>
776ParallelDescriptor::Message
777ParallelDescriptor::Arecv (T* buf, size_t n, int src_pid, int tag)
778{
779 return Arecv(buf, n, src_pid, tag, Communicator());
780}
781
782namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
783template <>
784Message
785Arecv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
786
787template <class T>
788Message
789Arecv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
790{
791 static_assert(!std::is_same_v<char,T>, "Arecv: char version has been specialized");
792
793 BL_PROFILE_T_S("ParallelDescriptor::Arecv(TsiiM)", T);
794 BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, n * sizeof(T), src_pid, tag);
795
796 MPI_Request req;
797 BL_MPI_REQUIRE( MPI_Irecv(buf,
798 n,
799 Mpi_typemap<T>::type(),
800 src_pid,
801 tag,
802 comm,
803 &req) );
804 BL_COMM_PROFILE(BLProfiler::ArecvTsiiM, BLProfiler::AfterCall(), src_pid, tag);
805 return Message(req, Mpi_typemap<T>::type());
806}
807}
808
809template <class T>
810ParallelDescriptor::Message
811ParallelDescriptor::Arecv (std::vector<T>& buf, int src_pid, int tag)
812{
813 return Arecv(buf.data(), buf.size(), src_pid, tag, Communicator());
814}
815
816template <class T>
817ParallelDescriptor::Message
818ParallelDescriptor::Recv (T* buf, size_t n, int src_pid, int tag)
819{
820 return Recv(buf, n, src_pid, tag, Communicator());
821}
822
823namespace ParallelDescriptor { // Have to use namespace here to work around a gcc bug
824template <>
825Message
826Recv<char> (char* buf, size_t n, int src_pid, int tag, MPI_Comm comm);
827
828template <class T>
829Message
830Recv (T* buf, size_t n, int src_pid, int tag, MPI_Comm comm)
831{
832 static_assert(!std::is_same_v<char,T>, "Recv: char version has been specialized");
833
834 BL_PROFILE_T_S("ParallelDescriptor::Recv(Tsii)", T);
835 BL_COMM_PROFILE(BLProfiler::RecvTsii, BLProfiler::BeforeCall(), src_pid, tag);
836
837 MPI_Status stat;
838 BL_MPI_REQUIRE( MPI_Recv(buf,
839 n,
840 Mpi_typemap<T>::type(),
841 src_pid,
842 tag,
843 comm,
844 &stat) );
845#ifdef BL_COMM_PROFILING
846 int src_pid_comm(stat.MPI_SOURCE);
847 int src_pid_world(stat.MPI_SOURCE);
848 if(src_pid_comm != MPI_ANY_SOURCE) {
849 MPI_Group groupComm, groupWorld;
850 BL_MPI_REQUIRE( MPI_Comm_group(comm, &groupComm) );
851 BL_MPI_REQUIRE( MPI_Comm_group(Communicator(), &groupWorld) );
852 BL_MPI_REQUIRE( MPI_Group_translate_ranks(groupComm, 1, &src_pid_comm, groupWorld, &src_pid_world) );
853 }
854
855 BL_COMM_PROFILE(BLProfiler::RecvTsii, n * sizeof(T), src_pid_world, stat.MPI_TAG);
856#endif
857 return Message(stat, Mpi_typemap<T>::type());
858}
859}
860
861template <class T>
862ParallelDescriptor::Message
863ParallelDescriptor::Recv (std::vector<T>& buf, int src_pid, int tag)
864{
865 return Recv(buf.data(), buf.size(), src_pid, tag, Communicator());
866}
867
868template <class T>
869 void
870 ParallelDescriptor::Bcast (T* t,
871 size_t n,
872 int root)
873{
874#ifdef BL_LAZY
875 Lazy::EvalReduction();
876#endif
877
878 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
879
880 BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
881 BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
882
883 BL_MPI_REQUIRE( MPI_Bcast(t,
884 n,
885 Mpi_typemap<T>::type(),
886 root,
887 Communicator()) );
888 BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
889}
890
891template <class T>
892 void
893 ParallelDescriptor::Bcast (T* t,
894 size_t n,
895 int root,
896 const MPI_Comm &comm)
897{
898#ifdef BL_LAZY
899 int r;
900 MPI_Comm_compare(comm, Communicator(), &r);
901 if (r == MPI_IDENT) { Lazy::EvalReduction(); }
902#endif
903
904 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
905
906 BL_PROFILE_T_S("ParallelDescriptor::Bcast(Tsi)", T);
907 BL_COMM_PROFILE(BLProfiler::BCastTsi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
908
909 BL_MPI_REQUIRE( MPI_Bcast(t,
910 n,
911 Mpi_typemap<T>::type(),
912 root,
913 comm) );
914 BL_COMM_PROFILE(BLProfiler::BCastTsi, n * sizeof(T), root, BLProfiler::NoTag());
915}
916
917template <class T, class T1>
918 void
919 ParallelDescriptor::Gather (const T* t,
920 size_t n,
921 T1* t1,
922 size_t n1,
923 int root)
924{
925 BL_PROFILE_T_S("ParallelDescriptor::Gather(TsT1si)", T);
926 BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
927
928 BL_ASSERT(n < static_cast<size_t>(std::numeric_limits<int>::max()));
929 BL_ASSERT(n1 < static_cast<size_t>(std::numeric_limits<int>::max()));
930
931 BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(t),
932 n,
933 Mpi_typemap<T>::type(),
934 t1,
935 n1,
936 Mpi_typemap<T1>::type(),
937 root,
938 Communicator()) );
939 BL_COMM_PROFILE(BLProfiler::GatherTsT1Si, n * sizeof(T), root, BLProfiler::NoTag());
940}
941
942template <class T>
943std::vector<T>
944ParallelDescriptor::Gather (const T& t, int root)
945{
946 BL_PROFILE_T_S("ParallelDescriptor::Gather(Ti)", T);
947 BL_COMM_PROFILE(BLProfiler::GatherTi, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
948
949 std::vector<T> resl;
950 if ( root == MyProc() ) { resl.resize(NProcs()); }
951 BL_MPI_REQUIRE( MPI_Gather(const_cast<T*>(&t),
952 1,
953 Mpi_typemap<T>::type(),
954 resl.data(),
955 1,
956 Mpi_typemap<T>::type(),
957 root,
958 Communicator()) );
959 BL_COMM_PROFILE(BLProfiler::GatherTi, sizeof(T), root, BLProfiler::NoTag());
960 return resl;
961}
962
963template <class T>
964void
965ParallelDescriptor::Gatherv (const T* send, int sc,
966 T* recv, const std::vector<int>& rc, const std::vector<int>& disp,
967 int root)
968{
969 BL_PROFILE_T_S("ParallelDescriptor::Gatherv(Ti)", T);
970 BL_COMM_PROFILE(BLProfiler::Gatherv, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
971
972 MPI_Gatherv(send, sc, ParallelDescriptor::Mpi_typemap<T>::type(),
973 recv, rc.data(), disp.data(), ParallelDescriptor::Mpi_typemap<T>::type(),
974 root, Communicator());
975
976 BL_COMM_PROFILE(BLProfiler::Gatherv, std::accumulate(rc.begin(),rc.end(),0)*sizeof(T), root, BLProfiler::NoTag());
977}
978
979template <class T>
980void
981ParallelDescriptor::GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
982 Vector<T>& recvbuf, int root)
983{
984 BL_PROFILE_T_S("ParallelDescriptor::GatherLayoutData(Ti)", T);
985
986 // Gather prelims
987 Vector<T> T_to_send;
988 T_to_send.reserve(sendbuf.local_size());
989
990 for (int i : sendbuf.IndexArray())
991 {
992 T_to_send.push_back(sendbuf[i]);
993 }
994
995 int nprocs = ParallelContext::NProcsSub();
996 Vector<int> recvcount(nprocs, 0);
997 recvbuf.resize(sendbuf.size());
998 const Vector<int>& old_pmap = sendbuf.DistributionMap().ProcessorMap();
999 for (int i : old_pmap)
1000 {
1001 ++recvcount[i];
1002 }
1003
1004 // Make a map from post-gather to pre-gather index
1005 Vector<Vector<int>> new_ind_to_old_ind(nprocs);
1006 for (int i=0; i<nprocs; ++i)
1007 {
1008 new_ind_to_old_ind[i].reserve(recvcount[i]);
1009 }
1010 for (int i=0; i<old_pmap.size(); ++i)
1011 {
1012 new_ind_to_old_ind[old_pmap[i]].push_back(i);
1013 }
1014
1015 // Flatten
1016 Vector<int> new_index_to_old_index;
1017 new_index_to_old_index.reserve(old_pmap.size());
1018 for (const Vector<int>& v : new_ind_to_old_ind)
1019 {
1020 if (!v.empty())
1021 {
1022 for (int el : v)
1023 {
1024 new_index_to_old_index.push_back(el);
1025 }
1026 }
1027 }
1028
1029 Vector<int> disp(nprocs);
1030 if (!disp.empty()) { disp[0] = 0; }
1031 std::partial_sum(recvcount.begin(), recvcount.end()-1, disp.begin()+1);
1032 Vector<T> new_index_to_T(sendbuf.size());
1033
1034 MPI_Gatherv(T_to_send.data(), T_to_send.size(),
1035 Mpi_typemap<T>::type(),
1036 new_index_to_T.data(), recvcount.data(), disp.data(),
1037 Mpi_typemap<T>::type(),
1038 root, ParallelContext::CommunicatorSub());
1039
1040 // Now work just on the root, which now has global information on the collected;
1041 // LayoutData information; with new_index_to_old_index and new_index_to_T,
1042 // sort the gathered vector in pre-gather index space
1043 if (ParallelContext::MyProcSub() == root)
1044 {
1045 // Invert the map (new_index) --> (old_index)
1046 Vector<int> old_index_to_new_index(sendbuf.size());
1047
1048 for (int i=0; i<old_index_to_new_index.size(); ++i)
1049 {
1050 old_index_to_new_index[new_index_to_old_index[i]] = i;
1051 }
1052
1053 for (int i=0; i<recvbuf.size(); ++i)
1054 {
1055 recvbuf[i] = new_index_to_T[old_index_to_new_index[i]];
1056 }
1057 }
1058}
1059
1060template <class T, class T1>
1061 void
1062 ParallelDescriptor::Scatter (T* t,
1063 size_t n,
1064 const T1* t1,
1065 size_t n1,
1066 int root)
1067{
1068 BL_PROFILE_T_S("ParallelDescriptor::Scatter(TsT1si)", T);
1069 BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, BLProfiler::BeforeCall(), root, BLProfiler::NoTag());
1070
1071 BL_MPI_REQUIRE( MPI_Scatter(const_cast<T1*>(t1),
1072 n1,
1073 Mpi_typemap<T1>::type(),
1074 t,
1075 n,
1076 Mpi_typemap<T>::type(),
1077 root,
1078 Communicator()) );
1079 BL_COMM_PROFILE(BLProfiler::ScatterTsT1si, n * sizeof(T), root, BLProfiler::NoTag());
1080}
1081
1082#else
1083
1084namespace ParallelDescriptor
1085{
1086template <class T>
1087Message
1088Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1089{
1090 return Message();
1091}
1092
1093template <class T>
1094Message
1095Asend(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1096{
1097 return Message();
1098}
1099
1100template <class T>
1101Message
1102Asend(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1103{
1104 return Message();
1105}
1106
1107template <class T>
1108Message
1109Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/)
1110{
1111 return Message();
1112}
1113
1114template <class T>
1115Message
1116Send(const T* /*buf*/, size_t /*n*/, int /*dst_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1117{
1118 return Message();
1119}
1120
1121template <class T>
1122Message
1123Send(const std::vector<T>& /*buf*/, int /*dst_pid*/, int /*tag*/)
1124{
1125 return Message();
1126}
1127
1128template <class T>
1129Message
1130Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1131{
1132 return Message();
1133}
1134
1135template <class T>
1136Message
1137Arecv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1138{
1139 return Message();
1140}
1141
1142template <class T>
1143Message
1144Arecv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1145{
1146 return Message();
1147}
1148
1149template <class T>
1150Message
1151Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/)
1152{
1153 return Message();
1154}
1155
1156template <class T>
1157Message
1158Recv(T* /*buf*/, size_t /*n*/, int /*src_pid*/, int /*tag*/, MPI_Comm /*comm*/)
1159{
1160 return Message();
1161}
1162
1163template <class T>
1164Message
1165Recv(std::vector<T>& /*buf*/, int /*src_pid*/, int /*tag*/)
1166{
1167 return Message();
1168}
1169
1170template <class T>
1171void
1172Bcast(T* /*t*/, size_t /*n*/, int /*root*/)
1173{}
1174
1175template <class T>
1176void
1177Bcast(T* /*t*/, size_t /*n*/, int /*root*/, const MPI_Comm & /*comm*/)
1178{}
1179
1180template <class T, class T1>
1181void
1182Gather (const T* t, size_t n, T1* t1, size_t n1, int /*root*/)
1183{
1184 BL_ASSERT(n == n1);
1185 amrex::ignore_unused(n1);
1186
1187 int const sc = static_cast<int>(n);
1188 for (int j=0; j<sc; ++j) { t1[j] = t[j]; }
1189}
1190
1191template <class T>
1192std::vector<T>
1193Gather(const T& t, int /*root*/)
1194{
1195 std::vector<T> resl(1);
1196 resl[0] = t;
1197 return resl;
1198}
1199
1200template <class T>
1201void
1202Gatherv (const T* send, int sc,
1203 T* recv, const std::vector<int>& /*rc*/,
1204 const std::vector<int>& /*disp*/, int /*root*/)
1205{
1206 for (int j=0; j<sc; ++j) { recv[j] = send[j]; }
1207}
1208
1209template <class T>
1210 void
1211 GatherLayoutDataToVector (const LayoutData<T>& sendbuf,
1212 Vector<T>& recvbuf, int /*root*/)
1213{
1214 recvbuf.resize(sendbuf.size());
1215
1216 for (int i=0; i<sendbuf.size(); ++i)
1217 {
1218 recvbuf[i] = sendbuf[i];
1219 }
1220}
1221
1222template <class T, class T1>
1223void
1224Scatter(T* /*t*/, size_t /*n*/, const T1* /*t1*/, size_t /*n1*/, int /*root*/)
1225{}
1226
1227}
1228#endif
1229
1230namespace ParallelDescriptor {
1231
1232#ifdef AMREX_USE_MPI
1233
1234namespace detail {
1235
1236template<typename T>
1237void DoAllReduce (T* r, MPI_Op op, int cnt)
1238{
1239#ifdef BL_LAZY
1240 Lazy::EvalReduction();
1241#endif
1242
1243 BL_ASSERT(cnt > 0);
1244
1245 BL_MPI_REQUIRE( MPI_Allreduce(MPI_IN_PLACE, r, cnt,
1246 Mpi_typemap<T>::type(), op,
1247 Communicator()) );
1248}
1249
1250template<typename T>
1251void DoReduce (T* r, MPI_Op op, int cnt, int cpu)
1252{
1253#ifdef BL_LAZY
1254 Lazy::EvalReduction();
1255#endif
1256
1257 BL_ASSERT(cnt > 0);
1258
1259 if (MyProc() == cpu) {
1260 BL_MPI_REQUIRE( MPI_Reduce(MPI_IN_PLACE, r, cnt,
1261 Mpi_typemap<T>::type(), op,
1262 cpu, Communicator()) );
1263 } else {
1264 BL_MPI_REQUIRE( MPI_Reduce(r, r, cnt,
1265 Mpi_typemap<T>::type(), op,
1266 cpu, Communicator()) );
1267 }
1268}
1269
1270}
1271
1273 template <typename T>
1274 std::enable_if_t<std::is_floating_point_v<T>>
1275 ReduceRealSum (T& rvar) {
1276 detail::DoAllReduce<T>(&rvar,MPI_SUM,1);
1277 }
1278
1279 template <typename T>
1280 std::enable_if_t<std::is_floating_point_v<T>>
1281 ReduceRealSum (T* rvar, int cnt) {
1282 detail::DoAllReduce<T>(rvar,MPI_SUM,cnt);
1283 }
1284
1285 template <typename T>
1286 std::enable_if_t<std::is_floating_point_v<T>>
1287 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar)
1288 {
1289 int cnt = rvar.size();
1290 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1291 detail::DoAllReduce<T>(tmp.data(),MPI_SUM,cnt);
1292 for (int i = 0; i < cnt; ++i) {
1293 rvar[i].get() = tmp[i];
1294 }
1295 }
1296
1298 template <typename T>
1299 std::enable_if_t<std::is_floating_point_v<T>>
1300 ReduceRealSum (T& rvar, int cpu) {
1301 detail::DoReduce<T>(&rvar,MPI_SUM,1,cpu);
1302 }
1303
1304 template <typename T>
1305 std::enable_if_t<std::is_floating_point_v<T>>
1306 ReduceRealSum (T* rvar, int cnt, int cpu) {
1307 detail::DoReduce<T>(rvar,MPI_SUM,cnt,cpu);
1308 }
1309
1310 template <typename T>
1311 std::enable_if_t<std::is_floating_point_v<T>>
1312 ReduceRealSum (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1313 {
1314 int cnt = rvar.size();
1315 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1316 detail::DoReduce<T>(tmp.data(),MPI_SUM,cnt,cpu);
1317 for (int i = 0; i < cnt; ++i) {
1318 rvar[i].get() = tmp[i];
1319 }
1320 }
1321
1323 template <typename T>
1324 std::enable_if_t<std::is_floating_point_v<T>>
1325 ReduceRealMax (T& rvar) {
1326 detail::DoAllReduce<T>(&rvar,MPI_MAX,1);
1327 }
1328
1329 template <typename T>
1330 std::enable_if_t<std::is_floating_point_v<T>>
1331 ReduceRealMax (T* rvar, int cnt) {
1332 detail::DoAllReduce<T>(rvar,MPI_MAX,cnt);
1333 }
1334
1335 template <typename T>
1336 std::enable_if_t<std::is_floating_point_v<T>>
1337 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar)
1338 {
1339 int cnt = rvar.size();
1340 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1341 detail::DoAllReduce<T>(tmp.data(),MPI_MAX,cnt);
1342 for (int i = 0; i < cnt; ++i) {
1343 rvar[i].get() = tmp[i];
1344 }
1345 }
1346
1348 template <typename T>
1349 std::enable_if_t<std::is_floating_point_v<T>>
1350 ReduceRealMax (T& rvar, int cpu) {
1351 detail::DoReduce<T>(&rvar,MPI_MAX,1,cpu);
1352 }
1353
1354 template <typename T>
1355 std::enable_if_t<std::is_floating_point_v<T>>
1356 ReduceRealMax (T* rvar, int cnt, int cpu) {
1357 detail::DoReduce<T>(rvar,MPI_MAX,cnt,cpu);
1358 }
1359
1360 template <typename T>
1361 std::enable_if_t<std::is_floating_point_v<T>>
1362 ReduceRealMax (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1363 {
1364 int cnt = rvar.size();
1365 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1366 detail::DoReduce<T>(tmp.data(),MPI_MAX,cnt,cpu);
1367 for (int i = 0; i < cnt; ++i) {
1368 rvar[i].get() = tmp[i];
1369 }
1370 }
1371
1373 template <typename T>
1374 std::enable_if_t<std::is_floating_point_v<T>>
1375 ReduceRealMin (T& rvar) {
1376 detail::DoAllReduce<T>(&rvar,MPI_MIN,1);
1377 }
1378
1379 template <typename T>
1380 std::enable_if_t<std::is_floating_point_v<T>>
1381 ReduceRealMin (T* rvar, int cnt) {
1382 detail::DoAllReduce<T>(rvar,MPI_MIN,cnt);
1383 }
1384
1385 template <typename T>
1386 std::enable_if_t<std::is_floating_point_v<T>>
1387 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar)
1388 {
1389 int cnt = rvar.size();
1390 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1391 detail::DoAllReduce<T>(tmp.data(),MPI_MIN,cnt);
1392 for (int i = 0; i < cnt; ++i) {
1393 rvar[i].get() = tmp[i];
1394 }
1395 }
1396
1398 template <typename T>
1399 std::enable_if_t<std::is_floating_point_v<T>>
1400 ReduceRealMin (T& rvar, int cpu) {
1401 detail::DoReduce<T>(&rvar,MPI_MIN,1,cpu);
1402 }
1403
1404 template <typename T>
1405 std::enable_if_t<std::is_floating_point_v<T>>
1406 ReduceRealMin (T* rvar, int cnt, int cpu) {
1407 detail::DoReduce<T>(rvar,MPI_MIN,cnt,cpu);
1408 }
1409
1410 template <typename T>
1411 std::enable_if_t<std::is_floating_point_v<T>>
1412 ReduceRealMin (Vector<std::reference_wrapper<T> > const& rvar, int cpu)
1413 {
1414 int cnt = rvar.size();
1415 Vector<T> tmp{std::begin(rvar), std::end(rvar)};
1416 detail::DoReduce<T>(tmp.data(),MPI_MIN,cnt,cpu);
1417 for (int i = 0; i < cnt; ++i) {
1418 rvar[i].get() = tmp[i];
1419 }
1420 }
1421
1422#else
1423
1425 template <typename T>
1426 std::enable_if_t<std::is_floating_point_v<T>>
1427 ReduceRealSum (T& ) {}
1428
1429 template <typename T>
1430 std::enable_if_t<std::is_floating_point_v<T>>
1431 ReduceRealSum (T*, int) {}
1432
1433 template <typename T>
1434 std::enable_if_t<std::is_floating_point_v<T>>
1435 ReduceRealSum (Vector<std::reference_wrapper<T> > const&) {}
1436
1438 template <typename T>
1439 std::enable_if_t<std::is_floating_point_v<T>>
1440 ReduceRealSum (T&, int) {}
1441
1442 template <typename T>
1443 std::enable_if_t<std::is_floating_point_v<T>>
1444 ReduceRealSum (T*, int, int) {}
1445
1446 template <typename T>
1447 std::enable_if_t<std::is_floating_point_v<T>>
1448 ReduceRealSum (Vector<std::reference_wrapper<T> > const&, int) {}
1449
1451 template <typename T>
1452 std::enable_if_t<std::is_floating_point_v<T>>
1453 ReduceRealMax (T&) {}
1454
1455 template <typename T>
1456 std::enable_if_t<std::is_floating_point_v<T>>
1457 ReduceRealMax (T*, int) {}
1458
1459 template <typename T>
1460 std::enable_if_t<std::is_floating_point_v<T>>
1461 ReduceRealMax (Vector<std::reference_wrapper<T> > const&) {}
1462
1464 template <typename T>
1465 std::enable_if_t<std::is_floating_point_v<T>>
1466 ReduceRealMax (T&, int) {}
1467
1468 template <typename T>
1469 std::enable_if_t<std::is_floating_point_v<T>>
1470 ReduceRealMax (T*, int, int) {}
1471
1472 template <typename T>
1473 std::enable_if_t<std::is_floating_point_v<T>>
1474 ReduceRealMax (Vector<std::reference_wrapper<T> > const&, int) {}
1475
1477 template <typename T>
1478 std::enable_if_t<std::is_floating_point_v<T>>
1479 ReduceRealMin (T&) {}
1480
1481 template <typename T>
1482 std::enable_if_t<std::is_floating_point_v<T>>
1483 ReduceRealMin (T*, int) {}
1484
1485 template <typename T>
1486 std::enable_if_t<std::is_floating_point_v<T>>
1487 ReduceRealMin (Vector<std::reference_wrapper<T> > const&) {}
1488
1490 template <typename T>
1491 std::enable_if_t<std::is_floating_point_v<T>>
1492 ReduceRealMin (T&, int) {}
1493
1494 template <typename T>
1495 std::enable_if_t<std::is_floating_point_v<T>>
1496 ReduceRealMin (T*, int, int) {}
1497
1498 template <typename T>
1499 std::enable_if_t<std::is_floating_point_v<T>>
1500 ReduceRealMin (Vector<std::reference_wrapper<T> > const&, int) {}
1501
1502#endif
1503}
1504
1505#ifdef AMREX_USE_MPI
1506namespace ParallelDescriptor {
1507
1508 template <class T>
1509 struct Mpi_typemap<GpuComplex<T>>
1510 {
1511 static MPI_Datatype type ()
1512 {
1513 static_assert(std::is_same<T,double>() ||
1514 std::is_same<T,float >(),
1515 "Unsupported type T for GpuComplex");
1516 if constexpr (std::is_same<T,double>()) {
1517 return MPI_C_DOUBLE_COMPLEX;
1518 } else {
1519 return MPI_C_FLOAT_COMPLEX;
1520 }
1521 }
1522};
1523
1524 template<typename TV, typename TI>
1525 struct Mpi_typemap<ValLocPair<TV,TI>>
1526 {
1527 static MPI_Datatype type ()
1528 {
1529 static MPI_Datatype mpi_type = MPI_DATATYPE_NULL;
1530 if (mpi_type == MPI_DATATYPE_NULL) {
1531 using T = ValLocPair<TV,TI>;
1532 static_assert(std::is_trivially_copyable_v<T>,
1533 "To communicate with MPI, ValLocPair must be trivially copyable.");
1534 static_assert(std::is_standard_layout_v<T>,
1535 "To communicate with MPI, ValLocPair must be standard layout");
1536
1537 T vlp[2];
1538 MPI_Datatype types[] = {
1539 Mpi_typemap<TV>::type(),
1540 Mpi_typemap<TI>::type()
1541 };
1542 int blocklens[] = { 1, 1 };
1543 MPI_Aint disp[2];
1544 BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].value, &disp[0]) );
1545 BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].index, &disp[1]) );
1546 disp[1] -= disp[0];
1547 disp[0] = 0;
1548 BL_MPI_REQUIRE( MPI_Type_create_struct(2, blocklens, disp, types,
1549 &mpi_type) );
1550 MPI_Aint lb, extent;
1551 BL_MPI_REQUIRE( MPI_Type_get_extent(mpi_type, &lb, &extent) );
1552 if (extent != sizeof(T)) {
1553 MPI_Datatype tmp = mpi_type;
1554 BL_MPI_REQUIRE( MPI_Type_create_resized(tmp, 0, sizeof(vlp[0]), &mpi_type) );
1555 BL_MPI_REQUIRE( MPI_Type_free(&tmp) );
1556 }
1557 BL_MPI_REQUIRE( MPI_Type_commit( &mpi_type ) );
1558
1559 m_mpi_types.push_back(&mpi_type);
1560 }
1561 return mpi_type;
1562 }
1563};
1564
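// Editorial example (not part of the original header): Mpi_typemap<T>::type()
// supplies the MPI_Datatype matching T, so hand-written MPI calls can stay
// generic; this sketch assumes the GpuComplex specialization above.
//
//   amrex::GpuComplex<double> z(1.0, 2.0);
//   MPI_Bcast(&z, 1,
//             amrex::ParallelDescriptor::Mpi_typemap<amrex::GpuComplex<double>>::type(),
//             0, amrex::ParallelDescriptor::Communicator());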
1565 template <typename T, typename F>
1566 MPI_Op Mpi_op ()
1567 {
1568 static MPI_Op mpi_op = MPI_OP_NULL;
1569 if (mpi_op == MPI_OP_NULL) {
1570 static auto user_fn = [] (void *invec, void *inoutvec, int* len, // NOLINT
1571 MPI_Datatype * /*datatype*/)
1572 {
1573 auto in = static_cast<T const*>(invec);
1574 auto out = static_cast<T*>(inoutvec);
1575 for (int i = 0; i < *len; ++i) {
1576 out[i] = F()(in[i],out[i]);
1577 }
1578 };
1579 BL_MPI_REQUIRE( MPI_Op_create(user_fn, 1, &mpi_op) );
1580 m_mpi_ops.push_back(&mpi_op);
1581 }
1582 return mpi_op;
1583}
1584
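// Editorial example (not part of the original header): Mpi_op<T,F>() creates
// and caches a user-defined MPI_Op that applies the binary functor F
// element-wise; combined with Mpi_typemap it allows reductions over custom
// types. The MaxLoc functor and local_max below are illustrative.
//
//   using VLP = amrex::ValLocPair<amrex::Real,int>;
//   struct MaxLoc {
//       VLP operator() (VLP const& a, VLP const& b) const
//           { return (a.value >= b.value) ? a : b; }
//   };
//   VLP v{local_max, amrex::ParallelDescriptor::MyProc()};
//   MPI_Allreduce(MPI_IN_PLACE, &v, 1,
//                 amrex::ParallelDescriptor::Mpi_typemap<VLP>::type(),
//                 amrex::ParallelDescriptor::Mpi_op<VLP,MaxLoc>(),
//                 amrex::ParallelDescriptor::Communicator());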
1585}
1586#endif
1587
1588}
1589
1590#endif /*BL_PARALLELDESCRIPTOR_H*/