Block-Structured AMR Software Framework
 
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
Loading...
Searching...
No Matches
AMReX_MFParallelForG.H
Go to the documentation of this file.
1#ifndef AMREX_MF_PARALLEL_FOR_G_H_
2#define AMREX_MF_PARALLEL_FOR_G_H_
3#include <AMReX_Config.H>
4
5#ifdef AMREX_USE_GPU
6
7#include <algorithm>
8#include <cmath>
9#include <limits>
10
11namespace amrex {
12namespace detail {
13
// Build the metadata needed to launch one fused GPU kernel over many boxes:
// a prefix-sum table of per-box thread-block counts followed by an array of
// BoxIndexers, allocated once in pinned host memory and copied to the device.
//
// Outputs (all owned by the caller; see destroy_par_for_nblocks):
//   a_hp     - pinned host buffer holding both tables
//   a_dp     - device copy of the same buffer
//   blocks_x - first: host prefix-sum table; second: device table, or
//              nullptr when every box has the same number of cells
//   pboxes   - device pointer to the BoxIndexer array
// nthreads is the GPU threads-per-block used to turn cell counts into
// block counts.
inline
void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
                            Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
{
    if (!ncells.empty()) {
        const int nboxes = ncells.size();
        // Pad the int table so the BoxIndexer array that follows it in the
        // same allocation is suitably aligned.
        const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
        const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
        a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
        int* hp_blks = (int*)a_hp;
        auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
        hp_blks[0] = 0;
        bool same_size = true;
        for (int i = 0; i < nboxes; ++i) {
            // Thread blocks needed to cover box i's cells (round up).
            Long nblocks = (ncells[i] + nthreads-1) / nthreads;
            // The running prefix sum must stay representable as an int.
            AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
            hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
            same_size = same_size && (ncells[i] == ncells[0]);

            // Construct the indexer in place inside the pinned buffer.
            new (hp_boxes+i) BoxIndexer(boxes[i]);
        }

        a_dp = (char*) The_Arena()->alloc(nbytes);
        // Asynchronous copy: the pinned host buffer must stay alive until
        // the stream completes, which is why a_hp is handed to the caller.
        Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);

        blocks_x.first = hp_blks;
        // With uniform box sizes the kernel can recover the box index by a
        // simple division, so no device-side lookup table is needed.
        blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
        pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
    }
}
44
45inline
46void destroy_par_for_nblocks (char* hp, char* dp)
47{
49 The_Arena()->free(dp);
50}
51}
52
53namespace experimental::detail {
54
55namespace parfor_mf_detail {
56 template <typename F>
58 auto call_f (F const& f, int b, int i, int j, int k, int) noexcept
59 -> decltype(f(0,0,0,0))
60 {
61 f(b,i,j,k);
62 }
63
64 template <typename F>
66 auto call_f (F const& f, int b, int i, int j, int k, int n) noexcept
67 -> decltype(f(0,0,0,0,0))
68 {
69 f(b,i,j,k,n);
70 }
71}
72
73template <int MT, typename MF, typename F>
74std::enable_if_t<IsFabArray<MF>::value>
75ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, bool, F const& f)
76{
77 const auto& index_array = mf.IndexArray();
78 const int nboxes = index_array.size();
79
80 if (nboxes == 0) {
81 return;
82 } else if (nboxes == 1) {
83 Box const& b = amrex::grow(mf.box(index_array[0]), nghost);
84 amrex::ParallelFor(b, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
85 {
86 parfor_mf_detail::call_f(f, 0, i, j, k, n);
87 });
88 } else {
89 auto const& parforinfo = mf.getParForInfo(nghost,MT);
90 auto par_for_blocks = parforinfo.getBlocks();
91 const int nblocks = par_for_blocks.first[nboxes];
92 const int block_0_size = par_for_blocks.first[1];
93 const int* dp_nblocks = par_for_blocks.second;
94 const BoxIndexer* dp_boxes = parforinfo.getBoxes();
95
96#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
97
98 amrex::launch_global<MT>
99 <<<nblocks, MT, 0, Gpu::gpuStream()>>>
100 ([=] AMREX_GPU_DEVICE () noexcept
101 {
102 int ibox;
103 std::uint64_t icell;
104 if (dp_nblocks) {
105 ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
106 icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
107 } else {
108 ibox = blockIdx.x / block_0_size;
109 icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
110 }
111
112#elif defined(AMREX_USE_SYCL)
113
114 amrex::launch<MT>(nblocks, Gpu::gpuStream(),
115 [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
116 {
117 int ibox;
118 std::uint64_t icell;
119 int blockIdxx = item.get_group_linear_id();
120 int threadIdxx = item.get_local_linear_id();
121 if (dp_nblocks) {
122 ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
123 icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
124 } else {
125 ibox = blockIdxx / block_0_size;
126 icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
127 }
128#endif
129 BoxIndexer const& indexer = dp_boxes[ibox];
130 if (icell < indexer.numPts()) {
131 auto [i, j, k] = indexer(icell);
132 for (int n = 0; n < ncomp; ++n) {
133 parfor_mf_detail::call_f(f, ibox, i, j, k, n);
134 }
135 }
136 });
137 }
139}
140
// Convenience overload: forwards to the fused-launch ParallelFor above with
// the default GPU block size (AMREX_GPU_MAX_THREADS threads per block).
template <typename MF, typename F>
std::enable_if_t<IsFabArray<MF>::value>
ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const& ts, bool dynamic, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, ncomp, ts, dynamic, std::forward<F>(f));
}
147
// Convenience overload for single-component work: forwards with ncomp = 1
// and the default GPU block size.
template <typename MF, typename F>
std::enable_if_t<IsFabArray<MF>::value>
ParallelFor (MF const& mf, IntVect const& nghost, IntVect const& ts, bool dynamic, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(mf, nghost, 1, ts, dynamic, std::forward<F>(f));
}
154
155}
156
157}
158
159#endif
160#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_GPU_ERROR_CHECK()
Definition AMReX_GpuError.H:133
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
This class is a thin wrapper around std::vector. Unlike vector, Vector::operator[] provides bound che...
Definition AMReX_Vector.H:27
Long size() const noexcept
Definition AMReX_Vector.H:50
void htod_memcpy_async(void *p_d, const void *p_h, const std::size_t sz) noexcept
Definition AMReX_GpuDevice.H:251
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:218
void build_par_for_nblocks(char *&a_hp, char *&a_dp, std::pair< int *, int * > &blocks_x, BoxIndexer *&pboxes, Vector< Box > const &boxes, Vector< Long > const &ncells, int nthreads)
Definition AMReX_MFParallelForG.H:15
void destroy_par_for_nblocks(char *hp, char *dp)
Definition AMReX_MFParallelForG.H:46
AMREX_GPU_DEVICE auto call_f(F const &f, int b, int i, int j, int k, int) noexcept -> decltype(f(0, 0, 0, 0))
Definition AMReX_MFParallelForG.H:58
Definition AMReX_Amr.cpp:49
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
Definition AMReX_CTOParallelForImpl.H:191
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T bisect(T lo, T hi, F f, T tol=1e-12, int max_iter=100)
Definition AMReX_Algorithm.H:105
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE BoxND< dim > grow(const BoxND< dim > &b, int i) noexcept
Grow BoxND in all directions by given amount.
Definition AMReX_Box.H:1211
BoxIndexerND< AMREX_SPACEDIM > BoxIndexer
Definition AMReX_Box.H:2099
Arena * The_Pinned_Arena()
Definition AMReX_Arena.cpp:656
std::size_t aligned_size(std::size_t align_requirement, std::size_t size) noexcept
Given a minimum required size of size bytes, this returns the next largest arena size that will align...
Definition AMReX_Arena.H:30
Arena * The_Arena()
Definition AMReX_Arena.cpp:616
Definition AMReX_FabArrayCommI.H:896
Definition AMReX_Box.H:2027
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::uint64_t numPts() const
Definition AMReX_Box.H:2068