Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_BaseFabUtility.H
Go to the documentation of this file.
1#ifndef AMREX_BASEFAB_UTILITY_H_
2#define AMREX_BASEFAB_UTILITY_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_BaseFab.H>
6#include <AMReX_TypeTraits.H>
7
8namespace amrex {
9
10template <class Tto, class Tfrom>
12void
13cast (BaseFab<Tto>& tofab, BaseFab<Tfrom> const& fromfab,
14 Box const& bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
15{
16 auto const& tdata = tofab.array();
17 auto const& fdata = fromfab.const_array();
18 amrex::LoopConcurrent(bx, ncomp.n, [=] (int i, int j, int k, int n) noexcept
19 {
20 tdata(i,j,k,n+dcomp.i) = static_cast<Tto>(fdata(i,j,k,n+scomp.i));
21 });
22}
23
24template <typename STRUCT, typename F>
25requires ((sizeof(STRUCT)<=36*8) &&
27 std::is_trivially_destructible_v<STRUCT>)
28void fill (BaseFab<STRUCT>& aos_fab, F const& f)
29{
30 Box const& box = aos_fab.box();
31 auto const& aos = aos_fab.array();
32 using T = typename STRUCT::value_type;
33 constexpr int STRUCTSIZE = sizeof(STRUCT)/sizeof(T);
34 static_assert(sizeof(STRUCT) == sizeof(T)*STRUCTSIZE,
35 "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
36#ifdef AMREX_USE_GPU
37 if (Gpu::inLaunchRegion()) {
38 BoxIndexer indexer(box);
39 const auto ntotcells = std::uint64_t(box.numPts());
40 constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
41 std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
42 AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
43 auto nblocks = int(nblocks_long);
44 std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
45 T* p = (T*)aos_fab.dataPtr();
46#ifdef AMREX_USE_SYCL
47 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
48 [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
49 {
50 auto const icell = std::uint64_t(handler.globalIdx());
51 std::uint64_t const blockDimx = handler.blockDim();
52 std::uint64_t const threadIdxx = handler.threadIdx();
53 std::uint64_t const blockIdxx = handler.blockIdx();
54 auto const shared = (T*)handler.sharedMemory();
55 if (icell < indexer.numPts()) {
56 auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
57 auto [i, j, k] = indexer(icell);
58 f(*ga, i, j, k);
59 }
60 handler.sharedBarrier();
61 for (std::uint64_t m = threadIdxx,
62 mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
63 m < mend; m += blockDimx) {
64 p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
65 }
66 });
67#else
68 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
69 [=] AMREX_GPU_DEVICE () noexcept
70 {
71 std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
73 T* const shared = gsm.dataPtr();
74 if (icell < indexer.numPts()) {
75 auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
76 auto [i, j, k] = indexer(icell);
77 f(*ga, i, j, k);
78 }
79 __syncthreads();
80 for (std::uint64_t m = threadIdx.x,
81 mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
82 m < mend; m += blockDim.x) {
83 p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
84 }
85 });
86#endif
87 } else
88#endif
89 {
90 amrex::LoopOnCpu(box, [&] (int i, int j, int k) noexcept
91 {
92 f(aos(i,j,k), i, j, k);
93 });
94 }
95}
96
103template <typename T>
104void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
105{
106 AMREX_ALWAYS_ASSERT(pi != po);
107
108#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
109
110 constexpr int tile_dim = 32;
111 constexpr int block_rows = 16;
112 constexpr int nthreads = tile_dim*block_rows;
113
114 // Each block has tile_dim x block_rows threads. They work on a tile_dim
115 // x tile_dim tile.
116
117 dim3 block{unsigned(tile_dim), unsigned(block_rows), 1};
118 dim3 grid{unsigned((nx+tile_dim-1)/tile_dim),
119 unsigned((nz+tile_dim-1)/tile_dim), unsigned(ny)};
120
121 AMREX_LAUNCH_KERNEL(nthreads, grid, block, 0, Gpu::gpuStream(),
122 [=] AMREX_GPU_DEVICE ()
123 {
124 __shared__ T tile[tile_dim][tile_dim+1]; // +1 to avoid bank conflicts
125
126 int k = blockIdx.y * tile_dim + threadIdx.x; // for loading z-direction
127 int i = blockIdx.x * tile_dim + threadIdx.y; // for loading x-direction
128
129 int j = blockIdx.z; // for y-direction
130
131 if (k < nz) {
132 for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
133 if (i < nx) {
134 // x z
135 tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
136 }
137 }
138 }
139
140 __syncthreads();
141
142 i = blockIdx.x * tile_dim + threadIdx.x; // for storing x-direction
143 k = blockIdx.y * tile_dim + threadIdx.y; // for storing z-direction
144
145 if (i < nx) {
146 for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
147 if (k < nz) {
148 po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
149 }
150 }
151 }
152 });
153
154#elif defined(AMREX_USE_SYCL)
155
156 constexpr int tile_dim = 32;
157 constexpr int block_rows = 8;
158
159 // Each block has tile_dim x block_rows threads. They work on a tile_dim
160 // x tile_dim tile.
161
162 sycl::range<3> block{std::size_t(1), std::size_t(block_rows), std::size_t(tile_dim)};
163 sycl::range<3> grid{std::size_t(ny), std::size_t((nz+tile_dim-1)/tile_dim),
164 std::size_t((nx+tile_dim-1)/tile_dim)};
165 sycl::range<3> global_size{grid[0]*block[0],
166 grid[1]*block[1],
167 grid[2]*block[2]};
168
169 auto& q = *(Gpu::gpuStream().queue);
170 try {
171 q.submit([&] (sycl::handler& h)
172 {
173 auto tile = sycl::local_accessor<T,2>(sycl::range<2>(tile_dim,tile_dim+1),h);
174
175 h.parallel_for(sycl::nd_range<3>(global_size, block),
176 [=] (sycl::nd_item<3> item)
177 {
178 auto group = item.get_group();
179 dim3 blockIdx{unsigned(group.get_group_id(2)),
180 unsigned(group.get_group_id(1)),
181 unsigned(group.get_group_id(0))};
182 dim3 threadIdx{unsigned(item.get_local_id(2)),
183 unsigned(item.get_local_id(1)),
184 unsigned(item.get_local_id(0))};
185
186 int k = blockIdx.y * tile_dim + threadIdx.x; // for loading z-direction
187 int i = blockIdx.x * tile_dim + threadIdx.y; // for loading x-direction
188
189 int j = blockIdx.z; // for y-direction
190
191 if (k < nz) {
192 for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
193 if (i < nx) {
194 // x z
195 tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
196 }
197 }
198 }
199
200 item.barrier(sycl::access::fence_space::local_space);
201
202 i = blockIdx.x * tile_dim + threadIdx.x; // for storing x-direction
203 k = blockIdx.y * tile_dim + threadIdx.y; // for storing z-direction
204
205 if (i < nx) {
206 for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
207 if (k < nz) {
208 po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
209 }
210 }
211 }
212 });
213 });
214 } catch (sycl::exception const& ex) {
215 amrex::Abort(std::string("transposeCtoF: ")+ex.what()+"!!!!!");
216 }
217
218#else
219
220 constexpr int bx = 32;
221 constexpr int bz = 32;
222
223 std::size_t nxy = std::size_t(nx) * ny;
224 std::size_t nyz = std::size_t(ny) * nz;
225
226#ifdef AMREX_USE_OMP
227#pragma omp parallel for collapse(3)
228#endif
229 for (int j = 0; j < ny; ++j) {
230 for (int k0 = 0; k0 < nz; k0 += bz) {
231 for (int i0 = 0; i0 < nx; i0 += bx) {
232 int imax = std::min(i0+bx, nx);
233 int kmax = std::min(k0+bz, nz);
234 auto * AMREX_RESTRICT pdst = po + j*std::size_t(nx);
235 auto const* AMREX_RESTRICT psrc = pi + j*std::size_t(nz);
236 for (int i = i0; i < imax; ++i) {
238 for (int k = k0; k < kmax; ++k) {
239 pdst[i + k*nxy] = psrc[k + i*nyz];
240 }
241 }
242 }
243 }
244 }
245
246#endif
247}
248
255template <typename T>
256void transposeCtoF (T const* pi, T* po, int nx, int ny)
257{
258 transposeCtoF(pi, po, nx, 1, ny);
259}
260
261}
262
263#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_RESTRICT
Definition AMReX_Extension.H:32
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:37
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
#define AMREX_IS_TRIVIALLY_COPYABLE(T)
Definition AMReX_TypeTraits.H:10
A FortranArrayBox(FAB)-like object.
Definition AMReX_BaseFab.H:194
const Box & box() const noexcept
Returns the domain (box) where the array is defined.
Definition AMReX_BaseFab.H:299
Array4< T const > array() const noexcept
Definition AMReX_BaseFab.H:387
T * dataPtr(int n=0) noexcept
Returns a pointer to an object of type T that is the value of the Nth component associated with the c...
Definition AMReX_BaseFab.H:360
__host__ __device__ Long numPts() const noexcept
Return the number of points contained in the BoxND.
Definition AMReX_Box.H:364
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:88
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
Definition AMReX_Amr.cpp:50
void fill(BaseFab< STRUCT > &aos_fab, F const &f)
Definition AMReX_BaseFabUtility.H:28
void transposeCtoF(T const *pi, T *po, int nx, int ny, int nz)
Definition AMReX_BaseFabUtility.H:104
__host__ __device__ void cast(BaseFab< Tto > &tofab, BaseFab< Tfrom > const &fromfab, Box const &bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
Definition AMReX_BaseFabUtility.H:13
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
const int[]
Definition AMReX_BLProfiler.cpp:1664
void LoopOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:365
__host__ __device__ void LoopConcurrent(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:152
Definition AMReX_Box.H:2170
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2211
Definition AMReX_BaseFab.H:77
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuMemory.H:126
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:127
Definition AMReX_BaseFab.H:83
Definition AMReX_BaseFab.H:71