AMReX: Block-Structured AMR Software Framework

AMReX_BaseFabUtility.H
Go to the documentation of this file.
1#ifndef AMREX_BASEFAB_UTILITY_H_
2#define AMREX_BASEFAB_UTILITY_H_
3#include <AMReX_Config.H>
4
5#include <AMReX_BaseFab.H>
6#include <AMReX_TypeTraits.H>
7
8namespace amrex {
9
10template <class Tto, class Tfrom>
12void
13cast (BaseFab<Tto>& tofab, BaseFab<Tfrom> const& fromfab,
14 Box const& bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
15{
16 auto const& tdata = tofab.array();
17 auto const& fdata = fromfab.const_array();
18 amrex::LoopConcurrent(bx, ncomp.n, [=] (int i, int j, int k, int n) noexcept
19 {
20 tdata(i,j,k,n+dcomp.i) = static_cast<Tto>(fdata(i,j,k,n+scomp.i));
21 });
22}
23
24template <typename STRUCT, typename F,
25 std::enable_if_t<(sizeof(STRUCT)<=36*8) &&
27 std::is_trivially_destructible_v<STRUCT>,
28 int>FOO = 0>
29void fill (BaseFab<STRUCT>& aos_fab, F const& f)
30{
31 Box const& box = aos_fab.box();
32 auto const& aos = aos_fab.array();
33 using T = typename STRUCT::value_type;
34 constexpr int STRUCTSIZE = sizeof(STRUCT)/sizeof(T);
35 static_assert(sizeof(STRUCT) == sizeof(T)*STRUCTSIZE,
36 "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
37#ifdef AMREX_USE_GPU
38 if (Gpu::inLaunchRegion()) {
39 BoxIndexer indexer(box);
40 const auto ntotcells = std::uint64_t(box.numPts());
41 constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
42 std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
43 AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
44 auto nblocks = int(nblocks_long);
45 std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
46 T* p = (T*)aos_fab.dataPtr();
47#ifdef AMREX_USE_SYCL
48 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
49 [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
50 {
51 auto const icell = std::uint64_t(handler.globalIdx());
52 std::uint64_t const blockDimx = handler.blockDim();
53 std::uint64_t const threadIdxx = handler.threadIdx();
54 std::uint64_t const blockIdxx = handler.blockIdx();
55 auto const shared = (T*)handler.sharedMemory();
56 if (icell < indexer.numPts()) {
57 auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
58 auto [i, j, k] = indexer(icell);
59 f(*ga, i, j, k);
60 }
61 handler.sharedBarrier();
62 for (std::uint64_t m = threadIdxx,
63 mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
64 m < mend; m += blockDimx) {
65 p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
66 }
67 });
68#else
69 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
70 [=] AMREX_GPU_DEVICE () noexcept
71 {
72 std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
74 T* const shared = gsm.dataPtr();
75 if (icell < indexer.numPts()) {
76 auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
77 auto [i, j, k] = indexer(icell);
78 f(*ga, i, j, k);
79 }
80 __syncthreads();
81 for (std::uint64_t m = threadIdx.x,
82 mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
83 m < mend; m += blockDim.x) {
84 p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
85 }
86 });
87#endif
88 } else
89#endif
90 {
91 amrex::LoopOnCpu(box, [&] (int i, int j, int k) noexcept
92 {
93 f(aos(i,j,k), i, j, k);
94 });
95 }
96}
97
104template <typename T>
105void transposeCtoF (T const* pi, T* po, int nx, int ny, int nz)
106{
107 AMREX_ALWAYS_ASSERT(pi != po);
108
109#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
110
111 constexpr int tile_dim = 32;
112 constexpr int block_rows = 16;
113 constexpr int nthreads = tile_dim*block_rows;
114
115 // Each block has tile_dim x block_rows threads. They work on a tile_dim
116 // x tile_dim tile.
117
118 dim3 block{unsigned(tile_dim), unsigned(block_rows), 1};
119 dim3 grid{unsigned((nx+tile_dim-1)/tile_dim),
120 unsigned((nz+tile_dim-1)/tile_dim), unsigned(ny)};
121
122 AMREX_LAUNCH_KERNEL(nthreads, grid, block, 0, Gpu::gpuStream(),
123 [=] AMREX_GPU_DEVICE ()
124 {
125 __shared__ T tile[tile_dim][tile_dim+1]; // +1 to avoid bank conflicts
126
127 int k = blockIdx.y * tile_dim + threadIdx.x; // for loading z-direction
128 int i = blockIdx.x * tile_dim + threadIdx.y; // for loading x-direction
129
130 int j = blockIdx.z; // for y-direction
131
132 if (k < nz) {
133 for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
134 if (i < nx) {
135 // x z
136 tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
137 }
138 }
139 }
140
141 __syncthreads();
142
143 i = blockIdx.x * tile_dim + threadIdx.x; // for storing x-direction
144 k = blockIdx.y * tile_dim + threadIdx.y; // for storing z-direction
145
146 if (i < nx) {
147 for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
148 if (k < nz) {
149 po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
150 }
151 }
152 }
153 });
154
155#elif defined(AMREX_USE_SYCL)
156
157 constexpr int tile_dim = 32;
158 constexpr int block_rows = 8;
159
160 // Each block has tile_dim x block_rows threads. They work on a tile_dim
161 // x tile_dim tile.
162
163 sycl::range<3> block{std::size_t(1), std::size_t(block_rows), std::size_t(tile_dim)};
164 sycl::range<3> grid{std::size_t(ny), std::size_t((nz+tile_dim-1)/tile_dim),
165 std::size_t((nx+tile_dim-1)/tile_dim)};
166 sycl::range<3> global_size{grid[0]*block[0],
167 grid[1]*block[1],
168 grid[2]*block[2]};
169
170 auto& q = *(Gpu::gpuStream().queue);
171 try {
172 q.submit([&] (sycl::handler& h)
173 {
174 auto tile = sycl::local_accessor<T,2>(sycl::range<2>(tile_dim,tile_dim+1),h);
175
176 h.parallel_for(sycl::nd_range<3>(global_size, block),
177 [=] (sycl::nd_item<3> item)
178 {
179 auto group = item.get_group();
180 dim3 blockIdx{unsigned(group.get_group_id(2)),
181 unsigned(group.get_group_id(1)),
182 unsigned(group.get_group_id(0))};
183 dim3 threadIdx{unsigned(item.get_local_id(2)),
184 unsigned(item.get_local_id(1)),
185 unsigned(item.get_local_id(0))};
186
187 int k = blockIdx.y * tile_dim + threadIdx.x; // for loading z-direction
188 int i = blockIdx.x * tile_dim + threadIdx.y; // for loading x-direction
189
190 int j = blockIdx.z; // for y-direction
191
192 if (k < nz) {
193 for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
194 if (i < nx) {
195 // x z
196 tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
197 }
198 }
199 }
200
201 item.barrier(sycl::access::fence_space::local_space);
202
203 i = blockIdx.x * tile_dim + threadIdx.x; // for storing x-direction
204 k = blockIdx.y * tile_dim + threadIdx.y; // for storing z-direction
205
206 if (i < nx) {
207 for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
208 if (k < nz) {
209 po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
210 }
211 }
212 }
213 });
214 });
215 } catch (sycl::exception const& ex) {
216 amrex::Abort(std::string("transposeCtoF: ")+ex.what()+"!!!!!");
217 }
218
219#else
220
221 constexpr int bx = 32;
222 constexpr int bz = 32;
223
224 std::size_t nxy = std::size_t(nx) * ny;
225 std::size_t nyz = std::size_t(ny) * nz;
226
227#ifdef AMREX_USE_OMP
228#pragma omp parallel for collapse(3)
229#endif
230 for (int j = 0; j < ny; ++j) {
231 for (int k0 = 0; k0 < nz; k0 += bz) {
232 for (int i0 = 0; i0 < nx; i0 += bx) {
233 int imax = std::min(i0+bx, nx);
234 int kmax = std::min(k0+bz, nz);
235 auto * AMREX_RESTRICT pdst = po + j*std::size_t(nx);
236 auto const* AMREX_RESTRICT psrc = pi + j*std::size_t(nz);
237 for (int i = i0; i < imax; ++i) {
239 for (int k = k0; k < kmax; ++k) {
240 pdst[i + k*nxy] = psrc[k + i*nyz];
241 }
242 }
243 }
244 }
245 }
246
247#endif
248}
249
/**
 * \brief Transpose a 2D array from C (row-major) to Fortran (column-major)
 * ordering.
 *
 * \param pi  source array; must not alias po
 * \param po  destination array
 * \param nx  extent of the fastest-varying dimension of the destination
 * \param ny  extent of the other dimension
 */
template <typename T>
void transposeCtoF (T const* pi, T* po, int nx, int ny)
{
    // A 2D transpose is the 3D transpose with a middle extent of one.
    transposeCtoF(pi, po, nx, 1, ny);
}
261
262}
263
264#endif
#define AMREX_ASSERT(EX)
Definition AMReX_BLassert.H:38
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_PRAGMA_SIMD
Definition AMReX_Extension.H:80
#define AMREX_RESTRICT
Definition AMReX_Extension.H:37
#define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream,...)
Definition AMReX_GpuLaunch.H:35
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
#define AMREX_GPU_HOST_DEVICE
Definition AMReX_GpuQualifiers.H:20
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1090
#define AMREX_IS_TRIVIALLY_COPYABLE(T)
Definition AMReX_TypeTraits.H:10
A FortranArrayBox (FAB)-like object.
Definition AMReX_BaseFab.H:183
const Box & box() const noexcept
Returns the domain (box) where the array is defined.
Definition AMReX_BaseFab.H:287
Array4< T const > array() const noexcept
Definition AMReX_BaseFab.H:375
T * dataPtr(int n=0) noexcept
Returns a pointer to an object of type T that is the value of the Nth component associated with the c...
Definition AMReX_BaseFab.H:348
__host__ __device__ Long numPts() const noexcept
Returns the number of points contained in the BoxND.
Definition AMReX_Box.H:349
bool inLaunchRegion() noexcept
Definition AMReX_GpuControl.H:92
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:241
Definition AMReX_Amr.cpp:49
void transposeCtoF(T const *pi, T *po, int nx, int ny, int nz)
Definition AMReX_BaseFabUtility.H:105
__host__ __device__ void cast(BaseFab< Tto > &tofab, BaseFab< Tfrom > const &fromfab, Box const &bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
Definition AMReX_BaseFabUtility.H:13
void fill(BaseFab< STRUCT > &aos_fab, F const &f)
Definition AMReX_BaseFabUtility.H:29
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
void LoopOnCpu(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:355
__host__ __device__ void LoopConcurrent(Dim3 lo, Dim3 hi, F const &f) noexcept
Definition AMReX_Loop.H:150
Definition AMReX_Box.H:2045
__host__ __device__ std::uint64_t numPts() const
Definition AMReX_Box.H:2086
Definition AMReX_BaseFab.H:72
Definition AMReX_GpuTypes.H:86
Definition AMReX_GpuMemory.H:125
__device__ T * dataPtr() noexcept
Definition AMReX_GpuMemory.H:126
Definition AMReX_BaseFab.H:78
Definition AMReX_BaseFab.H:66