30 Box const& box = aos_fab.
box();
31 auto const& aos = aos_fab.
array();
32 using T =
typename STRUCT::value_type;
33 constexpr int STRUCTSIZE =
sizeof(STRUCT)/
sizeof(T);
34 static_assert(
sizeof(STRUCT) ==
sizeof(T)*STRUCTSIZE,
35 "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
39 const auto ntotcells = std::uint64_t(box.
numPts());
40 constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
41 std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
42 AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
43 auto nblocks =
int(nblocks_long);
44 std::size_t shared_mem_bytes = nthreads_per_block *
sizeof(STRUCT);
47 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes,
Gpu::gpuStream(),
50 auto const icell = std::uint64_t(handler.globalIdx());
51 std::uint64_t
const blockDimx = handler.blockDim();
52 std::uint64_t
const threadIdxx = handler.threadIdx();
53 std::uint64_t
const blockIdxx = handler.blockIdx();
54 auto const shared = (T*)handler.sharedMemory();
55 if (icell < indexer.
numPts()) {
56 auto ga =
new(shared+threadIdxx*STRUCTSIZE) STRUCT;
57 auto [i, j, k] = indexer(icell);
60 handler.sharedBarrier();
61 for (std::uint64_t m = threadIdxx,
62 mend = amrex::min<std::uint64_t>(blockDimx, indexer.
numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
63 m < mend; m += blockDimx) {
64 p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
68 amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes,
Gpu::gpuStream(),
71 std::uint64_t
const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
73 T*
const shared = gsm.
dataPtr();
74 if (icell < indexer.
numPts()) {
75 auto ga =
new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
76 auto [i, j, k] = indexer(icell);
80 for (std::uint64_t m = threadIdx.x,
81 mend = amrex::min<std::uint64_t>(blockDim.x, indexer.
numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
82 m < mend; m += blockDim.x) {
83 p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
92 f(aos(i,j,k), i, j, k);
108#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
110 constexpr int tile_dim = 32;
111 constexpr int block_rows = 16;
112 constexpr int nthreads = tile_dim*block_rows;
117 dim3 block{unsigned(tile_dim), unsigned(block_rows), 1};
118 dim3 grid{unsigned((nx+tile_dim-1)/tile_dim),
119 unsigned((nz+tile_dim-1)/tile_dim), unsigned(ny)};
124 __shared__ T tile[tile_dim][tile_dim+1];
126 int k = blockIdx.y * tile_dim + threadIdx.x;
127 int i = blockIdx.x * tile_dim + threadIdx.y;
132 for (
int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
135 tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
142 i = blockIdx.x * tile_dim + threadIdx.x;
143 k = blockIdx.y * tile_dim + threadIdx.y;
146 for (
int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
148 po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
154#elif defined(AMREX_USE_SYCL)
156 constexpr int tile_dim = 32;
157 constexpr int block_rows = 8;
162 sycl::range<3> block{std::size_t(1), std::size_t(block_rows), std::size_t(tile_dim)};
163 sycl::range<3> grid{std::size_t(ny), std::size_t((nz+tile_dim-1)/tile_dim),
164 std::size_t((nx+tile_dim-1)/tile_dim)};
165 sycl::range<3> global_size{grid[0]*block[0],
171 q.submit([&] (sycl::handler& h)
173 auto tile = sycl::local_accessor<T,2>(sycl::range<2>(tile_dim,tile_dim+1),h);
175 h.parallel_for(sycl::nd_range<3>(global_size, block),
176 [=] (sycl::nd_item<3> item)
178 auto group = item.get_group();
179 dim3 blockIdx{unsigned(group.get_group_id(2)),
180 unsigned(group.get_group_id(1)),
181 unsigned(group.get_group_id(0))};
182 dim3 threadIdx{unsigned(item.get_local_id(2)),
183 unsigned(item.get_local_id(1)),
184 unsigned(item.get_local_id(0))};
186 int k = blockIdx.y * tile_dim + threadIdx.x;
187 int i = blockIdx.x * tile_dim + threadIdx.y;
192 for (
int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
195 tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
200 item.barrier(sycl::access::fence_space::local_space);
202 i = blockIdx.x * tile_dim + threadIdx.x;
203 k = blockIdx.y * tile_dim + threadIdx.y;
206 for (
int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
208 po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
214 }
catch (sycl::exception
const& ex) {
215 amrex::Abort(std::string(
"transposeCtoF: ")+ex.what()+
"!!!!!");
220 constexpr int bx = 32;
221 constexpr int bz = 32;
223 std::size_t nxy = std::size_t(nx) * ny;
224 std::size_t nyz = std::size_t(ny) * nz;
227#pragma omp parallel for collapse(3)
229 for (
int j = 0; j < ny; ++j) {
230 for (
int k0 = 0; k0 < nz; k0 += bz) {
231 for (
int i0 = 0; i0 < nx; i0 += bx) {
232 int imax = std::min(i0+bx, nx);
233 int kmax = std::min(k0+bz, nz);
236 for (
int i = i0; i < imax; ++i) {
238 for (
int k = k0; k < kmax; ++k) {
239 pdst[i + k*nxy] = psrc[k + i*nyz];
__host__ __device__ void cast(BaseFab< Tto > &tofab, BaseFab< Tfrom > const &fromfab, Box const &bx, SrcComp scomp, DestComp dcomp, NumComps ncomp) noexcept
Definition AMReX_BaseFabUtility.H:13