Box const& box = aos_fab.box();
auto const& aos = aos_fab.array();
using T = typename STRUCT::value_type;
constexpr int STRUCTSIZE = sizeof(STRUCT)/sizeof(T);
static_assert(sizeof(STRUCT) == sizeof(T)*STRUCTSIZE,
              "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
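// ...
// GPU branch: the launch geometry below assigns one GPU thread to each cell
// of the box, with enough shared memory for every thread in a block to stage
// one STRUCT.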
const auto ntotcells = std::uint64_t(box.numPts());
constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
auto nblocks = int(nblocks_long);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
// ...
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
// ...
auto const icell = std::uint64_t(handler.globalIdx());
std::uint64_t const blockDimx = handler.blockDim();
std::uint64_t const threadIdxx = handler.threadIdx();
std::uint64_t const blockIdxx = handler.blockIdx();
auto const shared = (T*)handler.sharedMemory();
if (icell < indexer.numPts()) {
    auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
    auto [i, j, k] = indexer(icell);
    // ...
}
handler.sharedBarrier();
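// After the barrier, the whole thread block copies its shared-memory staging
// buffer to global memory element by element (in units of T), so consecutive
// threads write consecutive elements rather than strided whole STRUCTs.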
for (std::uint64_t m = threadIdxx,
     mend = amrex::min<std::uint64_t>(blockDimx,
                indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
     m < mend; m += blockDimx) {
    p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
}
// ...
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
// ...
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
// ...
T* const shared = gsm.dataPtr();
if (icell < indexer.numPts()) {
    auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
    auto [i, j, k] = indexer(icell);
    // ...
}
// ...
for (std::uint64_t m = threadIdx.x,
     mend = amrex::min<std::uint64_t>(blockDim.x,
                indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
     m < mend; m += blockDim.x) {
    p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
}
// CPU fallback: apply f directly to each cell of the array-of-structs.
f(aos(i,j,k), i, j, k);
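// --- Usage sketch (not part of the header above) ---
// amrex::fill invokes the user callback once per cell of the FAB's box and
// stores the resulting STRUCT at that cell. CellData below is a made-up
// example type; any sufficiently small, trivially copyable struct with a
// value_type typedef works.
#include <AMReX_BaseFabUtility.H>

struct CellData {
    using value_type = amrex::Real;
    amrex::Real rho, vx, vy, vz;
};

void init_cell_data (amrex::BaseFab<CellData>& fab)
{
    amrex::fill(fab, [=] AMREX_GPU_HOST_DEVICE (CellData& c, int i, int j, int k)
    {
        c.rho = amrex::Real(i + j + k); // any per-cell initialization
        c.vx = c.vy = c.vz = amrex::Real(0.0);
    });
}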
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
// ...
constexpr int tile_dim = 32;
constexpr int block_rows = 16;
constexpr int nthreads = tile_dim*block_rows;
// ...
dim3 block{unsigned(tile_dim), unsigned(block_rows), 1};
dim3 grid{unsigned((nx+tile_dim-1)/tile_dim),
          unsigned((nz+tile_dim-1)/tile_dim), unsigned(ny)};
// ...
__shared__ T tile[tile_dim][tile_dim+1];
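// The extra column (tile_dim+1 instead of tile_dim) pads the shared-memory
// tile so that the column-wise accesses during write-back do not all map to
// the same shared-memory bank.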
int k = blockIdx.y * tile_dim + threadIdx.x;
int i = blockIdx.x * tile_dim + threadIdx.y;
// ...
for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
    // ...
    tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
}
// ...
i = blockIdx.x * tile_dim + threadIdx.x;
k = blockIdx.y * tile_dim + threadIdx.y;
// ...
for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
    // ...
    po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
}
#elif defined(AMREX_USE_SYCL)
// ...
constexpr int tile_dim = 32;
constexpr int block_rows = 8;
// ...
sycl::range<3> block{std::size_t(1), std::size_t(block_rows), std::size_t(tile_dim)};
sycl::range<3> grid{std::size_t(ny), std::size_t((nz+tile_dim-1)/tile_dim),
                    std::size_t((nx+tile_dim-1)/tile_dim)};
sycl::range<3> global_size{grid[0]*block[0], grid[1]*block[1], grid[2]*block[2]};
// ...
q.submit([&] (sycl::handler& h)
{
    auto tile = sycl::local_accessor<T,2>(sycl::range<2>(tile_dim,tile_dim+1),h);
    // ...
    h.parallel_for(sycl::nd_range<3>(global_size, block),
                   [=] (sycl::nd_item<3> item)
    {
        auto group = item.get_group();
        dim3 blockIdx{unsigned(group.get_group_id(2)),
                      unsigned(group.get_group_id(1)),
                      unsigned(group.get_group_id(0))};
        dim3 threadIdx{unsigned(item.get_local_id(2)),
                       unsigned(item.get_local_id(1)),
                       unsigned(item.get_local_id(0))};
        // ...
        int k = blockIdx.y * tile_dim + threadIdx.x;
        int i = blockIdx.x * tile_dim + threadIdx.y;
        // ...
        for (int it = 0; it < tile_dim; it += block_rows, i += block_rows) {
            // ...
            tile[threadIdx.y+it][threadIdx.x] = pi[k + (j+i*std::size_t(ny))*nz];
        }
        // ...
        item.barrier(sycl::access::fence_space::local_space);
        i = blockIdx.x * tile_dim + threadIdx.x;
        k = blockIdx.y * tile_dim + threadIdx.y;
        // ...
        for (int it = 0; it < tile_dim; it += block_rows, k += block_rows) {
            // ...
            po[i + (j+k*std::size_t(ny))*nx] = tile[threadIdx.x][threadIdx.y+it];
        }
    // ...
} catch (sycl::exception const& ex) {
    amrex::Abort(std::string("transposeCtoF: ")+ex.what()+"!!!!!");
}
constexpr int bx = 32;
constexpr int bz = 32;
// ...
std::size_t nxy = std::size_t(nx) * ny;
std::size_t nyz = std::size_t(ny) * nz;
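// The i/k loops below are blocked into bx-by-bz tiles so that both the
// strided reads from psrc and the strided writes to pdst stay cache-resident
// within a tile.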
#pragma omp parallel for collapse(3)
for (int j = 0; j < ny; ++j) {
    for (int k0 = 0; k0 < nz; k0 += bz) {
        for (int i0 = 0; i0 < nx; i0 += bx) {
            int imax = std::min(i0+bx, nx);
            int kmax = std::min(k0+bz, nz);
            // ...
            for (int i = i0; i < imax; ++i) {
                // ...
                for (int k = k0; k < kmax; ++k) {
                    pdst[i + k*nxy] = psrc[k + i*nyz];
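// --- Reference check (not part of the header above) ---
// A naive single-threaded version of the same C-to-Fortran transpose, useful
// for validating the tiled kernels. Index convention as in the listing: the
// source is laid out as src[k + (j + i*ny)*nz] (x varying slowest) and the
// destination as dst[i + (j + k*ny)*nx] (x varying fastest). The function
// name is ours, not part of AMReX.
#include <cstddef>

template <typename T>
void transpose_ctof_reference (T* dst, T const* src, int nx, int ny, int nz)
{
    for (int k = 0; k < nz; ++k) {
        for (int j = 0; j < ny; ++j) {
            for (int i = 0; i < nx; ++i) {
                dst[i + (j + std::size_t(k)*ny)*nx] = src[k + (j + std::size_t(i)*ny)*nz];
            }
        }
    }
}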