1#ifndef AMREX_FFT_HELPER_H_
2#define AMREX_FFT_HELPER_H_
3#include <AMReX_Config.H>
22#if defined(AMREX_USE_CUDA)
24# include <cuComplex.h>
25#elif defined(AMREX_USE_HIP)
26# if __has_include(<rocfft/rocfft.h>)
27# include <rocfft/rocfft.h>
31# include <hip/hip_complex.h>
32#elif defined(AMREX_USE_SYCL)
33# if __has_include(<oneapi/mkl/dft.hpp>)
34# include <oneapi/mkl/dft.hpp>
36# define AMREX_USE_MKL_DFTI_2024 1
37# include <oneapi/mkl/dfti.hpp>
84 int nprocs = std::numeric_limits<int>::max();
136namespace detail {
void hip_execute (rocfft_plan plan,
void **in,
void **out); }
144inline void assert_no_external_stream ()
148 "SYCL FFT does not support external GPU streams.");
151template <
typename T, Direction direction,
typename P,
typename TI,
typename TO>
152void sycl_execute (P* plan, TI* in, TO* out)
154 assert_no_external_stream();
155#ifndef AMREX_USE_MKL_DFTI_2024
156 std::int64_t workspaceSize = 0;
158 std::size_t workspaceSize = 0;
160 plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
163 plan->set_workspace(buffer);
165 if (std::is_same_v<TI,TO>) {
167 if constexpr (direction == Direction::forward) {
168 r = oneapi::mkl::dft::compute_forward(*plan, out);
170 r = oneapi::mkl::dft::compute_backward(*plan, out);
173 if constexpr (direction == Direction::forward) {
174 r = oneapi::mkl::dft::compute_forward(*plan, in, out);
176 r = oneapi::mkl::dft::compute_backward(*plan, in, out);
189#if defined(AMREX_USE_CUDA)
192 cuComplex, cuDoubleComplex>;
193#elif defined(AMREX_USE_HIP)
195 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
197#elif defined(AMREX_USE_SYCL)
198 using mkl_desc_r = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
199 ? oneapi::mkl::dft::precision::SINGLE
200 : oneapi::mkl::dft::precision::DOUBLE,
201 oneapi::mkl::dft::domain::REAL>;
202 using mkl_desc_c = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
203 ? oneapi::mkl::dft::precision::SINGLE
204 : oneapi::mkl::dft::precision::DOUBLE,
205 oneapi::mkl::dft::domain::COMPLEX>;
206 using VendorPlan = std::variant<mkl_desc_r*,mkl_desc_c*>;
209 using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
210 fftwf_plan, fftw_plan>;
211 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
212 fftwf_complex, fftw_complex>;
248#if !defined(AMREX_USE_GPU)
265 template <Direction D>
270 int rank = is_2d_transform ? 2 : 1;
285 int nr = (rank == 1) ? len[0] : len[0]*len[1];
287 int nc = (rank == 1) ? (len[0]/2+1) : (len[1]/2+1)*len[0];
288#if (AMREX_SPACEDIM == 1)
298#if defined(AMREX_USE_CUDA)
302 std::size_t work_size;
304 cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
306 (cufftMakePlanMany(
plan, rank, len,
nullptr, 1, nr,
nullptr, 1, nc, fwd_type,
howmany, &work_size));
308 cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
310 (cufftMakePlanMany(
plan, rank, len,
nullptr, 1, nc,
nullptr, 1, nr, bwd_type,
howmany, &work_size));
313#elif defined(AMREX_USE_HIP)
315 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
317 std::size_t
length[2] = {std::size_t(len[1]), std::size_t(len[0])};
319 AMREX_ROCFFT_SAFE_CALL
320 (rocfft_plan_create(&
plan, rocfft_placement_notinplace,
321 rocfft_transform_type_real_forward, prec, rank,
324 AMREX_ROCFFT_SAFE_CALL
325 (rocfft_plan_create(&
plan, rocfft_placement_notinplace,
326 rocfft_transform_type_real_inverse, prec, rank,
330#elif defined(AMREX_USE_SYCL)
334 pp =
new mkl_desc_r(len[0]);
336 pp =
new mkl_desc_r({std::int64_t(len[0]), std::int64_t(len[1])});
338#ifndef AMREX_USE_MKL_DFTI_2024
339 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
340 oneapi::mkl::dft::config_value::NOT_INPLACE);
342 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
344 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
howmany);
345 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nr);
346 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
347 std::vector<std::int64_t> strides;
348 strides.push_back(0);
349 if (rank == 2) { strides.push_back(len[1]); }
350 strides.push_back(1);
351#ifndef AMREX_USE_MKL_DFTI_2024
352 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
355 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
358 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
359 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
360 detail::assert_no_external_stream();
361 pp->commit(amrex::Gpu::Device::streamQueue());
366 if constexpr (std::is_same_v<float,T>) {
368 plan = fftwf_plan_many_dft_r2c
369 (rank, len,
howmany, pr,
nullptr, 1, nr, pc,
nullptr, 1, nc,
370 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
372 plan = fftwf_plan_many_dft_c2r
373 (rank, len,
howmany, pc,
nullptr, 1, nc, pr,
nullptr, 1, nr,
374 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
378 plan = fftw_plan_many_dft_r2c
379 (rank, len,
howmany, pr,
nullptr, 1, nr, pc,
nullptr, 1, nc,
380 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
382 plan = fftw_plan_many_dft_c2r
383 (rank, len,
howmany, pc,
nullptr, 1, nc, pr,
nullptr, 1, nr,
384 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
399 template <Direction D,
int M>
417 template <Direction D>
435#if (AMREX_SPACEDIM >= 2)
436 else if (ndims == 2) {
438#if (AMREX_SPACEDIM == 2)
446#if (AMREX_SPACEDIM == 3)
447 else if (ndims == 3) {
457#if defined(AMREX_USE_CUDA)
461 cufftType t = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
462 std::size_t work_size;
464 (cufftMakePlanMany(
plan, ndims, len,
nullptr, 1,
n,
nullptr, 1,
n, t,
howmany, &work_size));
466#elif defined(AMREX_USE_HIP)
468 auto prec = std::is_same_v<float,T> ? rocfft_precision_single
469 : rocfft_precision_double;
471 : rocfft_transform_type_complex_inverse;
475 }
else if (ndims == 2) {
483 AMREX_ROCFFT_SAFE_CALL
484 (rocfft_plan_create(&
plan, rocfft_placement_inplace, dir, prec, ndims,
487#elif defined(AMREX_USE_SYCL)
491 pp =
new mkl_desc_c(
n);
492 }
else if (ndims == 2) {
493 pp =
new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1])});
495 pp =
new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1]), std::int64_t(len[2])});
497#ifndef AMREX_USE_MKL_DFTI_2024
498 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
499 oneapi::mkl::dft::config_value::INPLACE);
501 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
503 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
howmany);
504 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
n);
505 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
n);
506 std::vector<std::int64_t> strides(ndims+1);
509 for (
int i = ndims-1; i >= 1; --i) {
510 strides[i] = strides[i+1] * len[i];
512#ifndef AMREX_USE_MKL_DFTI_2024
513 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
514 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
516 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
517 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
519 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
520 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
521 detail::assert_no_external_stream();
522 pp->commit(amrex::Gpu::Device::streamQueue());
527 if constexpr (std::is_same_v<float,T>) {
529 plan = fftwf_plan_many_dft
530 (ndims, len,
howmany, p,
nullptr, 1,
n, p,
nullptr, 1,
n, -1,
533 plan = fftwf_plan_many_dft
534 (ndims, len,
howmany, p,
nullptr, 1,
n, p,
nullptr, 1,
n, +1,
539 plan = fftw_plan_many_dft
540 (ndims, len,
howmany, p,
nullptr, 1,
n, p,
nullptr, 1,
n, -1,
543 plan = fftw_plan_many_dft
544 (ndims, len,
howmany, p,
nullptr, 1,
n, p,
nullptr, 1,
n, +1,
552 template <Direction D>
553 fftw_r2r_kind get_fftw_kind (std::pair<Boundary,Boundary>
const& bc)
573 return fftw_r2r_kind{};
585 template <Direction D>
619 template <Direction D>
620 void init_r2r (
Box const& box, T* p, std::pair<Boundary,Boundary>
const& bc,
621 int howmany_initval = 1)
625 kind = get_r2r_kind<D>(bc);
633#if defined(AMREX_USE_GPU)
653 int nc = (nex/2) + 1;
655#if defined (AMREX_USE_CUDA)
659 cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
660 std::size_t work_size;
662 (cufftMakePlanMany(
plan, 1, &nex,
nullptr, 1, nc*2,
nullptr, 1, nc, fwd_type,
howmany, &work_size));
664#elif defined(AMREX_USE_HIP)
667 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
668 const std::size_t
length = nex;
669 AMREX_ROCFFT_SAFE_CALL
670 (rocfft_plan_create(&
plan, rocfft_placement_inplace,
671 rocfft_transform_type_real_forward, prec, 1,
674#elif defined(AMREX_USE_SYCL)
676 auto*
pp =
new mkl_desc_r(nex);
677#ifndef AMREX_USE_MKL_DFTI_2024
678 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
679 oneapi::mkl::dft::config_value::INPLACE);
681 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
683 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
howmany);
684 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nc*2);
685 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
686 std::vector<std::int64_t> strides = {0,1};
687#ifndef AMREX_USE_MKL_DFTI_2024
688 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
689 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
691 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
692 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
694 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
695 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
696 detail::assert_no_external_stream();
697 pp->commit(amrex::Gpu::Device::streamQueue());
703 auto fftw_kind = get_fftw_kind<D>(bc);
704 if constexpr (std::is_same_v<float,T>) {
705 plan = fftwf_plan_many_r2r
706 (1, &
n,
howmany, p,
nullptr, 1,
n, p,
nullptr, 1,
n, &fftw_kind,
709 plan = fftw_plan_many_r2r
710 (1, &
n,
howmany, p,
nullptr, 1,
n, p,
nullptr, 1,
n, &fftw_kind,
723 template <Direction D>
725 std::pair<Boundary,Boundary>
const& bc)
731#if defined(AMREX_USE_GPU)
733 init_r2r<D>(box, p, bc, 2);
738 kind = get_r2r_kind<D>(bc);
747 auto fftw_kind = get_fftw_kind<D>(bc);
748 if constexpr (std::is_same_v<float,T>) {
749 plan = fftwf_plan_many_r2r
750 (1, &
n,
howmany, p,
nullptr, 2,
n*2, p,
nullptr, 2,
n*2, &fftw_kind,
752 plan2 = fftwf_plan_many_r2r
753 (1, &
n,
howmany, p+1,
nullptr, 2,
n*2, p+1,
nullptr, 2,
n*2, &fftw_kind,
756 plan = fftw_plan_many_r2r
757 (1, &
n,
howmany, p,
nullptr, 2,
n*2, p,
nullptr, 2,
n*2, &fftw_kind,
759 plan2 = fftw_plan_many_r2r
760 (1, &
n,
howmany, p+1,
nullptr, 2,
n*2, p+1,
nullptr, 2,
n*2, &fftw_kind,
771 template <Direction D>
782#if defined(AMREX_USE_CUDA)
785 std::size_t work_size = 0;
792 if constexpr (std::is_same_v<float,T>) {
798 if constexpr (std::is_same_v<float,T>) {
806#elif defined(AMREX_USE_HIP)
807 detail::hip_execute(
plan, (
void**)&pi, (
void**)&po);
808#elif defined(AMREX_USE_SYCL)
809 detail::sycl_execute<T,D>(std::get<0>(
plan), pi, po);
812 if constexpr (std::is_same_v<float,T>) {
823 template <Direction D>
831#if defined(AMREX_USE_CUDA)
834 std::size_t work_size = 0;
841 if constexpr (std::is_same_v<float,T>) {
848#elif defined(AMREX_USE_HIP)
849 detail::hip_execute(
plan, (
void**)&p, (
void**)&p);
850#elif defined(AMREX_USE_SYCL)
851 detail::sycl_execute<T,D>(std::get<1>(
plan), p, p);
854 if constexpr (std::is_same_v<float,T>) {
877 amrex::Abort(
"FFT: alloc_scratch_space: unsupported kind");
895 auto*
pdst = (T*) pbuf;
898 int ostride = (
n+1)*2;
906 auto batch = ielem /
Long(nex);
907 auto i =
int(ielem - batch*nex);
908 for (
int ir = 0; ir < 2; ++ir) {
909 auto* po =
pdst + (2*batch+ir)*ostride + i;
910 auto const* pi = psrc + 2*batch*istride + ir;
914 *po = sign * pi[(2*norig-1-i)*2];
921 auto batch = ielem /
Long(nex);
922 auto i =
int(ielem - batch*nex);
923 auto* po =
pdst + batch*ostride + i;
924 auto const* pi = psrc + batch*istride;
928 *po = sign * pi[2*norig-1-i];
933 int ostride = (2*
n+1)*2;
941 auto batch = ielem /
Long(nex);
942 auto i =
int(ielem - batch*nex);
943 for (
int ir = 0; ir < 2; ++ir) {
944 auto* po =
pdst + (2*batch+ir)*ostride + i;
945 auto const* pi = psrc + 2*batch*istride + ir;
948 }
else if (i < (2*norig-1)) {
949 *po = pi[(2*norig-2-i)*2];
950 }
else if (i == (2*norig-1)) {
952 }
else if (i < (3*norig)) {
953 *po = -pi[(i-2*norig)*2];
954 }
else if (i < (4*norig-1)) {
955 *po = -pi[(4*norig-2-i)*2];
964 auto batch = ielem /
Long(nex);
965 auto i =
int(ielem - batch*nex);
966 auto* po =
pdst + batch*ostride + i;
967 auto const* pi = psrc + batch*istride;
970 }
else if (i < (2*norig-1)) {
971 *po = pi[2*norig-2-i];
972 }
else if (i == (2*norig-1)) {
974 }
else if (i < (3*norig)) {
975 *po = -pi[i-2*norig];
976 }
else if (i < (4*norig-1)) {
977 *po = -pi[4*norig-2-i];
984 int ostride = (2*
n+1)*2;
992 auto batch = ielem /
Long(nex);
993 auto i =
int(ielem - batch*nex);
994 for (
int ir = 0; ir < 2; ++ir) {
995 auto* po =
pdst + (2*batch+ir)*ostride + i;
996 auto const* pi = psrc + 2*batch*istride + ir;
999 }
else if (i == norig) {
1001 }
else if (i < (2*norig+1)) {
1002 *po = -pi[(2*norig-i)*2];
1003 }
else if (i < (3*norig)) {
1004 *po = -pi[(i-2*norig)*2];
1005 }
else if (i == 3*norig) {
1008 *po = pi[(4*norig-i)*2];
1015 auto batch = ielem /
Long(nex);
1016 auto i =
int(ielem - batch*nex);
1017 auto* po =
pdst + batch*ostride + i;
1018 auto const* pi = psrc + batch*istride;
1021 }
else if (i == norig) {
1023 }
else if (i < (2*norig+1)) {
1024 *po = -pi[2*norig-i];
1025 }
else if (i < (3*norig)) {
1026 *po = -pi[i-2*norig];
1027 }
else if (i == 3*norig) {
1030 *po = pi[4*norig-i];
1035 int ostride = (2*
n+1)*2;
1043 auto batch = ielem /
Long(nex);
1044 auto i =
int(ielem - batch*nex);
1045 for (
int ir = 0; ir < 2; ++ir) {
1046 auto* po =
pdst + (2*batch+ir)*ostride + i;
1047 auto const* pi = psrc + 2*batch*istride + ir;
1050 }
else if (i < (2*norig)) {
1051 *po = -pi[(2*norig-1-i)*2];
1052 }
else if (i < (3*norig)) {
1053 *po = -pi[(i-2*norig)*2];
1055 *po = pi[(4*norig-1-i)*2];
1062 auto batch = ielem /
Long(nex);
1063 auto i =
int(ielem - batch*nex);
1064 auto* po =
pdst + batch*ostride + i;
1065 auto const* pi = psrc + batch*istride;
1068 }
else if (i < (2*norig)) {
1069 *po = -pi[2*norig-1-i];
1070 }
else if (i < (3*norig)) {
1071 *po = -pi[i-2*norig];
1073 *po = pi[4*norig-1-i];
1078 int ostride = (2*
n+1)*2;
1086 auto batch = ielem /
Long(nex);
1087 auto i =
int(ielem - batch*nex);
1088 for (
int ir = 0; ir < 2; ++ir) {
1089 auto* po =
pdst + (2*batch+ir)*ostride + i;
1090 auto const* pi = psrc + 2*batch*istride + ir;
1093 }
else if (i < (2*norig)) {
1094 *po = pi[(2*norig-1-i)*2];
1095 }
else if (i < (3*norig)) {
1096 *po = -pi[(i-2*norig)*2];
1098 *po = -pi[(4*norig-1-i)*2];
1105 auto batch = ielem /
Long(nex);
1106 auto i =
int(ielem - batch*nex);
1107 auto* po =
pdst + batch*ostride + i;
1108 auto const* pi = psrc + batch*istride;
1111 }
else if (i < (2*norig)) {
1112 *po = pi[2*norig-1-i];
1113 }
else if (i < (3*norig)) {
1114 *po = -pi[i-2*norig];
1116 *po = -pi[4*norig-1-i];
1121 amrex::Abort(
"FFT: pack_r2r_buffer: unsupported kind");
1143 auto batch = ielem /
Long(norig);
1144 auto k =
int(ielem - batch*norig);
1146 for (
int ir = 0; ir < 2; ++ir) {
1147 auto const& yk = psrc[(2*batch+ir)*istride+k+1];
1148 pdst[2*batch*ostride+ir+k*2] = s * yk.real() - c * yk.imag();
1154 auto batch = ielem /
Long(norig);
1155 auto k =
int(ielem - batch*norig);
1157 auto const& yk = psrc[batch*istride+k+1];
1158 pdst[batch*ostride+k] = s * yk.real() - c * yk.imag();
1162 int istride = 2*
n+1;
1166 auto batch = ielem /
Long(norig);
1167 auto k =
int(ielem - batch*norig);
1169 for (
int ir = 0; ir < 2; ++ir) {
1170 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1171 pdst[2*batch*ostride+ir+k*2] = T(0.5)*(s * yk.real() - c * yk.imag());
1177 auto batch = ielem /
Long(norig);
1178 auto k =
int(ielem - batch*norig);
1180 auto const& yk = psrc[batch*istride+2*k+1];
1181 pdst[batch*ostride+k] = T(0.5)*(s * yk.real() - c * yk.imag());
1189 auto batch = ielem /
Long(norig);
1190 auto k =
int(ielem - batch*norig);
1192 for (
int ir = 0; ir < 2; ++ir) {
1193 auto const& yk = psrc[(2*batch+ir)*istride+k];
1194 pdst[2*batch*ostride+ir+k*2] = c * yk.real() + s * yk.imag();
1200 auto batch = ielem /
Long(norig);
1201 auto k =
int(ielem - batch*norig);
1203 auto const& yk = psrc[batch*istride+k];
1204 pdst[batch*ostride+k] = c * yk.real() + s * yk.imag();
1208 int istride = 2*
n+1;
1212 auto batch = ielem /
Long(norig);
1213 auto k =
int(ielem - batch*norig);
1214 for (
int ir = 0; ir < 2; ++ir) {
1215 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1216 pdst[2*batch*ostride+ir+k*2] = T(0.5) * yk.real();
1222 auto batch = ielem /
Long(norig);
1223 auto k =
int(ielem - batch*norig);
1224 auto const& yk = psrc[batch*istride+2*k+1];
1225 pdst[batch*ostride+k] = T(0.5) * yk.real();
1229 int istride = 2*
n+1;
1233 auto batch = ielem /
Long(norig);
1234 auto k =
int(ielem - batch*norig);
1236 for (
int ir = 0; ir < 2; ++ir) {
1237 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1238 pdst[2*batch*ostride+ir+k*2] = T(0.5) * (c * yk.real() + s * yk.imag());
1244 auto batch = ielem /
Long(norig);
1245 auto k =
int(ielem - batch*norig);
1247 auto const& yk = psrc[batch*istride+2*k+1];
1248 pdst[batch*ostride+k] = T(0.5) * (c * yk.real() + s * yk.imag());
1252 int istride = 2*
n+1;
1256 auto batch = ielem /
Long(norig);
1257 auto k =
int(ielem - batch*norig);
1259 for (
int ir = 0; ir < 2; ++ir) {
1260 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1261 pdst[2*batch*ostride+ir+k*2] = T(0.5) * (s * yk.real() - c * yk.imag());
1267 auto batch = ielem /
Long(norig);
1268 auto k =
int(ielem - batch*norig);
1270 auto const& yk = psrc[batch*istride+2*k+1];
1271 pdst[batch*ostride+k] = T(0.5) * (s * yk.real() - c * yk.imag());
1275 amrex::Abort(
"FFT: unpack_r2r_buffer: unsupported kind");
1283 template <Direction D>
1289#if defined(AMREX_USE_GPU)
1295#if defined(AMREX_USE_CUDA)
1299 std::size_t work_size = 0;
1305 if constexpr (std::is_same_v<float,T>) {
1311#elif defined(AMREX_USE_HIP)
1312 detail::hip_execute(
plan, (
void**)&pscratch, (
void**)&pscratch);
1313#elif defined(AMREX_USE_SYCL)
1314 detail::sycl_execute<T,Direction::forward>(std::get<0>(
plan), (T*)pscratch, (
VendorComplex*)pscratch);
1321#if defined(AMREX_USE_CUDA)
1327 if constexpr (std::is_same_v<float,T>) {
1328 fftwf_execute(
plan);
1343#if defined(AMREX_USE_CUDA)
1345#elif defined(AMREX_USE_HIP)
1346 AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(
plan));
1347#elif defined(AMREX_USE_SYCL)
1348 std::visit([](
auto&& p) {
delete p; },
plan);
1350 if constexpr (std::is_same_v<float,T>) {
1351 fftwf_destroy_plan(
plan);
1353 fftw_destroy_plan(
plan);
1366 PlanD* get_vendor_plan_d (Key
const& key);
1367 PlanF* get_vendor_plan_f (Key
const& key);
1369 void add_vendor_plan_d (Key
const& key, PlanD plan);
1370 void add_vendor_plan_f (Key
const& key, PlanF plan);
1374template <
typename T>
1375template <Direction D,
int M>
1386 for (
auto s : fft_size) { n *= s; }
1389#if defined(AMREX_USE_GPU)
1390 Key key = {fft_size.template expand<3>(), ncomp, D, kind};
1393 if constexpr (std::is_same_v<float,T>) {
1394 cached_plan = detail::get_vendor_plan_f(key);
1396 cached_plan = detail::get_vendor_plan_d(key);
1399 plan = *cached_plan;
1408 for (
int i = 0; i < M; ++i) {
1409 len[i] = fft_size[M-1-i];
1412 int nc = fft_size[0]/2+1;
1413 for (
int i = 1; i < M; ++i) {
1417#if defined(AMREX_USE_CUDA)
1424 type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
1428 type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
1432 std::size_t work_size;
1434 (cufftMakePlanMany(plan, M, len,
nullptr, 1, n_in,
nullptr, 1, n_out, type, howmany, &work_size));
1436#elif defined(AMREX_USE_HIP)
1438 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
1440 for (
int idim = 0; idim < M; ++idim) {
length[idim] = fft_size[idim]; }
1442 AMREX_ROCFFT_SAFE_CALL
1443 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1444 rocfft_transform_type_real_forward, prec, M,
1445 length, howmany,
nullptr));
1447 AMREX_ROCFFT_SAFE_CALL
1448 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1449 rocfft_transform_type_real_inverse, prec, M,
1450 length, howmany,
nullptr));
1453#elif defined(AMREX_USE_SYCL)
1457 pp =
new mkl_desc_r(fft_size[0]);
1459 std::vector<std::int64_t> len64(M);
1460 for (
int idim = 0; idim < M; ++idim) {
1461 len64[idim] = len[idim];
1463 pp =
new mkl_desc_r(len64);
1465#ifndef AMREX_USE_MKL_DFTI_2024
1466 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
1467 oneapi::mkl::dft::config_value::NOT_INPLACE);
1469 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
1471 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
1472 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
1473 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
1474 std::vector<std::int64_t> strides(M+1);
1477 for (
int i = M-1; i >= 1; --i) {
1478 strides[i] = strides[i+1] * fft_size[M-1-i];
1481#ifndef AMREX_USE_MKL_DFTI_2024
1482 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
1485 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
1488 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
1489 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
1490 detail::assert_no_external_stream();
1491 pp->commit(amrex::Gpu::Device::streamQueue());
1496 if (pf ==
nullptr || pb ==
nullptr) {
1501 if constexpr (std::is_same_v<float,T>) {
1503 plan = fftwf_plan_many_dft_r2c
1504 (M, len, howmany, (
float*)pf,
nullptr, 1, n, (fftwf_complex*)pb,
nullptr, 1, nc,
1507 plan = fftwf_plan_many_dft_c2r
1508 (M, len, howmany, (fftwf_complex*)pb,
nullptr, 1, nc, (
float*)pf,
nullptr, 1, n,
1513 plan = fftw_plan_many_dft_r2c
1514 (M, len, howmany, (
double*)pf,
nullptr, 1, n, (fftw_complex*)pb,
nullptr, 1, nc,
1517 plan = fftw_plan_many_dft_c2r
1518 (M, len, howmany, (fftw_complex*)pb,
nullptr, 1, nc, (
double*)pf,
nullptr, 1, n,
1524#if defined(AMREX_USE_GPU)
1526 if constexpr (std::is_same_v<float,T>) {
1527 detail::add_vendor_plan_f(key, plan);
1529 detail::add_vendor_plan_d(key, plan);
1540 template <
typename FA>
1541 typename FA::FABType::value_type * get_fab (FA& fa)
1543 auto myproc = ParallelContext::MyProcSub();
1544 if (myproc < fa.size()) {
1545 return fa.fabPtr(myproc);
1551 template <
typename FA1,
typename FA2>
1552 std::unique_ptr<char,DataDeleter> make_mfs_share (FA1& fa1, FA2& fa2)
1554 bool not_same_fa =
true;
1555 if constexpr (std::is_same_v<FA1,FA2>) {
1556 not_same_fa = (&fa1 != &fa2);
1558 using FAB1 =
typename FA1::FABType::value_type;
1559 using FAB2 =
typename FA2::FABType::value_type;
1560 using T1 =
typename FAB1::value_type;
1561 using T2 =
typename FAB2::value_type;
1562 auto myproc = ParallelContext::MyProcSub();
1563 bool alloc_1 = (myproc < fa1.size());
1564 bool alloc_2 = (myproc < fa2.size()) && not_same_fa;
1566 if (alloc_1 && alloc_2) {
1567 Box const& box1 = fa1.fabbox(myproc);
1568 Box const& box2 = fa2.fabbox(myproc);
1569 int ncomp1 = fa1.nComp();
1570 int ncomp2 = fa2.nComp();
1572 sizeof(T2)*box2.numPts()*ncomp2));
1573 fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
1574 fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
1575 }
else if (alloc_1) {
1576 Box const& box1 = fa1.fabbox(myproc);
1577 int ncomp1 = fa1.nComp();
1579 fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
1580 }
else if (alloc_2) {
1581 Box const& box2 = fa2.fabbox(myproc);
1582 int ncomp2 = fa2.nComp();
1584 fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
1588 return std::unique_ptr<char,DataDeleter>((
char*)p, DataDeleter{
The_Arena()});
1597 [[nodiscard]]
constexpr Dim3 operator() (Dim3 i)
const noexcept
1599 return {i.y, i.x, i.z};
1602 static constexpr Dim3 Inverse (Dim3 i)
1604 return {i.y, i.x, i.z};
1607 [[nodiscard]]
constexpr IndexType operator() (IndexType it)
const noexcept
1612 static constexpr IndexType Inverse (IndexType it)
1620 [[nodiscard]]
constexpr Dim3 operator() (Dim3 i)
const noexcept
1622 return {i.z, i.y, i.x};
1625 static constexpr Dim3 Inverse (Dim3 i)
1627 return {i.z, i.y, i.x};
1630 [[nodiscard]]
constexpr IndexType operator() (IndexType it)
const noexcept
1635 static constexpr IndexType Inverse (IndexType it)
1644 [[nodiscard]]
constexpr Dim3 operator() (Dim3 i)
const noexcept
1646 return {i.y, i.z, i.x};
1650 static constexpr Dim3 Inverse (Dim3 i)
1652 return {i.z, i.x, i.y};
1655 [[nodiscard]]
constexpr IndexType operator() (IndexType it)
const noexcept
1660 static constexpr IndexType Inverse (IndexType it)
1669 [[nodiscard]]
constexpr Dim3 operator() (Dim3 i)
const noexcept
1671 return {i.z, i.x, i.y};
1675 static constexpr Dim3 Inverse (Dim3 i)
1677 return {i.y, i.z, i.x};
1680 [[nodiscard]]
constexpr IndexType operator() (IndexType it)
const noexcept
1685 static constexpr IndexType Inverse (IndexType it)
1698 explicit SubHelper (Box
const& domain);
1700 [[nodiscard]]
Box make_box (Box
const& box)
const;
1702 [[nodiscard]] Periodicity make_periodicity (Periodicity
const& period)
const;
1704 [[nodiscard]]
bool ghost_safe (IntVect
const& ng)
const;
1707 [[nodiscard]]
IntVect make_iv (IntVect
const& iv)
const;
1710 [[nodiscard]]
IntVect make_safe_ghost (IntVect
const& ng)
const;
1712 [[nodiscard]] BoxArray inverse_boxarray (BoxArray
const& ba)
const;
1714 [[nodiscard]]
IntVect inverse_order (IntVect
const& order)
const;
1716 template <
typename T>
1717 [[nodiscard]] T make_array (T
const& a)
const
1719#if (AMREX_SPACEDIM == 1)
1722#elif (AMREX_SPACEDIM == 2)
1723 if (m_case == case_1n) {
1724 return T{a[1],a[0]};
1729 if (m_case == case_11n) {
1730 return T{a[2],a[0],a[1]};
1731 }
else if (m_case == case_1n1) {
1732 return T{a[1],a[0],a[2]};
1733 }
else if (m_case == case_1nn) {
1734 return T{a[1],a[2],a[0]};
1735 }
else if (m_case == case_n1n) {
1736 return T{a[0],a[2],a[1]};
1743 [[nodiscard]] GpuArray<int,3> xyz_order ()
const;
1745 template <
typename FA>
1746 FA make_alias_mf (FA
const& mf)
1748 BoxList bl = mf.boxArray().boxList();
1749 for (
auto& b : bl) {
1752 auto const& ng = make_iv(mf.nGrowVect());
1753 FA submf(BoxArray(std::move(bl)), mf.DistributionMap(), mf.nComp(), ng, MFInfo{}.SetAlloc(
false));
1754 using FAB =
typename FA::fab_type;
1755 for (MFIter mfi(submf, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi) {
1756 submf.setFab(mfi, FAB(mfi.fabbox(), mf.nComp(), mf[mfi].dataPtr()));
1761#if (AMREX_SPACEDIM == 2)
1762 enum Case { case_1n, case_other };
1763 int m_case = case_other;
1764#elif (AMREX_SPACEDIM == 3)
1765 enum Case { case_11n, case_1n1, case_1nn, case_n1n, case_other };
1766 int m_case = case_other;
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define AMREX_ENUM(CLASS,...)
Definition AMReX_Enum.H:208
#define AMREX_CUFFT_SAFE_CALL(call)
Definition AMReX_GpuError.H:92
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
amrex::ParmParse pp
Input file parser instance for the given namespace.
Definition AMReX_HypreIJIface.cpp:15
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
#define AMREX_D_TERM(a, b, c)
Definition AMReX_SPACE.H:172
virtual void free(void *pt)=0
A pure virtual function for freeing the memory pointed to by pt, which was previously allocated from this arena.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ IntVectND< dim > length() const noexcept
Return the length of the BoxND.
Definition AMReX_Box.H:154
Calculates the distribution of FABs to MPI processes.
Definition AMReX_DistributionMapping.H:43
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:836
An Integer Vector in dim-Dimensional Space.
Definition AMReX_IntVect.H:57
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:319
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
Arena * The_Arena()
Definition AMReX_Arena.cpp:805
Definition AMReX_FFT_Helper.H:52
Direction
Definition AMReX_FFT_Helper.H:54
Boundary
Definition AMReX_FFT_Helper.H:58
DomainStrategy
Definition AMReX_FFT_Helper.H:56
Kind
Definition AMReX_FFT_Helper.H:60
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
__host__ __device__ std::pair< double, double > sincospi(double x)
Return sin(pi*x) and cos(pi*x) given x.
Definition AMReX_Math.H:204
__host__ __device__ void ignore_unused(const Ts &...)
Suppresses compiler warnings about unused variables.
Definition AMReX.H:139
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IndexTypeND< 3 > IndexType
IndexType is an alias for amrex::IndexTypeND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:36
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_FFT_Helper.H:64
bool twod_mode
Definition AMReX_FFT_Helper.H:75
Info & setNumProcs(int n)
Cap the number of MPI ranks used by FFT.
Definition AMReX_FFT_Helper.H:131
bool oned_mode
Special degenerate variant of twod_mode: nx == 1 or ny == 1 while nz > 1, still batched along z.
Definition AMReX_FFT_Helper.H:78
int batch_size
Batched FFT size. Only supported in R2C, not R2X.
Definition AMReX_FFT_Helper.H:81
Info & setDomainStrategy(DomainStrategy s)
Select how the domain is decomposed across MPI ranks.
Definition AMReX_FFT_Helper.H:92
DomainStrategy domain_strategy
Domain composition strategy.
Definition AMReX_FFT_Helper.H:66
int nprocs
Max number of processes to use.
Definition AMReX_FFT_Helper.H:84
int pencil_threshold
Definition AMReX_FFT_Helper.H:70
Info & setOneDMode(bool x)
Flag the degenerate 2-D mode (nx==1 or ny==1) that still batches along z.
Definition AMReX_FFT_Helper.H:117
Info & setBatchSize(int bsize)
Specify the batch size for FFT.
Definition AMReX_FFT_Helper.H:124
Info & setPencilThreshold(int t)
Override the slab→pencil break-even threshold for the automatic strategy.
Definition AMReX_FFT_Helper.H:99
Info & setTwoDMode(bool x)
Restrict transforms to the first two dimensions (3-D problems only).
Definition AMReX_FFT_Helper.H:106
Definition AMReX_FFT_Helper.H:188
void * pf
Definition AMReX_FFT_Helper.H:223
void unpack_r2r_buffer(T *pdst, void const *pbuf) const
Collapse the spectral R2R result back into the original real layout.
Definition AMReX_FFT_Helper.H:1131
std::conditional_t< std::is_same_v< float, T >, cuComplex, cuDoubleComplex > VendorComplex
Definition AMReX_FFT_Helper.H:192
VendorPlan plan2
Definition AMReX_FFT_Helper.H:222
void init_r2c(IntVectND< M > const &fft_size, void *pbf, void *pbb, bool cache, int ncomp=1)
Initialize an M-dimensional batched real-to-complex plan.
Definition AMReX_FFT_Helper.H:1376
int n
Definition AMReX_FFT_Helper.H:215
void destroy()
Release any vendor FFT plan objects owned by this Plan.
Definition AMReX_FFT_Helper.H:242
bool defined2
Definition AMReX_FFT_Helper.H:220
void init_r2r(Box const &box, VendorComplex *pc, std::pair< Boundary, Boundary > const &bc)
Initialize a real-to-real plan that reads/writes complex storage.
Definition AMReX_FFT_Helper.H:724
void pack_r2r_buffer(void *pbuf, T const *psrc) const
Expand the real R2R input into the symmetry-extended buffer expected by CUFFT/rocFFT.
Definition AMReX_FFT_Helper.H:893
static void free_scratch_space(void *p)
Release GPU scratch allocated via alloc_scratch_space().
Definition AMReX_FFT_Helper.H:885
static void destroy_vendor_plan(VendorPlan plan)
Helper that destroys a vendor plan of the appropriate backend type.
Definition AMReX_FFT_Helper.H:1341
Kind get_r2r_kind(std::pair< Boundary, Boundary > const &bc)
Map boundary conditions to the Plan Kind for real-to-real transforms.
Definition AMReX_FFT_Helper.H:586
cufftHandle VendorPlan
Definition AMReX_FFT_Helper.H:190
Kind kind
Definition AMReX_FFT_Helper.H:217
void init_c2c(Box const &box, VendorComplex *p, int ncomp=1, int ndims=1)
Initialize a complex-to-complex plan across 1/2/3 dimensions.
Definition AMReX_FFT_Helper.H:418
int howmany
Definition AMReX_FFT_Helper.H:216
void * pb
Definition AMReX_FFT_Helper.H:224
void init_r2c(Box const &box, T *pr, VendorComplex *pc, bool is_2d_transform=false, int ncomp=1)
Initialize a 1-D or 2-D real-to-complex plan over the supplied Box.
Definition AMReX_FFT_Helper.H:266
void compute_r2r()
Execute the real-to-real plan, including GPU packing/unpacking.
Definition AMReX_FFT_Helper.H:1284
void compute_c2c()
Execute the complex-to-complex plan in place.
Definition AMReX_FFT_Helper.H:824
bool r2r_data_is_complex
Definition AMReX_FFT_Helper.H:218
void * alloc_scratch_space() const
Allocate GPU scratch space large enough to hold packed R2R data.
Definition AMReX_FFT_Helper.H:868
VendorPlan plan
Definition AMReX_FFT_Helper.H:221
void compute_r2c()
Execute the previously initialized real-to-complex plan.
Definition AMReX_FFT_Helper.H:772
bool defined
Definition AMReX_FFT_Helper.H:219
void set_ptrs(void *p0, void *p1)
Register device pointers used by the forward/backward executions.
Definition AMReX_FFT_Helper.H:233
void init_r2r(Box const &box, T *p, std::pair< Boundary, Boundary > const &bc, int howmany_initval=1)
Initialize a real-to-real (cosine/sine) plan that operates on real buffers.
Definition AMReX_FFT_Helper.H:620
A host / device complex number type, because std::complex doesn't work in device code with Cuda yet.
Definition AMReX_GpuComplex.H:30