Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_FFT_Helper.H
Go to the documentation of this file.
1#ifndef AMREX_FFT_HELPER_H_
2#define AMREX_FFT_HELPER_H_
3#include <AMReX_Config.H>
4
5#include <AMReX.H>
6#include <AMReX_BLProfiler.H>
9#include <AMReX_Enum.H>
10#include <AMReX_FabArray.H>
11#include <AMReX_Gpu.H>
12#include <AMReX_GpuComplex.H>
13#include <AMReX_Math.H>
14#include <AMReX_Periodicity.H>
15
22#if defined(AMREX_USE_CUDA)
23# include <cufft.h>
24# include <cuComplex.h>
25#elif defined(AMREX_USE_HIP)
26# if __has_include(<rocfft/rocfft.h>) // ROCm 5.3+
27# include <rocfft/rocfft.h>
28# else
29# include <rocfft.h>
30# endif
31# include <hip/hip_complex.h>
32#elif defined(AMREX_USE_SYCL)
33# if __has_include(<oneapi/mkl/dft.hpp>) // oneAPI 2025.0
34# include <oneapi/mkl/dft.hpp>
35#else
36# define AMREX_USE_MKL_DFTI_2024 1
37# include <oneapi/mkl/dfti.hpp>
38# endif
39#else
40# include <fftw3.h>
41#endif
42
43#include <algorithm>
44#include <complex>
45#include <limits>
46#include <memory>
47#include <tuple>
48#include <utility>
49#include <variant>
50
51namespace amrex::FFT
52{
53
//! Transform direction selector used throughout this header.
enum struct Direction { forward, backward, both, none };
55
57
59
62
//! User-facing options controlling how FFT objects are set up.
//! Setters return *this so calls can be chained.
// NOTE(review): the Doxygen extraction appears to have dropped several lines
// in this struct (the member doc comments and the declaration of
// `pencil_threshold`, which is referenced by setPencilThreshold below) --
// confirm against the repository source.
struct Info
{

    // Presumably enables a 2D transform mode; toggled by setTwoDMode. TODO confirm.
    bool twod_mode = false;

    // Presumably enables a 1D transform mode; toggled by setOneDMode. TODO confirm.
    bool oned_mode = false;

    //! Number of transforms batched together (set via setBatchSize).
    int batch_size = 1;

    //! Upper bound on the number of processes to use (set via setNumProcs).
    int nprocs = std::numeric_limits<int>::max();

    Info& setPencilThreshold (int t) { pencil_threshold = t; return *this; }
    Info& setTwoDMode (bool x) { twod_mode = x; return *this; }
    Info& setOneDMode (bool x) { oned_mode = x; return *this; }
    Info& setBatchSize (int bsize) { batch_size = bsize; return *this; }
    Info& setNumProcs (int n) { nprocs = n; return *this; }
};
133
134#ifdef AMREX_USE_HIP
//! Helper that executes a rocFFT plan on the given input/output pointers; implementation lives elsewhere.
namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
138#endif
139
140#ifdef AMREX_USE_SYCL
142namespace detail
143{
144template <typename T, Direction direction, typename P, typename TI, typename TO>
145void sycl_execute (P* plan, TI* in, TO* out)
146{
147#ifndef AMREX_USE_MKL_DFTI_2024
148 std::int64_t workspaceSize = 0;
149#else
150 std::size_t workspaceSize = 0;
151#endif
152 plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
153 &workspaceSize);
154 auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize);
155 plan->set_workspace(buffer);
156 sycl::event r;
157 if (std::is_same_v<TI,TO>) {
159 if constexpr (direction == Direction::forward) {
160 r = oneapi::mkl::dft::compute_forward(*plan, out);
161 } else {
162 r = oneapi::mkl::dft::compute_backward(*plan, out);
163 }
164 } else {
165 if constexpr (direction == Direction::forward) {
166 r = oneapi::mkl::dft::compute_forward(*plan, in, out);
167 } else {
168 r = oneapi::mkl::dft::compute_backward(*plan, in, out);
169 }
170 }
171 r.wait();
172 amrex::The_Arena()->free(buffer);
173}
174}
176#endif
177
178template <typename T>
179struct Plan
180{
181#if defined(AMREX_USE_CUDA)
182 using VendorPlan = cufftHandle;
183 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
184 cuComplex, cuDoubleComplex>;
185#elif defined(AMREX_USE_HIP)
186 using VendorPlan = rocfft_plan;
187 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
188 float2, double2>;
189#elif defined(AMREX_USE_SYCL)
190 using mkl_desc_r = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
191 ? oneapi::mkl::dft::precision::SINGLE
192 : oneapi::mkl::dft::precision::DOUBLE,
193 oneapi::mkl::dft::domain::REAL>;
194 using mkl_desc_c = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
195 ? oneapi::mkl::dft::precision::SINGLE
196 : oneapi::mkl::dft::precision::DOUBLE,
197 oneapi::mkl::dft::domain::COMPLEX>;
198 using VendorPlan = std::variant<mkl_desc_r*,mkl_desc_c*>;
199 using VendorComplex = std::complex<T>;
200#else
201 using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
202 fftwf_plan, fftw_plan>;
203 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
204 fftwf_complex, fftw_complex>;
205#endif
206
207 int n = 0;
208 int howmany = 0;
211 bool defined = false;
212 bool defined2 = false;
215 void* pf = nullptr;
216 void* pb = nullptr;
217
218#ifdef AMREX_USE_GPU
    //! Set the forward (pf) and backward (pb) data pointers used by the
    //! compute_* member functions.
    void set_ptrs (void* p0, void* p1) {
        pf = p0;
        pb = p1;
    }
229#endif
230
    //! Release the vendor plan(s) if they were created.
    // NOTE(review): the Doxygen extraction appears to have dropped the lines
    // inside these if-blocks that actually destroy the vendor plans (plan and,
    // in the CPU/FFTW case, plan2) -- confirm against the repository source.
    void destroy ()
    {
        if (defined) {
            defined = false;
        }
#if !defined(AMREX_USE_GPU)
        // plan2 only exists for the FFTW (non-GPU) backend.
        if (defined2) {
            defined2 = false;
        }
#endif
    }
247
    /*
     * Create a batched real-to-complex (forward) or complex-to-real
     * (backward) vendor plan over the fastest one (or, when is_2d_transform
     * is true, two) dimension(s) of box.
     *
     * pr: real-space data; pc: spectral (complex) data; ncomp multiplies the
     * batch count. The remaining box dimensions form the batch (howmany).
     *
     * NOTE(review): the Doxygen extraction appears to have dropped a few
     * lines in this function (e.g. a `kind = ...` assignment near the top and
     * the AMREX_CUFFT_SAFE_CALL macro lines that wrap cufftMakePlanMany) --
     * confirm against the repository source.
     */
    template <Direction D>
    void init_r2c (Box const& box, T* pr, VendorComplex* pc, bool is_2d_transform = false, int ncomp = 1)
    {
        static_assert(D == Direction::forward || D == Direction::backward);

        int rank = is_2d_transform ? 2 : 1;

        defined = true;
        pf = (void*)pr;
        pb = (void*)pc;

        int len[2] = {};
        if (rank == 1) {
            len[0] = box.length(0);
            len[1] = box.length(0); // Not used except for HIP. Yes it's `(0)`.
        } else {
            len[0] = box.length(1); // Most FFT libraries assume row-major ordering
            len[1] = box.length(0); // except for rocfft
        }
        // nr: real elements per transform; nc: complex elements per transform
        // (Hermitian symmetry halves the fastest dimension).
        int nr = (rank == 1) ? len[0] : len[0]*len[1];
        n = nr;
        int nc = (rank == 1) ? (len[0]/2+1) : (len[1]/2+1)*len[0];
#if (AMREX_SPACEDIM == 1)
        howmany = 1;
#else
        howmany = (rank == 1) ? AMREX_D_TERM(1, *box.length(1), *box.length(2))
                              : AMREX_D_TERM(1, *1 , *box.length(2));
#endif
        howmany *= ncomp;

#if defined(AMREX_USE_CUDA)

        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
        std::size_t work_size;
        if constexpr (D == Direction::forward) {
            cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
            // NOTE(review): the AMREX_CUFFT_SAFE_CALL wrapper line appears to
            // have been lost in extraction -- confirm upstream.
            (cufftMakePlanMany(plan, rank, len, nullptr, 1, nr, nullptr, 1, nc, fwd_type, howmany, &work_size));
        } else {
            cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
            // NOTE(review): the AMREX_CUFFT_SAFE_CALL wrapper line appears to
            // have been lost in extraction -- confirm upstream.
            (cufftMakePlanMany(plan, rank, len, nullptr, 1, nc, nullptr, 1, nr, bwd_type, howmany, &work_size));
        }

#elif defined(AMREX_USE_HIP)

        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
        // switch to column-major ordering
        std::size_t length[2] = {std::size_t(len[1]), std::size_t(len[0])};
        if constexpr (D == Direction::forward) {
            AMREX_ROCFFT_SAFE_CALL
                (rocfft_plan_create(&plan, rocfft_placement_notinplace,
                                    rocfft_transform_type_real_forward, prec, rank,
                                    length, howmany, nullptr));
        } else {
            AMREX_ROCFFT_SAFE_CALL
                (rocfft_plan_create(&plan, rocfft_placement_notinplace,
                                    rocfft_transform_type_real_inverse, prec, rank,
                                    length, howmany, nullptr));
        }

#elif defined(AMREX_USE_SYCL)

        mkl_desc_r* pp;
        if (rank == 1) {
            pp = new mkl_desc_r(len[0]);
        } else {
            pp = new mkl_desc_r({std::int64_t(len[0]), std::int64_t(len[1])});
        }
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
                      oneapi::mkl::dft::config_value::NOT_INPLACE);
#else
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
#endif
        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nr);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
        std::vector<std::int64_t> strides;
        strides.push_back(0);
        if (rank == 2) { strides.push_back(len[1]); }
        strides.push_back(1);
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
        // Do not set BWD_STRIDES
#else
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
        // Do not set BWD_STRIDES
#endif
        // Workspace is allocated per-execution in detail::sycl_execute.
        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
        pp->commit(amrex::Gpu::Device::streamQueue());
        plan = pp;

#else /* FFTW */

        if constexpr (std::is_same_v<float,T>) {
            if constexpr (D == Direction::forward) {
                plan = fftwf_plan_many_dft_r2c
                    (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            } else {
                plan = fftwf_plan_many_dft_c2r
                    (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            }
        } else {
            if constexpr (D == Direction::forward) {
                plan = fftw_plan_many_dft_r2c
                    (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            } else {
                plan = fftw_plan_many_dft_c2r
                    (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            }
        }
#endif
    }
380
    //! Create an r2c plan for a transform of size fft_size; pbf/pbb are
    //! presumably the forward/backward buffers -- TODO confirm at the
    //! definition site (not visible in this chunk), as are the semantics of
    //! `cache`.
    template <Direction D, int M>
    void init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache, int ncomp = 1);
392
    /*
     * Create a batched in-place complex-to-complex vendor plan over the
     * fastest ndims dimensions of box (1, 2, or 3). p points to the
     * interleaved complex data; ncomp multiplies the batch count.
     *
     * NOTE(review): the Doxygen extraction appears to have dropped a couple
     * of lines here (e.g. a `kind = ...` assignment near the top and the
     * AMREX_CUFFT_SAFE_CALL wrapper before cufftMakePlanMany) -- confirm
     * against the repository source.
     */
    template <Direction D>
    void init_c2c (Box const& box, VendorComplex* p, int ncomp = 1, int ndims = 1)
    {
        static_assert(D == Direction::forward || D == Direction::backward);

        defined = true;
        pf = (void*)p;
        pb = (void*)p;

        int len[3] = {};

        if (ndims == 1) {
            n = box.length(0);
            howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
            howmany *= ncomp;
            len[0] = box.length(0);
        }
#if (AMREX_SPACEDIM >= 2)
        else if (ndims == 2) {
            n = box.length(0) * box.length(1);
#if (AMREX_SPACEDIM == 2)
            howmany = ncomp;
#else
            howmany = box.length(2) * ncomp;
#endif
            // row-major order for the FFT library: slowest dimension first
            len[0] = box.length(1);
            len[1] = box.length(0);
        }
#if (AMREX_SPACEDIM == 3)
        else if (ndims == 3) {
            n = box.length(0) * box.length(1) * box.length(2);
            howmany = ncomp;
            len[0] = box.length(2);
            len[1] = box.length(1);
            len[2] = box.length(0);
        }
#endif
#endif

#if defined(AMREX_USE_CUDA)
        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));

        cufftType t = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
        std::size_t work_size;
        // NOTE(review): the AMREX_CUFFT_SAFE_CALL wrapper line appears to
        // have been lost in extraction -- confirm upstream.
        (cufftMakePlanMany(plan, ndims, len, nullptr, 1, n, nullptr, 1, n, t, howmany, &work_size));

#elif defined(AMREX_USE_HIP)

        auto prec = std::is_same_v<float,T> ? rocfft_precision_single
                                            : rocfft_precision_double;
        auto dir= (D == Direction::forward) ? rocfft_transform_type_complex_forward
                                            : rocfft_transform_type_complex_inverse;
        // rocfft expects column-major lengths: fastest dimension first
        std::size_t length[3];
        if (ndims == 1) {
            length[0] = len[0];
        } else if (ndims == 2) {
            length[0] = len[1];
            length[1] = len[0];
        } else {
            length[0] = len[2];
            length[1] = len[1];
            length[2] = len[0];
        }
        AMREX_ROCFFT_SAFE_CALL
            (rocfft_plan_create(&plan, rocfft_placement_inplace, dir, prec, ndims,
                                length, howmany, nullptr));

#elif defined(AMREX_USE_SYCL)

        mkl_desc_c* pp;
        if (ndims == 1) {
            pp = new mkl_desc_c(n);
        } else if (ndims == 2) {
            pp = new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1])});
        } else {
            pp = new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1]), std::int64_t(len[2])});
        }
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
                      oneapi::mkl::dft::config_value::INPLACE);
#else
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
#endif
        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
        // strides[0] is the offset; row-major strides for the data dimensions
        std::vector<std::int64_t> strides(ndims+1);
        strides[0] = 0;
        strides[ndims] = 1;
        for (int i = ndims-1; i >= 1; --i) {
            strides[i] = strides[i+1] * len[i];
        }
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
#else
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
#endif
        // Workspace is allocated per-execution in detail::sycl_execute.
        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
        pp->commit(amrex::Gpu::Device::streamQueue());
        plan = pp;

#else /* FFTW */

        if constexpr (std::is_same_v<float,T>) {
            if constexpr (D == Direction::forward) {
                plan = fftwf_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, -1,
                     FFTW_ESTIMATE);
            } else {
                plan = fftwf_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, +1,
                     FFTW_ESTIMATE);
            }
        } else {
            if constexpr (D == Direction::forward) {
                plan = fftw_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, -1,
                     FFTW_ESTIMATE);
            } else {
                plan = fftw_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, +1,
                     FFTW_ESTIMATE);
            }
        }
#endif
    }
540
541#ifndef AMREX_USE_GPU
542 template <Direction D>
543 fftw_r2r_kind get_fftw_kind (std::pair<Boundary,Boundary> const& bc)
544 {
545 if (bc.first == Boundary::even && bc.second == Boundary::even)
546 {
547 return (D == Direction::forward) ? FFTW_REDFT10 : FFTW_REDFT01;
548 }
549 else if (bc.first == Boundary::even && bc.second == Boundary::odd)
550 {
551 return FFTW_REDFT11;
552 }
553 else if (bc.first == Boundary::odd && bc.second == Boundary::even)
554 {
555 return FFTW_RODFT11;
556 }
557 else if (bc.first == Boundary::odd && bc.second == Boundary::odd)
558 {
559 return (D == Direction::forward) ? FFTW_RODFT10 : FFTW_RODFT01;
560 }
561 else {
562 amrex::Abort("FFT: unsupported BC");
563 return fftw_r2r_kind{};
564 }
565
566 }
567#endif
568
575 template <Direction D>
576 Kind get_r2r_kind (std::pair<Boundary,Boundary> const& bc)
577 {
578 if (bc.first == Boundary::even && bc.second == Boundary::even)
579 {
581 }
582 else if (bc.first == Boundary::even && bc.second == Boundary::odd)
583 {
584 return Kind::r2r_eo;
585 }
586 else if (bc.first == Boundary::odd && bc.second == Boundary::even)
587 {
588 return Kind::r2r_oe;
589 }
590 else if (bc.first == Boundary::odd && bc.second == Boundary::odd)
591 {
593 }
594 else {
595 amrex::Abort("FFT: unsupported BC");
596 return Kind::none;
597 }
598
599 }
600
609 template <Direction D>
610 void init_r2r (Box const& box, T* p, std::pair<Boundary,Boundary> const& bc,
611 int howmany_initval = 1)
612 {
613 static_assert(D == Direction::forward || D == Direction::backward);
614
615 kind = get_r2r_kind<D>(bc);
616 defined = true;
617 pf = (void*)p;
618 pb = (void*)p;
619
620 n = box.length(0);
621 howmany = AMREX_D_TERM(howmany_initval, *box.length(1), *box.length(2));
622
623#if defined(AMREX_USE_GPU)
624 int nex=0;
625 if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
626 Direction::forward == D) {
627 nex = 2*n;
628 } else if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
629 Direction::backward == D) {
630 nex = 4*n;
631 } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
632 Direction::forward == D) {
633 nex = 2*n;
634 } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
635 Direction::backward == D) {
636 nex = 4*n;
637 } else if ((bc.first == Boundary::even && bc.second == Boundary::odd) ||
638 (bc.first == Boundary::odd && bc.second == Boundary::even)) {
639 nex = 4*n;
640 } else {
641 amrex::Abort("FFT: unsupported BC");
642 }
643 int nc = (nex/2) + 1;
644
645#if defined (AMREX_USE_CUDA)
646
647 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
648 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
649 cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
650 std::size_t work_size;
652 (cufftMakePlanMany(plan, 1, &nex, nullptr, 1, nc*2, nullptr, 1, nc, fwd_type, howmany, &work_size));
653
654#elif defined(AMREX_USE_HIP)
655
657 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
658 const std::size_t length = nex;
659 AMREX_ROCFFT_SAFE_CALL
660 (rocfft_plan_create(&plan, rocfft_placement_inplace,
661 rocfft_transform_type_real_forward, prec, 1,
662 &length, howmany, nullptr));
663
664#elif defined(AMREX_USE_SYCL)
665
666 auto* pp = new mkl_desc_r(nex);
667#ifndef AMREX_USE_MKL_DFTI_2024
668 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
669 oneapi::mkl::dft::config_value::INPLACE);
670#else
671 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
672#endif
673 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
674 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nc*2);
675 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
676 std::vector<std::int64_t> strides = {0,1};
677#ifndef AMREX_USE_MKL_DFTI_2024
678 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
679 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
680#else
681 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
682 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
683#endif
684 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
685 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
686 pp->commit(amrex::Gpu::Device::streamQueue());
687 plan = pp;
688
689#endif
690
691#else /* FFTW */
692 auto fftw_kind = get_fftw_kind<D>(bc);
693 if constexpr (std::is_same_v<float,T>) {
694 plan = fftwf_plan_many_r2r
695 (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
696 FFTW_ESTIMATE);
697 } else {
698 plan = fftw_plan_many_r2r
699 (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
700 FFTW_ESTIMATE);
701 }
702#endif
703 }
704
712 template <Direction D>
713 void init_r2r (Box const& box, VendorComplex* pc,
714 std::pair<Boundary,Boundary> const& bc)
715 {
716 static_assert(D == Direction::forward || D == Direction::backward);
717
718 auto* p = (T*)pc;
719
720#if defined(AMREX_USE_GPU)
721
722 init_r2r<D>(box, p, bc, 2);
723 r2r_data_is_complex = true;
724
725#else
726
727 kind = get_r2r_kind<D>(bc);
728 defined = true;
729 pf = (void*)p;
730 pb = (void*)p;
731
732 n = box.length(0);
733 howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
734
735 defined2 = true;
736 auto fftw_kind = get_fftw_kind<D>(bc);
737 if constexpr (std::is_same_v<float,T>) {
738 plan = fftwf_plan_many_r2r
739 (1, &n, howmany, p, nullptr, 2, n*2, p, nullptr, 2, n*2, &fftw_kind,
740 FFTW_ESTIMATE);
741 plan2 = fftwf_plan_many_r2r
742 (1, &n, howmany, p+1, nullptr, 2, n*2, p+1, nullptr, 2, n*2, &fftw_kind,
743 FFTW_ESTIMATE);
744 } else {
745 plan = fftw_plan_many_r2r
746 (1, &n, howmany, p, nullptr, 2, n*2, p, nullptr, 2, n*2, &fftw_kind,
747 FFTW_ESTIMATE);
748 plan2 = fftw_plan_many_r2r
749 (1, &n, howmany, p+1, nullptr, 2, n*2, p+1, nullptr, 2, n*2, &fftw_kind,
750 FFTW_ESTIMATE);
751 }
752#endif
753 }
754
760 template <Direction D>
762 {
763 static_assert(D == Direction::forward || D == Direction::backward);
764 if (!defined) { return; }
765
766 using TI = std::conditional_t<(D == Direction::forward), T, VendorComplex>;
767 using TO = std::conditional_t<(D == Direction::backward), T, VendorComplex>;
768 auto* pi = (TI*)((D == Direction::forward) ? pf : pb);
769 auto* po = (TO*)((D == Direction::forward) ? pb : pf);
770
771#if defined(AMREX_USE_CUDA)
772 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
773
774 std::size_t work_size = 0;
775 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
776
777 auto* work_area = The_Arena()->alloc(work_size);
778 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
779
780 if constexpr (D == Direction::forward) {
781 if constexpr (std::is_same_v<float,T>) {
782 AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, pi, po));
783 } else {
784 AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, pi, po));
785 }
786 } else {
787 if constexpr (std::is_same_v<float,T>) {
788 AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan, pi, po));
789 } else {
790 AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan, pi, po));
791 }
792 }
794 The_Arena()->free(work_area);
795#elif defined(AMREX_USE_HIP)
796 detail::hip_execute(plan, (void**)&pi, (void**)&po);
797#elif defined(AMREX_USE_SYCL)
798 detail::sycl_execute<T,D>(std::get<0>(plan), pi, po);
799#else
801 if constexpr (std::is_same_v<float,T>) {
802 fftwf_execute(plan);
803 } else {
804 fftw_execute(plan);
805 }
806#endif
807 }
808
812 template <Direction D>
814 {
815 static_assert(D == Direction::forward || D == Direction::backward);
816 if (!defined) { return; }
817
818 auto* p = (VendorComplex*)pf;
819
820#if defined(AMREX_USE_CUDA)
821 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
822
823 std::size_t work_size = 0;
824 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
825
826 auto* work_area = The_Arena()->alloc(work_size);
827 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
828
829 auto dir = (D == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
830 if constexpr (std::is_same_v<float,T>) {
831 AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan, p, p, dir));
832 } else {
833 AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan, p, p, dir));
834 }
836 The_Arena()->free(work_area);
837#elif defined(AMREX_USE_HIP)
838 detail::hip_execute(plan, (void**)&p, (void**)&p);
839#elif defined(AMREX_USE_SYCL)
840 detail::sycl_execute<T,D>(std::get<1>(plan), p, p);
841#else
843 if constexpr (std::is_same_v<float,T>) {
844 fftwf_execute(plan);
845 } else {
846 fftw_execute(plan);
847 }
848#endif
849 }
850
851#ifdef AMREX_USE_GPU
857 [[nodiscard]] void* alloc_scratch_space () const
858 {
859 int nc = 0;
860 if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {
861 nc = n + 1;
862 } else if (kind == Kind::r2r_oo_b || kind == Kind::r2r_ee_b ||
864 nc = 2*n+1;
865 } else {
866 amrex::Abort("FFT: alloc_scratch_space: unsupported kind");
867 }
868 return The_Arena()->alloc(sizeof(GpuComplex<T>)*nc*howmany);
869 }
870
    //! Free memory obtained from alloc_scratch_space.
    static void free_scratch_space (void* p) { The_Arena()->free(p); }
875
882 void pack_r2r_buffer (void* pbuf, T const* psrc) const
883 {
884 auto* pdst = (T*) pbuf;
885 if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {
886 T sign = (kind == Kind::r2r_oo_f) ? T(-1) : T(1);
887 int ostride = (n+1)*2;
888 int istride = n;
889 int nex = 2*n;
890 int norig = n;
891 Long nelems = Long(nex)*howmany;
893 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
894 {
895 auto batch = ielem / Long(nex);
896 auto i = int(ielem - batch*nex);
897 for (int ir = 0; ir < 2; ++ir) {
898 auto* po = pdst + (2*batch+ir)*ostride + i;
899 auto const* pi = psrc + 2*batch*istride + ir;
900 if (i < norig) {
901 *po = pi[i*2];
902 } else {
903 *po = sign * pi[(2*norig-1-i)*2];
904 }
905 }
906 });
907 } else {
908 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
909 {
910 auto batch = ielem / Long(nex);
911 auto i = int(ielem - batch*nex);
912 auto* po = pdst + batch*ostride + i;
913 auto const* pi = psrc + batch*istride;
914 if (i < norig) {
915 *po = pi[i];
916 } else {
917 *po = sign * pi[2*norig-1-i];
918 }
919 });
920 }
921 } else if (kind == Kind::r2r_oo_b) {
922 int ostride = (2*n+1)*2;
923 int istride = n;
924 int nex = 4*n;
925 int norig = n;
926 Long nelems = Long(nex)*howmany;
928 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
929 {
930 auto batch = ielem / Long(nex);
931 auto i = int(ielem - batch*nex);
932 for (int ir = 0; ir < 2; ++ir) {
933 auto* po = pdst + (2*batch+ir)*ostride + i;
934 auto const* pi = psrc + 2*batch*istride + ir;
935 if (i < norig) {
936 *po = pi[i*2];
937 } else if (i < (2*norig-1)) {
938 *po = pi[(2*norig-2-i)*2];
939 } else if (i == (2*norig-1)) {
940 *po = T(0);
941 } else if (i < (3*norig)) {
942 *po = -pi[(i-2*norig)*2];
943 } else if (i < (4*norig-1)) {
944 *po = -pi[(4*norig-2-i)*2];
945 } else {
946 *po = T(0);
947 }
948 }
949 });
950 } else {
951 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
952 {
953 auto batch = ielem / Long(nex);
954 auto i = int(ielem - batch*nex);
955 auto* po = pdst + batch*ostride + i;
956 auto const* pi = psrc + batch*istride;
957 if (i < norig) {
958 *po = pi[i];
959 } else if (i < (2*norig-1)) {
960 *po = pi[2*norig-2-i];
961 } else if (i == (2*norig-1)) {
962 *po = T(0);
963 } else if (i < (3*norig)) {
964 *po = -pi[i-2*norig];
965 } else if (i < (4*norig-1)) {
966 *po = -pi[4*norig-2-i];
967 } else {
968 *po = T(0);
969 }
970 });
971 }
972 } else if (kind == Kind::r2r_ee_b) {
973 int ostride = (2*n+1)*2;
974 int istride = n;
975 int nex = 4*n;
976 int norig = n;
977 Long nelems = Long(nex)*howmany;
979 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
980 {
981 auto batch = ielem / Long(nex);
982 auto i = int(ielem - batch*nex);
983 for (int ir = 0; ir < 2; ++ir) {
984 auto* po = pdst + (2*batch+ir)*ostride + i;
985 auto const* pi = psrc + 2*batch*istride + ir;
986 if (i < norig) {
987 *po = pi[i*2];
988 } else if (i == norig) {
989 *po = T(0);
990 } else if (i < (2*norig+1)) {
991 *po = -pi[(2*norig-i)*2];
992 } else if (i < (3*norig)) {
993 *po = -pi[(i-2*norig)*2];
994 } else if (i == 3*norig) {
995 *po = T(0);
996 } else {
997 *po = pi[(4*norig-i)*2];
998 }
999 }
1000 });
1001 } else {
1002 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1003 {
1004 auto batch = ielem / Long(nex);
1005 auto i = int(ielem - batch*nex);
1006 auto* po = pdst + batch*ostride + i;
1007 auto const* pi = psrc + batch*istride;
1008 if (i < norig) {
1009 *po = pi[i];
1010 } else if (i == norig) {
1011 *po = T(0);
1012 } else if (i < (2*norig+1)) {
1013 *po = -pi[2*norig-i];
1014 } else if (i < (3*norig)) {
1015 *po = -pi[i-2*norig];
1016 } else if (i == 3*norig) {
1017 *po = T(0);
1018 } else {
1019 *po = pi[4*norig-i];
1020 }
1021 });
1022 }
1023 } else if (kind == Kind::r2r_eo) {
1024 int ostride = (2*n+1)*2;
1025 int istride = n;
1026 int nex = 4*n;
1027 int norig = n;
1028 Long nelems = Long(nex)*howmany;
1029 if (r2r_data_is_complex) {
1030 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1031 {
1032 auto batch = ielem / Long(nex);
1033 auto i = int(ielem - batch*nex);
1034 for (int ir = 0; ir < 2; ++ir) {
1035 auto* po = pdst + (2*batch+ir)*ostride + i;
1036 auto const* pi = psrc + 2*batch*istride + ir;
1037 if (i < norig) {
1038 *po = pi[i*2];
1039 } else if (i < (2*norig)) {
1040 *po = -pi[(2*norig-1-i)*2];
1041 } else if (i < (3*norig)) {
1042 *po = -pi[(i-2*norig)*2];
1043 } else {
1044 *po = pi[(4*norig-1-i)*2];
1045 }
1046 }
1047 });
1048 } else {
1049 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1050 {
1051 auto batch = ielem / Long(nex);
1052 auto i = int(ielem - batch*nex);
1053 auto* po = pdst + batch*ostride + i;
1054 auto const* pi = psrc + batch*istride;
1055 if (i < norig) {
1056 *po = pi[i];
1057 } else if (i < (2*norig)) {
1058 *po = -pi[2*norig-1-i];
1059 } else if (i < (3*norig)) {
1060 *po = -pi[i-2*norig];
1061 } else {
1062 *po = pi[4*norig-1-i];
1063 }
1064 });
1065 }
1066 } else if (kind == Kind::r2r_oe) {
1067 int ostride = (2*n+1)*2;
1068 int istride = n;
1069 int nex = 4*n;
1070 int norig = n;
1071 Long nelems = Long(nex)*howmany;
1072 if (r2r_data_is_complex) {
1073 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1074 {
1075 auto batch = ielem / Long(nex);
1076 auto i = int(ielem - batch*nex);
1077 for (int ir = 0; ir < 2; ++ir) {
1078 auto* po = pdst + (2*batch+ir)*ostride + i;
1079 auto const* pi = psrc + 2*batch*istride + ir;
1080 if (i < norig) {
1081 *po = pi[i*2];
1082 } else if (i < (2*norig)) {
1083 *po = pi[(2*norig-1-i)*2];
1084 } else if (i < (3*norig)) {
1085 *po = -pi[(i-2*norig)*2];
1086 } else {
1087 *po = -pi[(4*norig-1-i)*2];
1088 }
1089 }
1090 });
1091 } else {
1092 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1093 {
1094 auto batch = ielem / Long(nex);
1095 auto i = int(ielem - batch*nex);
1096 auto* po = pdst + batch*ostride + i;
1097 auto const* pi = psrc + batch*istride;
1098 if (i < norig) {
1099 *po = pi[i];
1100 } else if (i < (2*norig)) {
1101 *po = pi[2*norig-1-i];
1102 } else if (i < (3*norig)) {
1103 *po = -pi[i-2*norig];
1104 } else {
1105 *po = -pi[4*norig-1-i];
1106 }
1107 });
1108 }
1109 } else {
1110 amrex::Abort("FFT: pack_r2r_buffer: unsupported kind");
1111 }
1112 }
1113
    /*
     * Extract the DCT/DST result from the spectral scratch buffer pbuf
     * (complex coefficients of the extended-sequence r2c FFT) into pdst.
     * Each kind applies its own twiddle (sincospi) and coefficient selection
     * to recover the requested real-to-real transform -- presumably the
     * standard reduction of DCT/DST variants to a real FFT; confirm against
     * the algorithm reference if modifying. When r2r_data_is_complex, the
     * real and imaginary batches are re-interleaved into pdst.
     */
    void unpack_r2r_buffer (T* pdst, void const* pbuf) const
    {
        auto const* psrc = (GpuComplex<T> const*) pbuf;
        int norig = n;
        Long nelems = Long(norig)*howmany;
        int ostride = n;

        if (kind == Kind::r2r_oo_f) {
            // spectral length n+1 per batch (extended length 2n)
            int istride = n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+k+1];
                        pdst[2*batch*ostride+ir+k*2] = s * yk.real() - c * yk.imag();
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
                    auto const& yk = psrc[batch*istride+k+1];
                    pdst[batch*ostride+k] = s * yk.real() - c * yk.imag();
                });
            }
        } else if (kind == Kind::r2r_oo_b) {
            // spectral length 2n+1 per batch (extended length 4n); odd
            // coefficients only, halved
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5)*(s * yk.real() - c * yk.imag());
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5)*(s * yk.real() - c * yk.imag());
                });
            }
        } else if (kind == Kind::r2r_ee_f) {
            int istride = n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k)/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+k];
                        pdst[2*batch*ostride+ir+k*2] = c * yk.real() + s * yk.imag();
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k)/T(2*norig));
                    auto const& yk = psrc[batch*istride+k];
                    pdst[batch*ostride+k] = c * yk.real() + s * yk.imag();
                });
            }
        } else if (kind == Kind::r2r_ee_b) {
            // no twiddle needed: odd coefficients are purely real here
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * yk.real();
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5) * yk.real();
                });
            }
        } else if (kind == Kind::r2r_eo) {
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * (c * yk.real() + s * yk.imag());
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5) * (c * yk.real() + s * yk.imag());
                });
            }
        } else if (kind == Kind::r2r_oe) {
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * (s * yk.real() - c * yk.imag());
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5) * (s * yk.real() - c * yk.imag());
                });
            }
        } else {
            amrex::Abort("FFT: unpack_r2r_buffer: unsupported kind");
        }
    }
1267#endif
1268
1272 template <Direction D>
1274 {
1275 static_assert(D == Direction::forward || D == Direction::backward);
1276 if (!defined) { return; }
1277
1278#if defined(AMREX_USE_GPU)
1279
1280 auto* pscratch = alloc_scratch_space();
1281
1282 pack_r2r_buffer(pscratch, (T*)((D == Direction::forward) ? pf : pb));
1283
1284#if defined(AMREX_USE_CUDA)
1285
1286 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
1287
1288 std::size_t work_size = 0;
1289 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
1290
1291 auto* work_area = The_Arena()->alloc(work_size);
1292 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
1293
1294 if constexpr (std::is_same_v<float,T>) {
1295 AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, (T*)pscratch, (VendorComplex*)pscratch));
1296 } else {
1297 AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, (T*)pscratch, (VendorComplex*)pscratch));
1298 }
1299
1300#elif defined(AMREX_USE_HIP)
1301 detail::hip_execute(plan, (void**)&pscratch, (void**)&pscratch);
1302#elif defined(AMREX_USE_SYCL)
1303 detail::sycl_execute<T,Direction::forward>(std::get<0>(plan), (T*)pscratch, (VendorComplex*)pscratch);
1304#endif
1305
1306 unpack_r2r_buffer((T*)((D == Direction::forward) ? pb : pf), pscratch);
1307
1309 free_scratch_space(pscratch);
1310#if defined(AMREX_USE_CUDA)
1311 The_Arena()->free(work_area);
1312#endif
1313
1314#else /* FFTW */
1315
1316 if constexpr (std::is_same_v<float,T>) {
1317 fftwf_execute(plan);
1318 if (defined2) { fftwf_execute(plan2); }
1319 } else {
1320 fftw_execute(plan);
1321 if (defined2) { fftw_execute(plan2); }
1322 }
1323
1324#endif
1325 }
1326
1331 {
1332#if defined(AMREX_USE_CUDA)
1333 AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan));
1334#elif defined(AMREX_USE_HIP)
1335 AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan));
1336#elif defined(AMREX_USE_SYCL)
1337 std::visit([](auto&& p) { delete p; }, plan);
1338#else
1339 if constexpr (std::is_same_v<float,T>) {
1340 fftwf_destroy_plan(plan);
1341 } else {
1342 fftw_destroy_plan(plan);
1343 }
1344#endif
1345 }
1346};
1347
1349
//! Cache key for vendor plans: (size padded to 3-D, ncomp, direction, kind).
using Key = std::tuple<IntVectND<3>,int,Direction,Kind>;
using PlanD = typename Plan<double>::VendorPlan; //!< double-precision vendor plan
using PlanF = typename Plan<float>::VendorPlan;  //!< single-precision vendor plan

namespace detail {
    //! Look up a cached vendor plan; a null return means no plan is cached.
    PlanD* get_vendor_plan_d (Key const& key);
    PlanF* get_vendor_plan_f (Key const& key);

    //! Store a vendor plan in the cache under the given key.
    void add_vendor_plan_d (Key const& key, PlanD plan);
    void add_vendor_plan_f (Key const& key, PlanF plan);
}
1362
1363template <typename T>
1364template <Direction D, int M>
1365void Plan<T>::init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache, int ncomp)
1366{
1367 static_assert(D == Direction::forward || D == Direction::backward);
1368
1369 kind = (D == Direction::forward) ? Kind::r2c_f : Kind::r2c_b;
1370 defined = true;
1371 pf = pbf;
1372 pb = pbb;
1373
1374 n = 1;
1375 for (auto s : fft_size) { n *= s; }
1376 howmany = ncomp;
1377
1378#if defined(AMREX_USE_GPU)
1379 Key key = {fft_size.template expand<3>(), ncomp, D, kind};
1380 if (cache) {
1381 VendorPlan* cached_plan = nullptr;
1382 if constexpr (std::is_same_v<float,T>) {
1383 cached_plan = detail::get_vendor_plan_f(key);
1384 } else {
1385 cached_plan = detail::get_vendor_plan_d(key);
1386 }
1387 if (cached_plan) {
1388 plan = *cached_plan;
1389 return;
1390 }
1391 }
1392#else
1393 amrex::ignore_unused(cache);
1394#endif
1395
1396 int len[M];
1397 for (int i = 0; i < M; ++i) {
1398 len[i] = fft_size[M-1-i];
1399 }
1400
1401 int nc = fft_size[0]/2+1;
1402 for (int i = 1; i < M; ++i) {
1403 nc *= fft_size[i];
1404 }
1405
1406#if defined(AMREX_USE_CUDA)
1407
1408 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
1409 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
1410 cufftType type;
1411 int n_in, n_out;
1412 if constexpr (D == Direction::forward) {
1413 type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
1414 n_in = n;
1415 n_out = nc;
1416 } else {
1417 type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
1418 n_in = nc;
1419 n_out = n;
1420 }
1421 std::size_t work_size;
1423 (cufftMakePlanMany(plan, M, len, nullptr, 1, n_in, nullptr, 1, n_out, type, howmany, &work_size));
1424
1425#elif defined(AMREX_USE_HIP)
1426
1427 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
1428 std::size_t length[M];
1429 for (int idim = 0; idim < M; ++idim) { length[idim] = fft_size[idim]; }
1430 if constexpr (D == Direction::forward) {
1431 AMREX_ROCFFT_SAFE_CALL
1432 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1433 rocfft_transform_type_real_forward, prec, M,
1434 length, howmany, nullptr));
1435 } else {
1436 AMREX_ROCFFT_SAFE_CALL
1437 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1438 rocfft_transform_type_real_inverse, prec, M,
1439 length, howmany, nullptr));
1440 }
1441
1442#elif defined(AMREX_USE_SYCL)
1443
1444 mkl_desc_r* pp;
1445 if (M == 1) {
1446 pp = new mkl_desc_r(fft_size[0]);
1447 } else {
1448 std::vector<std::int64_t> len64(M);
1449 for (int idim = 0; idim < M; ++idim) {
1450 len64[idim] = len[idim];
1451 }
1452 pp = new mkl_desc_r(len64);
1453 }
1454#ifndef AMREX_USE_MKL_DFTI_2024
1455 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
1456 oneapi::mkl::dft::config_value::NOT_INPLACE);
1457#else
1458 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
1459#endif
1460 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
1461 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
1462 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
1463 std::vector<std::int64_t> strides(M+1);
1464 strides[0] = 0;
1465 strides[M] = 1;
1466 for (int i = M-1; i >= 1; --i) {
1467 strides[i] = strides[i+1] * fft_size[M-1-i];
1468 }
1469
1470#ifndef AMREX_USE_MKL_DFTI_2024
1471 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
1472 // Do not set BWD_STRIDES
1473#else
1474 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
1475 // Do not set BWD_STRIDES
1476#endif
1477 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
1478 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
1479 pp->commit(amrex::Gpu::Device::streamQueue());
1480 plan = pp;
1481
1482#else /* FFTW */
1483
1484 if (pf == nullptr || pb == nullptr) {
1485 defined = false;
1486 return;
1487 }
1488
1489 if constexpr (std::is_same_v<float,T>) {
1490 if constexpr (D == Direction::forward) {
1491 plan = fftwf_plan_many_dft_r2c
1492 (M, len, howmany, (float*)pf, nullptr, 1, n, (fftwf_complex*)pb, nullptr, 1, nc,
1493 FFTW_ESTIMATE);
1494 } else {
1495 plan = fftwf_plan_many_dft_c2r
1496 (M, len, howmany, (fftwf_complex*)pb, nullptr, 1, nc, (float*)pf, nullptr, 1, n,
1497 FFTW_ESTIMATE);
1498 }
1499 } else {
1500 if constexpr (D == Direction::forward) {
1501 plan = fftw_plan_many_dft_r2c
1502 (M, len, howmany, (double*)pf, nullptr, 1, n, (fftw_complex*)pb, nullptr, 1, nc,
1503 FFTW_ESTIMATE);
1504 } else {
1505 plan = fftw_plan_many_dft_c2r
1506 (M, len, howmany, (fftw_complex*)pb, nullptr, 1, nc, (double*)pf, nullptr, 1, n,
1507 FFTW_ESTIMATE);
1508 }
1509 }
1510#endif
1511
1512#if defined(AMREX_USE_GPU)
1513 if (cache) {
1514 if constexpr (std::is_same_v<float,T>) {
1515 detail::add_vendor_plan_f(key, plan);
1516 } else {
1517 detail::add_vendor_plan_d(key, plan);
1518 }
1519 }
1520#endif
1521}
1522
1524namespace detail
1525{
1526 DistributionMapping make_iota_distromap (Long n);
1527
1528 template <typename FA>
1529 typename FA::FABType::value_type * get_fab (FA& fa)
1530 {
1531 auto myproc = ParallelContext::MyProcSub();
1532 if (myproc < fa.size()) {
1533 return fa.fabPtr(myproc);
1534 } else {
1535 return nullptr;
1536 }
1537 }
1538
    /**
     * Make the local FABs of two FabArrays share one allocation.
     *
     * A single buffer, sized to the larger of the two local data footprints
     * (in bytes), is allocated from The_Arena() and installed as the FAB of
     * both fa1 and fa2 on this rank, so they alias the same memory.
     *
     * \return owning pointer to the raw allocation (freed via The_Arena()),
     *         or nullptr when this rank owns no box in either FabArray.
     */
    template <typename FA1, typename FA2>
    std::unique_ptr<char,DataDeleter> make_mfs_share (FA1& fa1, FA2& fa2)
    {
        // If fa1 and fa2 are the same object, only one FAB must be installed.
        bool not_same_fa = true;
        if constexpr (std::is_same_v<FA1,FA2>) {
            not_same_fa = (&fa1 != &fa2);
        }
        using FAB1 = typename FA1::FABType::value_type;
        using FAB2 = typename FA2::FABType::value_type;
        using T1 = typename FAB1::value_type;
        using T2 = typename FAB2::value_type;
        auto myproc = ParallelContext::MyProcSub();
        // A rank participates in an FA only if its index is below fa.size().
        bool alloc_1 = (myproc < fa1.size());
        bool alloc_2 = (myproc < fa2.size()) && not_same_fa;
        void* p = nullptr;
        if (alloc_1 && alloc_2) {
            Box const& box1 = fa1.fabbox(myproc);
            Box const& box2 = fa2.fabbox(myproc);
            int ncomp1 = fa1.nComp();
            int ncomp2 = fa2.nComp();
            // One allocation large enough for either FabArray's local data.
            p = The_Arena()->alloc(std::max(sizeof(T1)*box1.numPts()*ncomp1,
                                            sizeof(T2)*box2.numPts()*ncomp2));
            fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
            fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
        } else if (alloc_1) {
            Box const& box1 = fa1.fabbox(myproc);
            int ncomp1 = fa1.nComp();
            p = The_Arena()->alloc(sizeof(T1)*box1.numPts()*ncomp1);
            fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
        } else if (alloc_2) {
            Box const& box2 = fa2.fabbox(myproc);
            int ncomp2 = fa2.nComp();
            p = The_Arena()->alloc(sizeof(T2)*box2.numPts()*ncomp2);
            fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
        } else {
            // This rank holds no data in either FabArray.
            return nullptr;
        }
        return std::unique_ptr<char,DataDeleter>((char*)p, DataDeleter{The_Arena()});
    }
1578}
1580
1582
1583struct Swap01
1584{
1585 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1586 {
1587 return {i.y, i.x, i.z};
1588 }
1589
1590 static constexpr Dim3 Inverse (Dim3 i)
1591 {
1592 return {i.y, i.x, i.z};
1593 }
1594
1595 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1596 {
1597 return it;
1598 }
1599
1600 static constexpr IndexType Inverse (IndexType it)
1601 {
1602 return it;
1603 }
1604};
1605
1606struct Swap02
1607{
1608 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1609 {
1610 return {i.z, i.y, i.x};
1611 }
1612
1613 static constexpr Dim3 Inverse (Dim3 i)
1614 {
1615 return {i.z, i.y, i.x};
1616 }
1617
1618 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1619 {
1620 return it;
1621 }
1622
1623 static constexpr IndexType Inverse (IndexType it)
1624 {
1625 return it;
1626 }
1627};
1628
1629struct RotateFwd
1630{
1631 // dest -> src: (x,y,z) -> (y,z,x)
1632 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1633 {
1634 return {i.y, i.z, i.x};
1635 }
1636
1637 // src -> dest: (x,y,z) -> (z,x,y)
1638 static constexpr Dim3 Inverse (Dim3 i)
1639 {
1640 return {i.z, i.x, i.y};
1641 }
1642
1643 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1644 {
1645 return it;
1646 }
1647
1648 static constexpr IndexType Inverse (IndexType it)
1649 {
1650 return it;
1651 }
1652};
1653
1654struct RotateBwd
1655{
1656 // dest -> src: (x,y,z) -> (z,x,y)
1657 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1658 {
1659 return {i.z, i.x, i.y};
1660 }
1661
1662 // src -> dest: (x,y,z) -> (y,z,x)
1663 static constexpr Dim3 Inverse (Dim3 i)
1664 {
1665 return {i.y, i.z, i.x};
1666 }
1667
1668 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1669 {
1670 return it;
1671 }
1672
1673 static constexpr IndexType Inverse (IndexType it)
1674 {
1675 return it;
1676 }
1677};
1678
1680
1682namespace detail
1683{
    //! Helper for transforms on a sub-domain where one or more dimensions
    //! are degenerate (extent 1); it rearranges boxes, IntVects and arrays
    //! between the original and the reduced ordering.
    struct SubHelper
    {
        // Classify the domain's degenerate-dimension pattern (sets m_case).
        explicit SubHelper (Box const& domain);

        [[nodiscard]] Box make_box (Box const& box) const;

        [[nodiscard]] Periodicity make_periodicity (Periodicity const& period) const;

        [[nodiscard]] bool ghost_safe (IntVect const& ng) const;

        // This rearranges the order.
        [[nodiscard]] IntVect make_iv (IntVect const& iv) const;

        // This keeps the order, but zero out the values in the hidden dimension.
        [[nodiscard]] IntVect make_safe_ghost (IntVect const& ng) const;

        [[nodiscard]] BoxArray inverse_boxarray (BoxArray const& ba) const;

        [[nodiscard]] IntVect inverse_order (IntVect const& order) const;

        // Permute an indexable triple (GpuArray, IntVect, ...) according to
        // the degenerate-dimension case detected by the constructor.
        template <typename T>
        [[nodiscard]] T make_array (T const& a) const
        {
#if (AMREX_SPACEDIM == 1)
            // 1-D: nothing to rearrange.
            return a;
#elif (AMREX_SPACEDIM == 2)
            if (m_case == case_1n) {
                return T{a[1],a[0]};
            } else {
                return a;
            }
#else
            if (m_case == case_11n) {
                return T{a[2],a[0],a[1]};
            } else if (m_case == case_1n1) {
                return T{a[1],a[0],a[2]};
            } else if (m_case == case_1nn) {
                return T{a[1],a[2],a[0]};
            } else if (m_case == case_n1n) {
                return T{a[0],a[2],a[1]};
            } else {
                return a;
            }
#endif
        }

        [[nodiscard]] GpuArray<int,3> xyz_order () const;

        // Build a non-owning FabArray whose boxes are the rearranged
        // (make_box) versions of mf's boxes, aliasing mf's data.
        template <typename FA>
        FA make_alias_mf (FA const& mf)
        {
            BoxList bl = mf.boxArray().boxList();
            for (auto& b : bl) {
                b = make_box(b);
            }
            auto const& ng = make_iv(mf.nGrowVect());
            // SetAlloc(false): the FABs below point into mf's memory.
            FA submf(BoxArray(std::move(bl)), mf.DistributionMap(), mf.nComp(), ng, MFInfo{}.SetAlloc(false));
            using FAB = typename FA::fab_type;
            for (MFIter mfi(submf, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi) {
                submf.setFab(mfi, FAB(mfi.fabbox(), mf.nComp(), mf[mfi].dataPtr()));
            }
            return submf;
        }

#if (AMREX_SPACEDIM == 2)
        // case_1n: the first dimension is degenerate (nx == 1).
        enum Case { case_1n, case_other };
        int m_case = case_other;
#elif (AMREX_SPACEDIM == 3)
        // Each digit encodes whether that dimension is degenerate (1) or
        // not (n), e.g. case_1n1 means nx == 1, ny > 1, nz == 1.
        enum Case { case_11n, case_1n1, case_1nn, case_n1n, case_other };
        int m_case = case_other;
#endif
    };
1757}
1759
1760}
1761
1762#endif
#define AMREX_ENUM(CLASS,...)
Definition AMReX_Enum.H:208
#define AMREX_CUFFT_SAFE_CALL(call)
Definition AMReX_GpuError.H:92
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
amrex::ParmParse pp
Input file parser instance for the given namespace.
Definition AMReX_HypreIJIface.cpp:15
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1090
#define AMREX_D_TERM(a, b, c)
Definition AMReX_SPACE.H:172
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ IntVectND< dim > length() const noexcept
Return the length of the BoxND.
Definition AMReX_Box.H:154
Calculates the distribution of FABs to MPI processes.
Definition AMReX_DistributionMapping.H:43
An Integer Vector in dim-Dimensional Space.
Definition AMReX_IntVect.H:57
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:319
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
Arena * The_Arena()
Definition AMReX_Arena.cpp:783
Definition AMReX_FFT_Helper.H:52
Direction
Definition AMReX_FFT_Helper.H:54
Boundary
Definition AMReX_FFT_Helper.H:58
DomainStrategy
Definition AMReX_FFT_Helper.H:56
Kind
Definition AMReX_FFT_Helper.H:60
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:263
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:244
__host__ __device__ std::pair< double, double > sincospi(double x)
Return sin(pi*x) and cos(pi*x) given x.
Definition AMReX_Math.H:204
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IndexTypeND< 3 > IndexType
IndexType is an alias for amrex::IndexTypeND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:36
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:230
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_FFT_Helper.H:64
bool twod_mode
Definition AMReX_FFT_Helper.H:75
Info & setNumProcs(int n)
Cap the number of MPI ranks used by FFT.
Definition AMReX_FFT_Helper.H:131
bool oned_mode
We might have a special twod_mode: nx or ny == 1 && nz > 1.
Definition AMReX_FFT_Helper.H:78
int batch_size
Batched FFT size. Only support in R2C, not R2X.
Definition AMReX_FFT_Helper.H:81
Info & setDomainStrategy(DomainStrategy s)
Select how the domain is decomposed across MPI ranks.
Definition AMReX_FFT_Helper.H:92
DomainStrategy domain_strategy
Domain composition strategy.
Definition AMReX_FFT_Helper.H:66
int nprocs
Max number of processes to use.
Definition AMReX_FFT_Helper.H:84
int pencil_threshold
Definition AMReX_FFT_Helper.H:70
Info & setOneDMode(bool x)
Flag the degenerate 2-D mode (nx==1 or ny==1) that still batches along z.
Definition AMReX_FFT_Helper.H:117
Info & setBatchSize(int bsize)
Specify the batch size for FFT.
Definition AMReX_FFT_Helper.H:124
Info & setPencilThreshold(int t)
Override the slab→pencil break-even threshold for the automatic strategy.
Definition AMReX_FFT_Helper.H:99
Info & setTwoDMode(bool x)
Restrict transforms to the first two dimensions (3-D problems only).
Definition AMReX_FFT_Helper.H:106
Definition AMReX_FFT_Helper.H:180
void * pf
Definition AMReX_FFT_Helper.H:215
void unpack_r2r_buffer(T *pdst, void const *pbuf) const
Collapse the spectral R2R result back into the original real layout.
Definition AMReX_FFT_Helper.H:1120
std::conditional_t< std::is_same_v< float, T >, cuComplex, cuDoubleComplex > VendorComplex
Definition AMReX_FFT_Helper.H:184
VendorPlan plan2
Definition AMReX_FFT_Helper.H:214
void init_r2c(IntVectND< M > const &fft_size, void *pbf, void *pbb, bool cache, int ncomp=1)
Initialize an M-dimensional batched real-to-complex plan.
Definition AMReX_FFT_Helper.H:1365
int n
Definition AMReX_FFT_Helper.H:207
void destroy()
Release any vendor FFT plan objects owned by this Plan.
Definition AMReX_FFT_Helper.H:234
bool defined2
Definition AMReX_FFT_Helper.H:212
void init_r2r(Box const &box, VendorComplex *pc, std::pair< Boundary, Boundary > const &bc)
Initialize a real-to-real plan that reads/writes complex storage.
Definition AMReX_FFT_Helper.H:713
void pack_r2r_buffer(void *pbuf, T const *psrc) const
Expand the real R2R input into the symmetry-extended buffer expected by CUFFT/rocFFT.
Definition AMReX_FFT_Helper.H:882
static void free_scratch_space(void *p)
Release GPU scratch allocated via alloc_scratch_space().
Definition AMReX_FFT_Helper.H:874
static void destroy_vendor_plan(VendorPlan plan)
Helper that destroys a vendor plan of the appropriate backend type.
Definition AMReX_FFT_Helper.H:1330
Kind get_r2r_kind(std::pair< Boundary, Boundary > const &bc)
Map boundary conditions to the Plan Kind for real-to-real transforms.
Definition AMReX_FFT_Helper.H:576
cufftHandle VendorPlan
Definition AMReX_FFT_Helper.H:182
Kind kind
Definition AMReX_FFT_Helper.H:209
void init_c2c(Box const &box, VendorComplex *p, int ncomp=1, int ndims=1)
Initialize a complex-to-complex plan across 1/2/3 dimensions.
Definition AMReX_FFT_Helper.H:409
int howmany
Definition AMReX_FFT_Helper.H:208
void * pb
Definition AMReX_FFT_Helper.H:216
void init_r2c(Box const &box, T *pr, VendorComplex *pc, bool is_2d_transform=false, int ncomp=1)
Initialize a 1-D or 2-D real-to-complex plan over the supplied Box.
Definition AMReX_FFT_Helper.H:258
void compute_r2r()
Execute the real-to-real plan, including GPU packing/unpacking.
Definition AMReX_FFT_Helper.H:1273
void compute_c2c()
Execute the complex-to-complex plan in place.
Definition AMReX_FFT_Helper.H:813
bool r2r_data_is_complex
Definition AMReX_FFT_Helper.H:210
void * alloc_scratch_space() const
Allocate GPU scratch space large enough to hold packed R2R data.
Definition AMReX_FFT_Helper.H:857
VendorPlan plan
Definition AMReX_FFT_Helper.H:213
void compute_r2c()
Execute the previously initialized real-to-complex plan.
Definition AMReX_FFT_Helper.H:761
bool defined
Definition AMReX_FFT_Helper.H:211
void set_ptrs(void *p0, void *p1)
Register device pointers used by the forward/backward executions.
Definition AMReX_FFT_Helper.H:225
void init_r2r(Box const &box, T *p, std::pair< Boundary, Boundary > const &bc, int howmany_initval=1)
Initialize a real-to-real (cosine/sine) plan that operates on real buffers.
Definition AMReX_FFT_Helper.H:610
A host / device complex number type, because std::complex doesn't work in device code with Cuda yet.
Definition AMReX_GpuComplex.H:30