Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_FFT_Helper.H
Go to the documentation of this file.
1#ifndef AMREX_FFT_HELPER_H_
2#define AMREX_FFT_HELPER_H_
3#include <AMReX_Config.H>
4
5#include <AMReX.H>
6#include <AMReX_BLProfiler.H>
9#include <AMReX_Enum.H>
10#include <AMReX_FabArray.H>
11#include <AMReX_Gpu.H>
12#include <AMReX_GpuComplex.H>
13#include <AMReX_Geometry.H>
14#include <AMReX_Math.H>
15#include <AMReX_Periodicity.H>
16
23#if defined(AMREX_USE_CUDA)
24# include <cufft.h>
25# include <cuComplex.h>
26#elif defined(AMREX_USE_HIP)
27# if __has_include(<rocfft/rocfft.h>) // ROCm 5.3+
28# include <rocfft/rocfft.h>
29# else
30# include <rocfft.h>
31# endif
32# include <hip/hip_complex.h>
33#elif defined(AMREX_USE_SYCL)
34# if __has_include(<oneapi/mkl/dft.hpp>) // oneAPI 2025.0
35# include <oneapi/mkl/dft.hpp>
36#else
37# define AMREX_USE_MKL_DFTI_2024 1
38# include <oneapi/mkl/dfti.hpp>
39# endif
40#else
41# include <fftw3.h>
42#endif
43
44#include <algorithm>
45#include <complex>
46#include <limits>
47#include <memory>
48#include <tuple>
49#include <utility>
50#include <variant>
51
52namespace amrex::FFT
53{
54
55enum struct Direction { forward, backward, both, none };
56
58
60
63
72[[nodiscard]] constexpr int
74{
75#if defined(AMREX_USE_CUDA)
76 return 5;
77#else
78 return 6;
79#endif
80}
81
82struct Info
83{
86
90
94 bool twod_mode = false;
95
103 bool oned_mode = false;
104
106 int batch_size = 1;
107
109 int nprocs = std::numeric_limits<int>::max();
110
112 bool openbc_padding = true;
113
117
131 Info& setPencilThreshold (int t) { pencil_threshold = t; return *this; }
138 Info& setTwoDMode (bool x) { twod_mode = x; return *this; }
149 Info& setOneDMode (bool x) { oned_mode = x; return *this; }
156 Info& setBatchSize (int bsize) { batch_size = bsize; return *this; }
163 Info& setNumProcs (int n) { nprocs = n; return *this; }
170 Info& setOpenBCPadding (bool x) { openbc_padding = x; return *this; }
178 {
180 3 <= n && 6 >= n,
181 "amrex::FFT::Info::setOpenBCPaddingNumPrimeFactors: nfactors must be between 3 and 6");
183 return *this;
184 }
185};
186
188namespace detail
189{
190inline Geometry shift_geom (Geometry const& geom)
191{
192 auto const& dom = geom.Domain();
193 auto const& dlo = dom.smallEnd();
194 if (dlo == 0) {
195 return geom;
196 } else {
197 return Geometry(amrex::shift(dom,-dlo), geom.ProbDomain(), geom.CoordInt(),
198 geom.isPeriodic());
199 }
200}
201
202template <typename MF>
203void shift_mf (IntVect const& domain_lo, MF const& mf, MF& new_mf)
204{
205 BoxArray ba = mf.boxArray();
206 ba.shift(-domain_lo);
207 new_mf.define(ba, mf.DistributionMap(), mf.nComp(), mf.nGrowVect(),
208 MFInfo().SetAlloc(false));
209 for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
210 typename MF::fab_type fab(mf[mfi], amrex::make_alias, 0, mf.nComp());
211 fab.shift(-domain_lo);
212 new_mf.setFab(mfi, std::move(fab));
213 }
214}
215
216template <typename MF>
217void shift_mfs (IntVect const& domain_lo, MF const& mf1, MF const& mf2,
218 MF& new_mf1, MF& new_mf2)
219{
220 AMREX_ALWAYS_ASSERT(mf1.boxArray() == mf2.boxArray() &&
221 mf1.DistributionMap() == mf2.DistributionMap());
222 detail::shift_mf(domain_lo, mf1, new_mf1);
223 detail::shift_mf(domain_lo, mf2, new_mf2);
224}
225
226inline void next_fast_len_doit (Long target, Long n, int const* factors,
227 int nfactors, int ifactor, Long& best)
228{
229 if (n >= target) {
230 best = std::min(best, n);
231 return;
232 }
233
234 if (ifactor == nfactors) {
235 // Factors 2 and 3 are handled here: try each n * 3^b, multiply it
236 // by the smallest power of 2 that reaches target, and keep the best.
237 Long const max_m3 = std::numeric_limits<int>::max() / 3;
238 Long const max_m2 = std::numeric_limits<int>::max() / 2;
239 for (Long m3 = n; m3 < best; ) {
240 Long k = m3;
241 bool overflow = false;
242 while (k < target) {
243 if (k > max_m2) {
244 overflow = true;
245 break;
246 }
247 k *= 2;
248 }
249 if (!overflow && k < best) {
250 best = k;
251 }
252 if (m3 > max_m3) {
253 break;
254 }
255 m3 *= 3;
256 }
257 return;
258 }
259
260 int const factor = factors[ifactor];
261 Long const max_len = std::numeric_limits<int>::max();
262 Long const max_m = max_len / factor;
263 for (Long m = n; m < best; ) {
264 next_fast_len_doit(target, m, factors, nfactors, ifactor+1, best);
265 if (m > max_m) {
266 break;
267 }
268 m *= factor;
269 }
270}
271}
273
285[[nodiscard]] inline int
286nextFastLen (int target, int nfactors = FastNumPrimeFactors())
287{
289 "amrex::FFT::nextFastLen: target must be positive");
290 AMREX_ALWAYS_ASSERT_WITH_MESSAGE(3 <= nfactors && 6 >= nfactors,
291 "amrex::FFT::nextFastLen: nfactors must be between 3 and 6");
292
293 int constexpr factors[] = {5, 7, 11, 13, 17}; // The extra '17' is to make the buggy GCC -Warray-bounds quiet.
294
295 Long const max_len = std::numeric_limits<int>::max();
296 Long best = max_len + Long(1);
297
298 // Factors 2 and 3 are handled directly in the base case.
299 detail::next_fast_len_doit(target, Long(1), factors, nfactors-2, 0, best);
300
301 AMREX_ALWAYS_ASSERT_WITH_MESSAGE(best <= max_len,
302 "amrex::FFT::nextFastLen: result exceeds int range");
303 return static_cast<int>(best);
304}
305
306#ifdef AMREX_USE_HIP
308namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
310#endif
311
312#ifdef AMREX_USE_SYCL
314namespace detail
315{
316inline void assert_no_external_stream ()
317{
320 "SYCL FFT does not support external GPU streams.");
321}
322
323template <typename T, Direction direction, typename P, typename TI, typename TO>
324void sycl_execute (P* plan, TI* in, TO* out)
325{
326 assert_no_external_stream();
327#ifndef AMREX_USE_MKL_DFTI_2024
328 std::int64_t workspaceSize = 0;
329#else
330 std::size_t workspaceSize = 0;
331#endif
332 plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
333 &workspaceSize);
334 auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize);
335 plan->set_workspace(buffer);
336 sycl::event r;
337 if (std::is_same_v<TI,TO>) {
339 if constexpr (direction == Direction::forward) {
340 r = oneapi::mkl::dft::compute_forward(*plan, out);
341 } else {
342 r = oneapi::mkl::dft::compute_backward(*plan, out);
343 }
344 } else {
345 if constexpr (direction == Direction::forward) {
346 r = oneapi::mkl::dft::compute_forward(*plan, in, out);
347 } else {
348 r = oneapi::mkl::dft::compute_backward(*plan, in, out);
349 }
350 }
351 r.wait();
352 amrex::The_Arena()->free(buffer);
353}
354}
356#endif
357
358template <typename T>
359struct Plan
360{
361#if defined(AMREX_USE_CUDA)
362 using VendorPlan = cufftHandle;
363 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
364 cuComplex, cuDoubleComplex>;
365#elif defined(AMREX_USE_HIP)
366 using VendorPlan = rocfft_plan;
367 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
368 float2, double2>;
369#elif defined(AMREX_USE_SYCL)
370 using mkl_desc_r = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
371 ? oneapi::mkl::dft::precision::SINGLE
372 : oneapi::mkl::dft::precision::DOUBLE,
373 oneapi::mkl::dft::domain::REAL>;
374 using mkl_desc_c = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
375 ? oneapi::mkl::dft::precision::SINGLE
376 : oneapi::mkl::dft::precision::DOUBLE,
377 oneapi::mkl::dft::domain::COMPLEX>;
378 using VendorPlan = std::variant<mkl_desc_r*,mkl_desc_c*>;
379 using VendorComplex = std::complex<T>;
380#else
381 using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
382 fftwf_plan, fftw_plan>;
383 using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
384 fftwf_complex, fftw_complex>;
385#endif
386
387 int n = 0;
388 int howmany = 0;
391 bool defined = false;
392 bool defined2 = false;
395 void* pf = nullptr;
396 void* pb = nullptr;
397
398#ifdef AMREX_USE_GPU
405 void set_ptrs (void* p0, void* p1) {
406 pf = p0;
407 pb = p1;
408 }
409#endif
410
414 void destroy ()
415 {
416 if (defined) {
418 defined = false;
419 }
420#if !defined(AMREX_USE_GPU)
421 if (defined2) {
423 defined2 = false;
424 }
425#endif
426 }
427
437 template <Direction D>
438 void init_r2c (Box const& box, T* pr, VendorComplex* pc, bool is_2d_transform = false, int ncomp = 1)
439 {
440 static_assert(D == Direction::forward || D == Direction::backward);
441
442 int rank = is_2d_transform ? 2 : 1;
443
445 defined = true;
446 pf = (void*)pr;
447 pb = (void*)pc;
448
449 int len[2] = {};
450 if (rank == 1) {
451 len[0] = box.length(0);
452 len[1] = box.length(0); // Not used except for HIP. Yes it's `(0)`.
453 } else {
454 len[0] = box.length(1); // Most FFT libraries assume row-major ordering
455 len[1] = box.length(0); // except for rocfft
456 }
457 int nr = (rank == 1) ? len[0] : len[0]*len[1];
458 n = nr;
459 int nc = (rank == 1) ? (len[0]/2+1) : (len[1]/2+1)*len[0];
460#if (AMREX_SPACEDIM == 1)
461 howmany = 1;
462#else
463 howmany = (rank == 1) ? AMREX_D_TERM(1, *box.length(1), *box.length(2))
464 : AMREX_D_TERM(1, *1 , *box.length(2));
465#endif
466 howmany *= ncomp;
467
469
470#if defined(AMREX_USE_CUDA)
471
472 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
473 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
474 std::size_t work_size;
475 if constexpr (D == Direction::forward) {
476 cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
478 (cufftMakePlanMany(plan, rank, len, nullptr, 1, nr, nullptr, 1, nc, fwd_type, howmany, &work_size));
479 } else {
480 cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
482 (cufftMakePlanMany(plan, rank, len, nullptr, 1, nc, nullptr, 1, nr, bwd_type, howmany, &work_size));
483 }
484
485#elif defined(AMREX_USE_HIP)
486
487 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
488 // switch to column-major ordering
489 std::size_t length[2] = {std::size_t(len[1]), std::size_t(len[0])};
490 if constexpr (D == Direction::forward) {
491 AMREX_ROCFFT_SAFE_CALL
492 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
493 rocfft_transform_type_real_forward, prec, rank,
494 length, howmany, nullptr));
495 } else {
496 AMREX_ROCFFT_SAFE_CALL
497 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
498 rocfft_transform_type_real_inverse, prec, rank,
499 length, howmany, nullptr));
500 }
501
502#elif defined(AMREX_USE_SYCL)
503
504 mkl_desc_r* pp;
505 if (rank == 1) {
506 pp = new mkl_desc_r(len[0]);
507 } else {
508 pp = new mkl_desc_r({std::int64_t(len[0]), std::int64_t(len[1])});
509 }
510#ifndef AMREX_USE_MKL_DFTI_2024
511 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
512 oneapi::mkl::dft::config_value::NOT_INPLACE);
513#else
514 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
515#endif
516 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
517 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nr);
518 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
519 std::vector<std::int64_t> strides;
520 strides.push_back(0);
521 if (rank == 2) { strides.push_back(len[1]); }
522 strides.push_back(1);
523#ifndef AMREX_USE_MKL_DFTI_2024
524 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
525 // Do not set BWD_STRIDES
526#else
527 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
528 // Do not set BWD_STRIDES
529#endif
530 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
531 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
532 detail::assert_no_external_stream();
533 pp->commit(amrex::Gpu::Device::streamQueue());
534 plan = pp;
535
536#else /* FFTW */
537
538 if constexpr (std::is_same_v<float,T>) {
539 if constexpr (D == Direction::forward) {
540 plan = fftwf_plan_many_dft_r2c
541 (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
542 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
543 } else {
544 plan = fftwf_plan_many_dft_c2r
545 (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
546 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
547 }
548 } else {
549 if constexpr (D == Direction::forward) {
550 plan = fftw_plan_many_dft_r2c
551 (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
552 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
553 } else {
554 plan = fftw_plan_many_dft_c2r
555 (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
556 FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
557 }
558 }
559#endif
560 }
561
571 template <Direction D, int M>
572 void init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache, int ncomp = 1);
573
589 template <Direction D>
590 void init_c2c (Box const& box, VendorComplex* p, int ncomp = 1, int ndims = 1)
591 {
592 static_assert(D == Direction::forward || D == Direction::backward);
593
595 defined = true;
596 pf = (void*)p;
597 pb = (void*)p;
598
599 int len[3] = {};
600
601 if (ndims == 1) {
602 n = box.length(0);
603 howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
604 howmany *= ncomp;
605 len[0] = box.length(0);
606 }
607#if (AMREX_SPACEDIM >= 2)
608 else if (ndims == 2) {
609 n = box.length(0) * box.length(1);
610#if (AMREX_SPACEDIM == 2)
611 howmany = ncomp;
612#else
613 howmany = box.length(2) * ncomp;
614#endif
615 len[0] = box.length(1);
616 len[1] = box.length(0);
617 }
618#if (AMREX_SPACEDIM == 3)
619 else if (ndims == 3) {
620 n = box.length(0) * box.length(1) * box.length(2);
621 howmany = ncomp;
622 len[0] = box.length(2);
623 len[1] = box.length(1);
624 len[2] = box.length(0);
625 }
626#endif
627#endif
628
629#if defined(AMREX_USE_CUDA)
630 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
631 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
632
633 cufftType t = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
634 std::size_t work_size;
636 (cufftMakePlanMany(plan, ndims, len, nullptr, 1, n, nullptr, 1, n, t, howmany, &work_size));
637
638#elif defined(AMREX_USE_HIP)
639
640 auto prec = std::is_same_v<float,T> ? rocfft_precision_single
641 : rocfft_precision_double;
642 auto dir= (D == Direction::forward) ? rocfft_transform_type_complex_forward
643 : rocfft_transform_type_complex_inverse;
644 std::size_t length[3];
645 if (ndims == 1) {
646 length[0] = len[0];
647 } else if (ndims == 2) {
648 length[0] = len[1];
649 length[1] = len[0];
650 } else {
651 length[0] = len[2];
652 length[1] = len[1];
653 length[2] = len[0];
654 }
655 AMREX_ROCFFT_SAFE_CALL
656 (rocfft_plan_create(&plan, rocfft_placement_inplace, dir, prec, ndims,
657 length, howmany, nullptr));
658
659#elif defined(AMREX_USE_SYCL)
660
661 mkl_desc_c* pp;
662 if (ndims == 1) {
663 pp = new mkl_desc_c(n);
664 } else if (ndims == 2) {
665 pp = new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1])});
666 } else {
667 pp = new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1]), std::int64_t(len[2])});
668 }
669#ifndef AMREX_USE_MKL_DFTI_2024
670 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
671 oneapi::mkl::dft::config_value::INPLACE);
672#else
673 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
674#endif
675 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
676 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
677 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
678 std::vector<std::int64_t> strides(ndims+1);
679 strides[0] = 0;
680 strides[ndims] = 1;
681 for (int i = ndims-1; i >= 1; --i) {
682 strides[i] = strides[i+1] * len[i];
683 }
684#ifndef AMREX_USE_MKL_DFTI_2024
685 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
686 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
687#else
688 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
689 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
690#endif
691 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
692 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
693 detail::assert_no_external_stream();
694 pp->commit(amrex::Gpu::Device::streamQueue());
695 plan = pp;
696
697#else /* FFTW */
698
699 if constexpr (std::is_same_v<float,T>) {
700 if constexpr (D == Direction::forward) {
701 plan = fftwf_plan_many_dft
702 (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, -1,
703 FFTW_ESTIMATE);
704 } else {
705 plan = fftwf_plan_many_dft
706 (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, +1,
707 FFTW_ESTIMATE);
708 }
709 } else {
710 if constexpr (D == Direction::forward) {
711 plan = fftw_plan_many_dft
712 (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, -1,
713 FFTW_ESTIMATE);
714 } else {
715 plan = fftw_plan_many_dft
716 (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, +1,
717 FFTW_ESTIMATE);
718 }
719 }
720#endif
721 }
722
723#ifndef AMREX_USE_GPU
724 template <Direction D>
725 fftw_r2r_kind get_fftw_kind (std::pair<Boundary,Boundary> const& bc)
726 {
727 if (bc.first == Boundary::even && bc.second == Boundary::even)
728 {
729 return (D == Direction::forward) ? FFTW_REDFT10 : FFTW_REDFT01;
730 }
731 else if (bc.first == Boundary::even && bc.second == Boundary::odd)
732 {
733 return FFTW_REDFT11;
734 }
735 else if (bc.first == Boundary::odd && bc.second == Boundary::even)
736 {
737 return FFTW_RODFT11;
738 }
739 else if (bc.first == Boundary::odd && bc.second == Boundary::odd)
740 {
741 return (D == Direction::forward) ? FFTW_RODFT10 : FFTW_RODFT01;
742 }
743 else {
744 amrex::Abort("FFT: unsupported BC");
745 return fftw_r2r_kind{};
746 }
747
748 }
749#endif
750
757 template <Direction D>
758 Kind get_r2r_kind (std::pair<Boundary,Boundary> const& bc)
759 {
760 if (bc.first == Boundary::even && bc.second == Boundary::even)
761 {
763 }
764 else if (bc.first == Boundary::even && bc.second == Boundary::odd)
765 {
766 return Kind::r2r_eo;
767 }
768 else if (bc.first == Boundary::odd && bc.second == Boundary::even)
769 {
770 return Kind::r2r_oe;
771 }
772 else if (bc.first == Boundary::odd && bc.second == Boundary::odd)
773 {
775 }
776 else {
777 amrex::Abort("FFT: unsupported BC");
778 return Kind::none;
779 }
780
781 }
782
791 template <Direction D>
792 void init_r2r (Box const& box, T* p, std::pair<Boundary,Boundary> const& bc,
793 int howmany_initval = 1)
794 {
795 static_assert(D == Direction::forward || D == Direction::backward);
796
797 kind = get_r2r_kind<D>(bc);
798 defined = true;
799 pf = (void*)p;
800 pb = (void*)p;
801
802 n = box.length(0);
803 howmany = AMREX_D_TERM(howmany_initval, *box.length(1), *box.length(2));
804
805#if defined(AMREX_USE_GPU)
806 int nex=0;
807 if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
808 Direction::forward == D) {
809 nex = 2*n;
810 } else if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
811 Direction::backward == D) {
812 nex = 4*n;
813 } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
814 Direction::forward == D) {
815 nex = 2*n;
816 } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
817 Direction::backward == D) {
818 nex = 4*n;
819 } else if ((bc.first == Boundary::even && bc.second == Boundary::odd) ||
820 (bc.first == Boundary::odd && bc.second == Boundary::even)) {
821 nex = 4*n;
822 } else {
823 amrex::Abort("FFT: unsupported BC");
824 }
825 int nc = (nex/2) + 1;
826
827#if defined (AMREX_USE_CUDA)
828
829 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
830 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
831 cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
832 std::size_t work_size;
834 (cufftMakePlanMany(plan, 1, &nex, nullptr, 1, nc*2, nullptr, 1, nc, fwd_type, howmany, &work_size));
835
836#elif defined(AMREX_USE_HIP)
837
839 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
840 const std::size_t length = nex;
841 AMREX_ROCFFT_SAFE_CALL
842 (rocfft_plan_create(&plan, rocfft_placement_inplace,
843 rocfft_transform_type_real_forward, prec, 1,
844 &length, howmany, nullptr));
845
846#elif defined(AMREX_USE_SYCL)
847
848 auto* pp = new mkl_desc_r(nex);
849#ifndef AMREX_USE_MKL_DFTI_2024
850 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
851 oneapi::mkl::dft::config_value::INPLACE);
852#else
853 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
854#endif
855 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
856 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nc*2);
857 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
858 std::vector<std::int64_t> strides = {0,1};
859#ifndef AMREX_USE_MKL_DFTI_2024
860 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
861 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
862#else
863 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
864 pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
865#endif
866 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
867 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
868 detail::assert_no_external_stream();
869 pp->commit(amrex::Gpu::Device::streamQueue());
870 plan = pp;
871
872#endif
873
874#else /* FFTW */
875 auto fftw_kind = get_fftw_kind<D>(bc);
876 if constexpr (std::is_same_v<float,T>) {
877 plan = fftwf_plan_many_r2r
878 (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
879 FFTW_ESTIMATE);
880 } else {
881 plan = fftw_plan_many_r2r
882 (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
883 FFTW_ESTIMATE);
884 }
885#endif
886 }
887
895 template <Direction D>
896 void init_r2r (Box const& box, VendorComplex* pc,
897 std::pair<Boundary,Boundary> const& bc)
898 {
899 static_assert(D == Direction::forward || D == Direction::backward);
900
901 auto* p = (T*)pc;
902
903#if defined(AMREX_USE_GPU)
904
905 init_r2r<D>(box, p, bc, 2);
906 r2r_data_is_complex = true;
907
908#else
909
910 kind = get_r2r_kind<D>(bc);
911 defined = true;
912 pf = (void*)p;
913 pb = (void*)p;
914
915 n = box.length(0);
916 howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
917
918 defined2 = true;
919 auto fftw_kind = get_fftw_kind<D>(bc);
920 if constexpr (std::is_same_v<float,T>) {
921 plan = fftwf_plan_many_r2r
922 (1, &n, howmany, p, nullptr, 2, n*2, p, nullptr, 2, n*2, &fftw_kind,
923 FFTW_ESTIMATE);
924 plan2 = fftwf_plan_many_r2r
925 (1, &n, howmany, p+1, nullptr, 2, n*2, p+1, nullptr, 2, n*2, &fftw_kind,
926 FFTW_ESTIMATE);
927 } else {
928 plan = fftw_plan_many_r2r
929 (1, &n, howmany, p, nullptr, 2, n*2, p, nullptr, 2, n*2, &fftw_kind,
930 FFTW_ESTIMATE);
931 plan2 = fftw_plan_many_r2r
932 (1, &n, howmany, p+1, nullptr, 2, n*2, p+1, nullptr, 2, n*2, &fftw_kind,
933 FFTW_ESTIMATE);
934 }
935#endif
936 }
937
943 template <Direction D>
945 {
946 static_assert(D == Direction::forward || D == Direction::backward);
947 if (!defined) { return; }
948
949 using TI = std::conditional_t<(D == Direction::forward), T, VendorComplex>;
950 using TO = std::conditional_t<(D == Direction::backward), T, VendorComplex>;
951 auto* pi = (TI*)((D == Direction::forward) ? pf : pb);
952 auto* po = (TO*)((D == Direction::forward) ? pb : pf);
953
954#if defined(AMREX_USE_CUDA)
955 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
956
957 std::size_t work_size = 0;
958 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
959
960 auto* work_area = The_Arena()->alloc(work_size);
961 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
962
963 if constexpr (D == Direction::forward) {
964 if constexpr (std::is_same_v<float,T>) {
965 AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, pi, po));
966 } else {
967 AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, pi, po));
968 }
969 } else {
970 if constexpr (std::is_same_v<float,T>) {
971 AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan, pi, po));
972 } else {
973 AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan, pi, po));
974 }
975 }
977 The_Arena()->free(work_area);
978#elif defined(AMREX_USE_HIP)
979 detail::hip_execute(plan, (void**)&pi, (void**)&po);
980#elif defined(AMREX_USE_SYCL)
981 detail::sycl_execute<T,D>(std::get<0>(plan), pi, po);
982#else
984 if constexpr (std::is_same_v<float,T>) {
985 fftwf_execute(plan);
986 } else {
987 fftw_execute(plan);
988 }
989#endif
990 }
991
995 template <Direction D>
997 {
998 static_assert(D == Direction::forward || D == Direction::backward);
999 if (!defined) { return; }
1000
1001 auto* p = (VendorComplex*)pf;
1002
1003#if defined(AMREX_USE_CUDA)
1004 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
1005
1006 std::size_t work_size = 0;
1007 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
1008
1009 auto* work_area = The_Arena()->alloc(work_size);
1010 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
1011
1012 auto dir = (D == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
1013 if constexpr (std::is_same_v<float,T>) {
1014 AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan, p, p, dir));
1015 } else {
1016 AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan, p, p, dir));
1017 }
1019 The_Arena()->free(work_area);
1020#elif defined(AMREX_USE_HIP)
1021 detail::hip_execute(plan, (void**)&p, (void**)&p);
1022#elif defined(AMREX_USE_SYCL)
1023 detail::sycl_execute<T,D>(std::get<1>(plan), p, p);
1024#else
1026 if constexpr (std::is_same_v<float,T>) {
1027 fftwf_execute(plan);
1028 } else {
1029 fftw_execute(plan);
1030 }
1031#endif
1032 }
1033
1034#ifdef AMREX_USE_GPU
1040 [[nodiscard]] void* alloc_scratch_space () const
1041 {
1042 int nc = 0;
1043 if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {
1044 nc = n + 1;
1045 } else if (kind == Kind::r2r_oo_b || kind == Kind::r2r_ee_b ||
1047 nc = 2*n+1;
1048 } else {
1049 amrex::Abort("FFT: alloc_scratch_space: unsupported kind");
1050 }
1051 return The_Arena()->alloc(sizeof(GpuComplex<T>)*nc*howmany);
1052 }
1053
1057 static void free_scratch_space (void* p) { The_Arena()->free(p); }
1058
1065 void pack_r2r_buffer (void* pbuf, T const* psrc) const
1066 {
1067 auto* pdst = (T*) pbuf;
1068 if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {
1069 T sign = (kind == Kind::r2r_oo_f) ? T(-1) : T(1);
1070 int ostride = (n+1)*2;
1071 int istride = n;
1072 int nex = 2*n;
1073 int norig = n;
1074 Long nelems = Long(nex)*howmany;
1075 if (r2r_data_is_complex) {
1076 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1077 {
1078 auto batch = ielem / Long(nex);
1079 auto i = int(ielem - batch*nex);
1080 for (int ir = 0; ir < 2; ++ir) {
1081 auto* po = pdst + (2*batch+ir)*ostride + i;
1082 auto const* pi = psrc + 2*batch*istride + ir;
1083 if (i < norig) {
1084 *po = pi[i*2];
1085 } else {
1086 *po = sign * pi[(2*norig-1-i)*2];
1087 }
1088 }
1089 });
1090 } else {
1091 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1092 {
1093 auto batch = ielem / Long(nex);
1094 auto i = int(ielem - batch*nex);
1095 auto* po = pdst + batch*ostride + i;
1096 auto const* pi = psrc + batch*istride;
1097 if (i < norig) {
1098 *po = pi[i];
1099 } else {
1100 *po = sign * pi[2*norig-1-i];
1101 }
1102 });
1103 }
1104 } else if (kind == Kind::r2r_oo_b) {
1105 int ostride = (2*n+1)*2;
1106 int istride = n;
1107 int nex = 4*n;
1108 int norig = n;
1109 Long nelems = Long(nex)*howmany;
1110 if (r2r_data_is_complex) {
1111 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1112 {
1113 auto batch = ielem / Long(nex);
1114 auto i = int(ielem - batch*nex);
1115 for (int ir = 0; ir < 2; ++ir) {
1116 auto* po = pdst + (2*batch+ir)*ostride + i;
1117 auto const* pi = psrc + 2*batch*istride + ir;
1118 if (i < norig) {
1119 *po = pi[i*2];
1120 } else if (i < (2*norig-1)) {
1121 *po = pi[(2*norig-2-i)*2];
1122 } else if (i == (2*norig-1)) {
1123 *po = T(0);
1124 } else if (i < (3*norig)) {
1125 *po = -pi[(i-2*norig)*2];
1126 } else if (i < (4*norig-1)) {
1127 *po = -pi[(4*norig-2-i)*2];
1128 } else {
1129 *po = T(0);
1130 }
1131 }
1132 });
1133 } else {
1134 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1135 {
1136 auto batch = ielem / Long(nex);
1137 auto i = int(ielem - batch*nex);
1138 auto* po = pdst + batch*ostride + i;
1139 auto const* pi = psrc + batch*istride;
1140 if (i < norig) {
1141 *po = pi[i];
1142 } else if (i < (2*norig-1)) {
1143 *po = pi[2*norig-2-i];
1144 } else if (i == (2*norig-1)) {
1145 *po = T(0);
1146 } else if (i < (3*norig)) {
1147 *po = -pi[i-2*norig];
1148 } else if (i < (4*norig-1)) {
1149 *po = -pi[4*norig-2-i];
1150 } else {
1151 *po = T(0);
1152 }
1153 });
1154 }
1155 } else if (kind == Kind::r2r_ee_b) {
1156 int ostride = (2*n+1)*2;
1157 int istride = n;
1158 int nex = 4*n;
1159 int norig = n;
1160 Long nelems = Long(nex)*howmany;
1161 if (r2r_data_is_complex) {
1162 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1163 {
1164 auto batch = ielem / Long(nex);
1165 auto i = int(ielem - batch*nex);
1166 for (int ir = 0; ir < 2; ++ir) {
1167 auto* po = pdst + (2*batch+ir)*ostride + i;
1168 auto const* pi = psrc + 2*batch*istride + ir;
1169 if (i < norig) {
1170 *po = pi[i*2];
1171 } else if (i == norig) {
1172 *po = T(0);
1173 } else if (i < (2*norig+1)) {
1174 *po = -pi[(2*norig-i)*2];
1175 } else if (i < (3*norig)) {
1176 *po = -pi[(i-2*norig)*2];
1177 } else if (i == 3*norig) {
1178 *po = T(0);
1179 } else {
1180 *po = pi[(4*norig-i)*2];
1181 }
1182 }
1183 });
1184 } else {
1185 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1186 {
1187 auto batch = ielem / Long(nex);
1188 auto i = int(ielem - batch*nex);
1189 auto* po = pdst + batch*ostride + i;
1190 auto const* pi = psrc + batch*istride;
1191 if (i < norig) {
1192 *po = pi[i];
1193 } else if (i == norig) {
1194 *po = T(0);
1195 } else if (i < (2*norig+1)) {
1196 *po = -pi[2*norig-i];
1197 } else if (i < (3*norig)) {
1198 *po = -pi[i-2*norig];
1199 } else if (i == 3*norig) {
1200 *po = T(0);
1201 } else {
1202 *po = pi[4*norig-i];
1203 }
1204 });
1205 }
1206 } else if (kind == Kind::r2r_eo) {
1207 int ostride = (2*n+1)*2;
1208 int istride = n;
1209 int nex = 4*n;
1210 int norig = n;
1211 Long nelems = Long(nex)*howmany;
1212 if (r2r_data_is_complex) {
1213 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1214 {
1215 auto batch = ielem / Long(nex);
1216 auto i = int(ielem - batch*nex);
1217 for (int ir = 0; ir < 2; ++ir) {
1218 auto* po = pdst + (2*batch+ir)*ostride + i;
1219 auto const* pi = psrc + 2*batch*istride + ir;
1220 if (i < norig) {
1221 *po = pi[i*2];
1222 } else if (i < (2*norig)) {
1223 *po = -pi[(2*norig-1-i)*2];
1224 } else if (i < (3*norig)) {
1225 *po = -pi[(i-2*norig)*2];
1226 } else {
1227 *po = pi[(4*norig-1-i)*2];
1228 }
1229 }
1230 });
1231 } else {
1232 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1233 {
1234 auto batch = ielem / Long(nex);
1235 auto i = int(ielem - batch*nex);
1236 auto* po = pdst + batch*ostride + i;
1237 auto const* pi = psrc + batch*istride;
1238 if (i < norig) {
1239 *po = pi[i];
1240 } else if (i < (2*norig)) {
1241 *po = -pi[2*norig-1-i];
1242 } else if (i < (3*norig)) {
1243 *po = -pi[i-2*norig];
1244 } else {
1245 *po = pi[4*norig-1-i];
1246 }
1247 });
1248 }
1249 } else if (kind == Kind::r2r_oe) {
1250 int ostride = (2*n+1)*2;
1251 int istride = n;
1252 int nex = 4*n;
1253 int norig = n;
1254 Long nelems = Long(nex)*howmany;
1255 if (r2r_data_is_complex) {
1256 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1257 {
1258 auto batch = ielem / Long(nex);
1259 auto i = int(ielem - batch*nex);
1260 for (int ir = 0; ir < 2; ++ir) {
1261 auto* po = pdst + (2*batch+ir)*ostride + i;
1262 auto const* pi = psrc + 2*batch*istride + ir;
1263 if (i < norig) {
1264 *po = pi[i*2];
1265 } else if (i < (2*norig)) {
1266 *po = pi[(2*norig-1-i)*2];
1267 } else if (i < (3*norig)) {
1268 *po = -pi[(i-2*norig)*2];
1269 } else {
1270 *po = -pi[(4*norig-1-i)*2];
1271 }
1272 }
1273 });
1274 } else {
1275 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1276 {
1277 auto batch = ielem / Long(nex);
1278 auto i = int(ielem - batch*nex);
1279 auto* po = pdst + batch*ostride + i;
1280 auto const* pi = psrc + batch*istride;
1281 if (i < norig) {
1282 *po = pi[i];
1283 } else if (i < (2*norig)) {
1284 *po = pi[2*norig-1-i];
1285 } else if (i < (3*norig)) {
1286 *po = -pi[i-2*norig];
1287 } else {
1288 *po = -pi[4*norig-1-i];
1289 }
1290 });
1291 }
1292 } else {
1293 amrex::Abort("FFT: pack_r2r_buffer: unsupported kind");
1294 }
1295 }
1296
1303 void unpack_r2r_buffer (T* pdst, void const* pbuf) const
1304 {
1305 auto const* psrc = (GpuComplex<T> const*) pbuf;
1306 int norig = n;
1307 Long nelems = Long(norig)*howmany;
1308 int ostride = n;
1309
1310 if (kind == Kind::r2r_oo_f) {
1311 int istride = n+1;
1312 if (r2r_data_is_complex) {
1313 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1314 {
1315 auto batch = ielem / Long(norig);
1316 auto k = int(ielem - batch*norig);
1317 auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
1318 for (int ir = 0; ir < 2; ++ir) {
1319 auto const& yk = psrc[(2*batch+ir)*istride+k+1];
1320 pdst[2*batch*ostride+ir+k*2] = s * yk.real() - c * yk.imag();
1321 }
1322 });
1323 } else {
1324 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1325 {
1326 auto batch = ielem / Long(norig);
1327 auto k = int(ielem - batch*norig);
1328 auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
1329 auto const& yk = psrc[batch*istride+k+1];
1330 pdst[batch*ostride+k] = s * yk.real() - c * yk.imag();
1331 });
1332 }
1333 } else if (kind == Kind::r2r_oo_b) {
1334 int istride = 2*n+1;
1335 if (r2r_data_is_complex) {
1336 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1337 {
1338 auto batch = ielem / Long(norig);
1339 auto k = int(ielem - batch*norig);
1340 auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
1341 for (int ir = 0; ir < 2; ++ir) {
1342 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1343 pdst[2*batch*ostride+ir+k*2] = T(0.5)*(s * yk.real() - c * yk.imag());
1344 }
1345 });
1346 } else {
1347 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1348 {
1349 auto batch = ielem / Long(norig);
1350 auto k = int(ielem - batch*norig);
1351 auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
1352 auto const& yk = psrc[batch*istride+2*k+1];
1353 pdst[batch*ostride+k] = T(0.5)*(s * yk.real() - c * yk.imag());
1354 });
1355 }
1356 } else if (kind == Kind::r2r_ee_f) {
1357 int istride = n+1;
1358 if (r2r_data_is_complex) {
1359 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1360 {
1361 auto batch = ielem / Long(norig);
1362 auto k = int(ielem - batch*norig);
1363 auto [s, c] = Math::sincospi(T(k)/T(2*norig));
1364 for (int ir = 0; ir < 2; ++ir) {
1365 auto const& yk = psrc[(2*batch+ir)*istride+k];
1366 pdst[2*batch*ostride+ir+k*2] = c * yk.real() + s * yk.imag();
1367 }
1368 });
1369 } else {
1370 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1371 {
1372 auto batch = ielem / Long(norig);
1373 auto k = int(ielem - batch*norig);
1374 auto [s, c] = Math::sincospi(T(k)/T(2*norig));
1375 auto const& yk = psrc[batch*istride+k];
1376 pdst[batch*ostride+k] = c * yk.real() + s * yk.imag();
1377 });
1378 }
1379 } else if (kind == Kind::r2r_ee_b) {
1380 int istride = 2*n+1;
1381 if (r2r_data_is_complex) {
1382 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1383 {
1384 auto batch = ielem / Long(norig);
1385 auto k = int(ielem - batch*norig);
1386 for (int ir = 0; ir < 2; ++ir) {
1387 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1388 pdst[2*batch*ostride+ir+k*2] = T(0.5) * yk.real();
1389 }
1390 });
1391 } else {
1392 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1393 {
1394 auto batch = ielem / Long(norig);
1395 auto k = int(ielem - batch*norig);
1396 auto const& yk = psrc[batch*istride+2*k+1];
1397 pdst[batch*ostride+k] = T(0.5) * yk.real();
1398 });
1399 }
1400 } else if (kind == Kind::r2r_eo) {
1401 int istride = 2*n+1;
1402 if (r2r_data_is_complex) {
1403 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1404 {
1405 auto batch = ielem / Long(norig);
1406 auto k = int(ielem - batch*norig);
1407 auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
1408 for (int ir = 0; ir < 2; ++ir) {
1409 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1410 pdst[2*batch*ostride+ir+k*2] = T(0.5) * (c * yk.real() + s * yk.imag());
1411 }
1412 });
1413 } else {
1414 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1415 {
1416 auto batch = ielem / Long(norig);
1417 auto k = int(ielem - batch*norig);
1418 auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
1419 auto const& yk = psrc[batch*istride+2*k+1];
1420 pdst[batch*ostride+k] = T(0.5) * (c * yk.real() + s * yk.imag());
1421 });
1422 }
1423 } else if (kind == Kind::r2r_oe) {
1424 int istride = 2*n+1;
1425 if (r2r_data_is_complex) {
1426 ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
1427 {
1428 auto batch = ielem / Long(norig);
1429 auto k = int(ielem - batch*norig);
1430 auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
1431 for (int ir = 0; ir < 2; ++ir) {
1432 auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
1433 pdst[2*batch*ostride+ir+k*2] = T(0.5) * (s * yk.real() - c * yk.imag());
1434 }
1435 });
1436 } else {
1437 ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
1438 {
1439 auto batch = ielem / Long(norig);
1440 auto k = int(ielem - batch*norig);
1441 auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
1442 auto const& yk = psrc[batch*istride+2*k+1];
1443 pdst[batch*ostride+k] = T(0.5) * (s * yk.real() - c * yk.imag());
1444 });
1445 }
1446 } else {
1447 amrex::Abort("FFT: unpack_r2r_buffer: unsupported kind");
1448 }
1449 }
1450#endif
1451
1455 template <Direction D>
1457 {
1458 static_assert(D == Direction::forward || D == Direction::backward);
1459 if (!defined) { return; }
1460
1461#if defined(AMREX_USE_GPU)
1462
1463 auto* pscratch = alloc_scratch_space();
1464
1465 pack_r2r_buffer(pscratch, (T*)((D == Direction::forward) ? pf : pb));
1466
1467#if defined(AMREX_USE_CUDA)
1468
1469 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
1470
1471 std::size_t work_size = 0;
1472 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
1473
1474 auto* work_area = The_Arena()->alloc(work_size);
1475 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
1476
1477 if constexpr (std::is_same_v<float,T>) {
1478 AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, (T*)pscratch, (VendorComplex*)pscratch));
1479 } else {
1480 AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, (T*)pscratch, (VendorComplex*)pscratch));
1481 }
1482
1483#elif defined(AMREX_USE_HIP)
1484 detail::hip_execute(plan, (void**)&pscratch, (void**)&pscratch);
1485#elif defined(AMREX_USE_SYCL)
1486 detail::sycl_execute<T,Direction::forward>(std::get<0>(plan), (T*)pscratch, (VendorComplex*)pscratch);
1487#endif
1488
1489 unpack_r2r_buffer((T*)((D == Direction::forward) ? pb : pf), pscratch);
1490
1492 free_scratch_space(pscratch);
1493#if defined(AMREX_USE_CUDA)
1494 The_Arena()->free(work_area);
1495#endif
1496
1497#else /* FFTW */
1498
1499 if constexpr (std::is_same_v<float,T>) {
1500 fftwf_execute(plan);
1501 if (defined2) { fftwf_execute(plan2); }
1502 } else {
1503 fftw_execute(plan);
1504 if (defined2) { fftw_execute(plan2); }
1505 }
1506
1507#endif
1508 }
1509
1514 {
1515#if defined(AMREX_USE_CUDA)
1516 AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan));
1517#elif defined(AMREX_USE_HIP)
1518 AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan));
1519#elif defined(AMREX_USE_SYCL)
1520 std::visit([](auto&& p) { delete p; }, plan);
1521#else
1522 if constexpr (std::is_same_v<float,T>) {
1523 fftwf_destroy_plan(plan);
1524 } else {
1525 fftw_destroy_plan(plan);
1526 }
1527#endif
1528 }
1529};
1530
1532
1533using Key = std::tuple<IntVectND<3>,int,Direction,Kind>;
1534using PlanD = typename Plan<double>::VendorPlan;
1535using PlanF = typename Plan<float>::VendorPlan;
1536
1537namespace detail {
1538 PlanD* get_vendor_plan_d (Key const& key);
1539 PlanF* get_vendor_plan_f (Key const& key);
1540
1541 void add_vendor_plan_d (Key const& key, PlanD plan);
1542 void add_vendor_plan_f (Key const& key, PlanF plan);
1543}
1545
1546template <typename T>
1547template <Direction D, int M>
1548void Plan<T>::init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache, int ncomp)
1549{
1550 static_assert(D == Direction::forward || D == Direction::backward);
1551
1552 kind = (D == Direction::forward) ? Kind::r2c_f : Kind::r2c_b;
1553 defined = true;
1554 pf = pbf;
1555 pb = pbb;
1556
1557 n = 1;
1558 for (auto s : fft_size) { n *= s; }
1559 howmany = ncomp;
1560
1561#if defined(AMREX_USE_GPU)
1562 Key key = {fft_size.template expand<3>(), ncomp, D, kind};
1563 if (cache) {
1564 VendorPlan* cached_plan = nullptr;
1565 if constexpr (std::is_same_v<float,T>) {
1566 cached_plan = detail::get_vendor_plan_f(key);
1567 } else {
1568 cached_plan = detail::get_vendor_plan_d(key);
1569 }
1570 if (cached_plan) {
1571 plan = *cached_plan;
1572 return;
1573 }
1574 }
1575#else
1576 amrex::ignore_unused(cache);
1577#endif
1578
1579 int len[M];
1580 for (int i = 0; i < M; ++i) {
1581 len[i] = fft_size[M-1-i];
1582 }
1583
1584 int nc = fft_size[0]/2+1;
1585 for (int i = 1; i < M; ++i) {
1586 nc *= fft_size[i];
1587 }
1588
1589#if defined(AMREX_USE_CUDA)
1590
1591 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
1592 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
1593 cufftType type;
1594 int n_in, n_out;
1595 if constexpr (D == Direction::forward) {
1596 type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
1597 n_in = n;
1598 n_out = nc;
1599 } else {
1600 type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
1601 n_in = nc;
1602 n_out = n;
1603 }
1604 std::size_t work_size;
1606 (cufftMakePlanMany(plan, M, len, nullptr, 1, n_in, nullptr, 1, n_out, type, howmany, &work_size));
1607
1608#elif defined(AMREX_USE_HIP)
1609
1610 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
1611 std::size_t length[M];
1612 for (int idim = 0; idim < M; ++idim) { length[idim] = fft_size[idim]; }
1613 if constexpr (D == Direction::forward) {
1614 AMREX_ROCFFT_SAFE_CALL
1615 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1616 rocfft_transform_type_real_forward, prec, M,
1617 length, howmany, nullptr));
1618 } else {
1619 AMREX_ROCFFT_SAFE_CALL
1620 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1621 rocfft_transform_type_real_inverse, prec, M,
1622 length, howmany, nullptr));
1623 }
1624
1625#elif defined(AMREX_USE_SYCL)
1626
1627 mkl_desc_r* pp;
1628 if (M == 1) {
1629 pp = new mkl_desc_r(fft_size[0]);
1630 } else {
1631 std::vector<std::int64_t> len64(M);
1632 for (int idim = 0; idim < M; ++idim) {
1633 len64[idim] = len[idim];
1634 }
1635 pp = new mkl_desc_r(len64);
1636 }
1637#ifndef AMREX_USE_MKL_DFTI_2024
1638 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
1639 oneapi::mkl::dft::config_value::NOT_INPLACE);
1640#else
1641 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
1642#endif
1643 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
1644 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
1645 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
1646 std::vector<std::int64_t> strides(M+1);
1647 strides[0] = 0;
1648 strides[M] = 1;
1649 for (int i = M-1; i >= 1; --i) {
1650 strides[i] = strides[i+1] * fft_size[M-1-i];
1651 }
1652
1653#ifndef AMREX_USE_MKL_DFTI_2024
1654 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
1655 // Do not set BWD_STRIDES
1656#else
1657 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
1658 // Do not set BWD_STRIDES
1659#endif
1660 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
1661 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
1662 detail::assert_no_external_stream();
1663 pp->commit(amrex::Gpu::Device::streamQueue());
1664 plan = pp;
1665
1666#else /* FFTW */
1667
1668 if (pf == nullptr || pb == nullptr) {
1669 defined = false;
1670 return;
1671 }
1672
1673 if constexpr (std::is_same_v<float,T>) {
1674 if constexpr (D == Direction::forward) {
1675 plan = fftwf_plan_many_dft_r2c
1676 (M, len, howmany, (float*)pf, nullptr, 1, n, (fftwf_complex*)pb, nullptr, 1, nc,
1677 FFTW_ESTIMATE);
1678 } else {
1679 plan = fftwf_plan_many_dft_c2r
1680 (M, len, howmany, (fftwf_complex*)pb, nullptr, 1, nc, (float*)pf, nullptr, 1, n,
1681 FFTW_ESTIMATE);
1682 }
1683 } else {
1684 if constexpr (D == Direction::forward) {
1685 plan = fftw_plan_many_dft_r2c
1686 (M, len, howmany, (double*)pf, nullptr, 1, n, (fftw_complex*)pb, nullptr, 1, nc,
1687 FFTW_ESTIMATE);
1688 } else {
1689 plan = fftw_plan_many_dft_c2r
1690 (M, len, howmany, (fftw_complex*)pb, nullptr, 1, nc, (double*)pf, nullptr, 1, n,
1691 FFTW_ESTIMATE);
1692 }
1693 }
1694#endif
1695
1696#if defined(AMREX_USE_GPU)
1697 if (cache) {
1698 if constexpr (std::is_same_v<float,T>) {
1699 detail::add_vendor_plan_f(key, plan);
1700 } else {
1701 detail::add_vendor_plan_d(key, plan);
1702 }
1703 }
1704#endif
1705}
1706
1708namespace detail
1709{
1710 DistributionMapping make_iota_distromap (Long n);
1711
1712 template <typename FA>
1713 typename FA::FABType::value_type * get_fab (FA& fa)
1714 {
1715 auto myproc = ParallelContext::MyProcSub();
1716 if (myproc < fa.size()) {
1717 return fa.fabPtr(myproc);
1718 } else {
1719 return nullptr;
1720 }
1721 }
1722
1723 template <typename FA1, typename FA2>
1724 std::unique_ptr<char,DataDeleter> make_mfs_share (FA1& fa1, FA2& fa2)
1725 {
1726 bool not_same_fa = true;
1727 if constexpr (std::is_same_v<FA1,FA2>) {
1728 not_same_fa = (&fa1 != &fa2);
1729 }
1730 using FAB1 = typename FA1::FABType::value_type;
1731 using FAB2 = typename FA2::FABType::value_type;
1732 using T1 = typename FAB1::value_type;
1733 using T2 = typename FAB2::value_type;
1734 auto myproc = ParallelContext::MyProcSub();
1735 bool alloc_1 = (myproc < fa1.size());
1736 bool alloc_2 = (myproc < fa2.size()) && not_same_fa;
1737 void* p = nullptr;
1738 if (alloc_1 && alloc_2) {
1739 Box const& box1 = fa1.fabbox(myproc);
1740 Box const& box2 = fa2.fabbox(myproc);
1741 int ncomp1 = fa1.nComp();
1742 int ncomp2 = fa2.nComp();
1743 p = The_Arena()->alloc(std::max(sizeof(T1)*box1.numPts()*ncomp1,
1744 sizeof(T2)*box2.numPts()*ncomp2));
1745 fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
1746 fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
1747 } else if (alloc_1) {
1748 Box const& box1 = fa1.fabbox(myproc);
1749 int ncomp1 = fa1.nComp();
1750 p = The_Arena()->alloc(sizeof(T1)*box1.numPts()*ncomp1);
1751 fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
1752 } else if (alloc_2) {
1753 Box const& box2 = fa2.fabbox(myproc);
1754 int ncomp2 = fa2.nComp();
1755 p = The_Arena()->alloc(sizeof(T2)*box2.numPts()*ncomp2);
1756 fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
1757 } else {
1758 return nullptr;
1759 }
1760 return std::unique_ptr<char,DataDeleter>((char*)p, DataDeleter{The_Arena()});
1761 }
1762}
1764
1766
1767struct Swap01
1768{
1769 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1770 {
1771 return Dim3{.x = i.y, .y = i.x, .z = i.z};
1772 }
1773
1774 static constexpr Dim3 Inverse (Dim3 i)
1775 {
1776 return Dim3{.x = i.y, .y = i.x, .z = i.z};
1777 }
1778
1779 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1780 {
1781 return it;
1782 }
1783
1784 static constexpr IndexType Inverse (IndexType it)
1785 {
1786 return it;
1787 }
1788};
1789
1790struct Swap02
1791{
1792 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1793 {
1794 return Dim3{.x = i.z, .y = i.y, .z = i.x};
1795 }
1796
1797 static constexpr Dim3 Inverse (Dim3 i)
1798 {
1799 return Dim3{.x = i.z, .y = i.y, .z = i.x};
1800 }
1801
1802 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1803 {
1804 return it;
1805 }
1806
1807 static constexpr IndexType Inverse (IndexType it)
1808 {
1809 return it;
1810 }
1811};
1812
1813struct RotateFwd
1814{
1815 // dest -> src: (x,y,z) -> (y,z,x)
1816 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1817 {
1818 return Dim3{.x = i.y, .y = i.z, .z = i.x};
1819 }
1820
1821 // src -> dest: (x,y,z) -> (z,x,y)
1822 static constexpr Dim3 Inverse (Dim3 i)
1823 {
1824 return Dim3{.x = i.z, .y = i.x, .z = i.y};
1825 }
1826
1827 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1828 {
1829 return it;
1830 }
1831
1832 static constexpr IndexType Inverse (IndexType it)
1833 {
1834 return it;
1835 }
1836};
1837
1838struct RotateBwd
1839{
1840 // dest -> src: (x,y,z) -> (z,x,y)
1841 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1842 {
1843 return Dim3{.x = i.z, .y = i.x, .z = i.y};
1844 }
1845
1846 // src -> dest: (x,y,z) -> (y,z,x)
1847 static constexpr Dim3 Inverse (Dim3 i)
1848 {
1849 return Dim3{.x = i.y, .y = i.z, .z = i.x};
1850 }
1851
1852 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1853 {
1854 return it;
1855 }
1856
1857 static constexpr IndexType Inverse (IndexType it)
1858 {
1859 return it;
1860 }
1861};
1862
1864
1866namespace detail
1867{
1868 struct SubHelper
1869 {
1870 explicit SubHelper (Box const& domain);
1871
1872 [[nodiscard]] Box make_box (Box const& box) const;
1873
1874 [[nodiscard]] Periodicity make_periodicity (Periodicity const& period) const;
1875
1876 [[nodiscard]] bool ghost_safe (IntVect const& ng) const;
1877
1878 // This rearranges the order.
1879 [[nodiscard]] IntVect make_iv (IntVect const& iv) const;
1880
1881 // This keeps the order, but zero out the values in the hidden dimension.
1882 [[nodiscard]] IntVect make_safe_ghost (IntVect const& ng) const;
1883
1884 [[nodiscard]] BoxArray inverse_boxarray (BoxArray const& ba) const;
1885
1886 [[nodiscard]] IntVect inverse_order (IntVect const& order) const;
1887
1888 template <typename T>
1889 [[nodiscard]] T make_array (T const& a) const
1890 {
1891#if (AMREX_SPACEDIM == 1)
1893 return a;
1894#elif (AMREX_SPACEDIM == 2)
1895 if (m_case == case_1n) {
1896 return T{a[1],a[0]};
1897 } else {
1898 return a;
1899 }
1900#else
1901 if (m_case == case_11n) {
1902 return T{a[2],a[0],a[1]};
1903 } else if (m_case == case_1n1) {
1904 return T{a[1],a[0],a[2]};
1905 } else if (m_case == case_1nn) {
1906 return T{a[1],a[2],a[0]};
1907 } else if (m_case == case_n1n) {
1908 return T{a[0],a[2],a[1]};
1909 } else {
1910 return a;
1911 }
1912#endif
1913 }
1914
1915 [[nodiscard]] GpuArray<int,3> xyz_order () const;
1916
1917 template <typename FA>
1918 FA make_alias_mf (FA const& mf)
1919 {
1920 BoxList bl = mf.boxArray().boxList();
1921 for (auto& b : bl) {
1922 b = make_box(b);
1923 }
1924 auto const& ng = make_iv(mf.nGrowVect());
1925 FA submf(BoxArray(std::move(bl)), mf.DistributionMap(), mf.nComp(), ng, MFInfo{}.SetAlloc(false));
1926 using FAB = typename FA::fab_type;
1927 for (MFIter mfi(submf, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi) {
1928 submf.setFab(mfi, FAB(mfi.fabbox(), mf.nComp(), mf[mfi].dataPtr()));
1929 }
1930 return submf;
1931 }
1932
1933#if (AMREX_SPACEDIM == 2)
1934 enum Case { case_1n, case_other };
1935 int m_case = case_other;
1936#elif (AMREX_SPACEDIM == 3)
1937 enum Case { case_11n, case_1n1, case_1nn, case_n1n, case_other };
1938 int m_case = case_other;
1939#endif
1940 };
1941}
1943
1944}
1945
1946#endif
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define AMREX_ALWAYS_ASSERT(EX)
Definition AMReX_BLassert.H:50
#define AMREX_ENUM(CLASS,...)
Definition AMReX_Enum.H:209
#define AMREX_CUFFT_SAFE_CALL(call)
Definition AMReX_GpuError.H:92
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
amrex::ParmParse pp
Input file parser instance for the given namespace.
Definition AMReX_HypreIJIface.cpp:15
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
#define AMREX_D_TERM(a, b, c)
Definition AMReX_SPACE.H:172
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ IntVectND< dim > length() const noexcept
Return the length of the BoxND.
Definition AMReX_Box.H:155
__host__ __device__ const IntVectND< dim > & smallEnd() const &noexcept
Return the inclusive lower bound of the box.
Definition AMReX_Box.H:112
int CoordInt() const noexcept
Returns the CoordType as an int.
Definition AMReX_CoordSys.H:44
Calculates the distribution of FABs to MPI processes.
Definition AMReX_DistributionMapping.H:43
Rectangular problem domain geometry.
Definition AMReX_Geometry.H:75
const Box & Domain() const noexcept
Returns our rectangular domain.
Definition AMReX_Geometry.H:216
const RealBox & ProbDomain() const noexcept
Returns the problem domain.
Definition AMReX_Geometry.H:176
bool isPeriodic(int dir) const noexcept
Is the domain periodic in the specified direction?
Definition AMReX_Geometry.H:337
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:837
An Integer Vector in dim-Dimensional Space.
Definition AMReX_IntVect.H:149
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:328
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1373
Arena * The_Arena()
Definition AMReX_Arena.cpp:820
Definition AMReX_FFT_Helper.H:53
constexpr int FastNumPrimeFactors() noexcept
Return the default factor count used for selecting fast FFT lengths.
Definition AMReX_FFT_Helper.H:73
int nextFastLen(int target, int nfactors=FastNumPrimeFactors())
Return the smallest fast FFT length greater than or equal to target.
Definition AMReX_FFT_Helper.H:286
Direction
Definition AMReX_FFT_Helper.H:55
Boundary
Definition AMReX_FFT_Helper.H:59
DomainStrategy
Definition AMReX_FFT_Helper.H:57
Kind
Definition AMReX_FFT_Helper.H:61
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
__host__ __device__ std::pair< double, double > sincospi(double x)
Return sin(pi*x) and cos(pi*x) given x.
Definition AMReX_Math.H:205
@ make_alias
Definition AMReX_MakeType.H:7
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
__host__ __device__ BoxND< dim > shift(const BoxND< dim > &b, int dir, int nzones) noexcept
Return a BoxND with indices shifted by nzones in dir direction.
Definition AMReX_Box.H:1504
IndexTypeND< 3 > IndexType
IndexType is an alias for amrex::IndexTypeND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:36
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:241
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_FFT_Helper.H:83
bool twod_mode
Definition AMReX_FFT_Helper.H:94
Info & setNumProcs(int n)
Cap the number of MPI ranks used by FFT.
Definition AMReX_FFT_Helper.H:163
bool oned_mode
Definition AMReX_FFT_Helper.H:103
int batch_size
Batched FFT size. Only support in R2C, not R2X.
Definition AMReX_FFT_Helper.H:106
Info & setOpenBCPadding(bool x)
Enable or disable OpenBCSolver internal FFT padding.
Definition AMReX_FFT_Helper.H:170
Info & setOpenBCPaddingNumPrimeFactors(int n)
Set the factor count used for OpenBCSolver FFT padding.
Definition AMReX_FFT_Helper.H:177
Info & setDomainStrategy(DomainStrategy s)
Select how the domain is decomposed across MPI ranks.
Definition AMReX_FFT_Helper.H:124
DomainStrategy domain_strategy
Domain composition strategy.
Definition AMReX_FFT_Helper.H:85
int nprocs
Max number of processes to use.
Definition AMReX_FFT_Helper.H:109
int pencil_threshold
Definition AMReX_FFT_Helper.H:89
bool openbc_padding
Whether OpenBCSolver pads internal FFT lengths for better performance.
Definition AMReX_FFT_Helper.H:112
Info & setOneDMode(bool x)
Flag the degenerate 2-D mode (nx==1 or ny==1) that still batches along z.
Definition AMReX_FFT_Helper.H:149
Info & setBatchSize(int bsize)
Specify the batch size for FFT.
Definition AMReX_FFT_Helper.H:156
int openbc_padding_nfactors
Definition AMReX_FFT_Helper.H:116
Info & setPencilThreshold(int t)
Override the slab→pencil break-even threshold for the automatic strategy.
Definition AMReX_FFT_Helper.H:131
Info & setTwoDMode(bool x)
Restrict transforms to the first two dimensions (3-D problems only).
Definition AMReX_FFT_Helper.H:138
Definition AMReX_FFT_Helper.H:360
void * pf
Definition AMReX_FFT_Helper.H:395
void unpack_r2r_buffer(T *pdst, void const *pbuf) const
Collapse the spectral R2R result back into the original real layout.
Definition AMReX_FFT_Helper.H:1303
std::conditional_t< std::is_same_v< float, T >, cuComplex, cuDoubleComplex > VendorComplex
Definition AMReX_FFT_Helper.H:364
VendorPlan plan2
Definition AMReX_FFT_Helper.H:394
void init_r2c(IntVectND< M > const &fft_size, void *pbf, void *pbb, bool cache, int ncomp=1)
Initialize an M-dimensional batched real-to-complex plan.
Definition AMReX_FFT_Helper.H:1548
int n
Definition AMReX_FFT_Helper.H:387
void destroy()
Release any vendor FFT plan objects owned by this Plan.
Definition AMReX_FFT_Helper.H:414
bool defined2
Definition AMReX_FFT_Helper.H:392
void init_r2r(Box const &box, VendorComplex *pc, std::pair< Boundary, Boundary > const &bc)
Initialize a real-to-real plan that reads/writes complex storage.
Definition AMReX_FFT_Helper.H:896
void pack_r2r_buffer(void *pbuf, T const *psrc) const
Expand the real R2R input into the symmetry-extended buffer expected by CUFFT/rocFFT.
Definition AMReX_FFT_Helper.H:1065
static void free_scratch_space(void *p)
Release GPU scratch allocated via alloc_scratch_space().
Definition AMReX_FFT_Helper.H:1057
static void destroy_vendor_plan(VendorPlan plan)
Helper that destroys a vendor plan of the appropriate backend type.
Definition AMReX_FFT_Helper.H:1513
Kind get_r2r_kind(std::pair< Boundary, Boundary > const &bc)
Map boundary conditions to the Plan Kind for real-to-real transforms.
Definition AMReX_FFT_Helper.H:758
cufftHandle VendorPlan
Definition AMReX_FFT_Helper.H:362
Kind kind
Definition AMReX_FFT_Helper.H:389
void init_c2c(Box const &box, VendorComplex *p, int ncomp=1, int ndims=1)
Initialize a complex-to-complex plan across 1/2/3 dimensions.
Definition AMReX_FFT_Helper.H:590
int howmany
Definition AMReX_FFT_Helper.H:388
void * pb
Definition AMReX_FFT_Helper.H:396
void init_r2c(Box const &box, T *pr, VendorComplex *pc, bool is_2d_transform=false, int ncomp=1)
Initialize a 1-D or 2-D real-to-complex plan over the supplied Box.
Definition AMReX_FFT_Helper.H:438
void compute_r2r()
Execute the real-to-real plan, including GPU packing/unpacking.
Definition AMReX_FFT_Helper.H:1456
void compute_c2c()
Execute the complex-to-complex plan in place.
Definition AMReX_FFT_Helper.H:996
bool r2r_data_is_complex
Definition AMReX_FFT_Helper.H:390
void * alloc_scratch_space() const
Allocate GPU scratch space large enough to hold packed R2R data.
Definition AMReX_FFT_Helper.H:1040
VendorPlan plan
Definition AMReX_FFT_Helper.H:393
void compute_r2c()
Execute the previously initialized real-to-complex plan.
Definition AMReX_FFT_Helper.H:944
bool defined
Definition AMReX_FFT_Helper.H:391
void set_ptrs(void *p0, void *p1)
Register device pointers used by the forward/backward executions.
Definition AMReX_FFT_Helper.H:405
void init_r2r(Box const &box, T *p, std::pair< Boundary, Boundary > const &bc, int howmany_initval=1)
Initialize a real-to-real (cosine/sine) plan that operates on real buffers.
Definition AMReX_FFT_Helper.H:792
A host / device complex number type, because std::complex doesn't work in device code with Cuda yet.
Definition AMReX_GpuComplex.H:30