Block-Structured AMR Software Framework
Loading...
Searching...
No Matches
AMReX_FFT_Helper.H
Go to the documentation of this file.
1#ifndef AMREX_FFT_HELPER_H_
2#define AMREX_FFT_HELPER_H_
3#include <AMReX_Config.H>
4
5#include <AMReX.H>
6#include <AMReX_BLProfiler.H>
9#include <AMReX_Enum.H>
10#include <AMReX_FabArray.H>
11#include <AMReX_Gpu.H>
12#include <AMReX_GpuComplex.H>
13#include <AMReX_Math.H>
14#include <AMReX_Periodicity.H>
15
22#if defined(AMREX_USE_CUDA)
23# include <cufft.h>
24# include <cuComplex.h>
25#elif defined(AMREX_USE_HIP)
26# if __has_include(<rocfft/rocfft.h>) // ROCm 5.3+
27# include <rocfft/rocfft.h>
28# else
29# include <rocfft.h>
30# endif
31# include <hip/hip_complex.h>
32#elif defined(AMREX_USE_SYCL)
33# if __has_include(<oneapi/mkl/dft.hpp>) // oneAPI 2025.0
34# include <oneapi/mkl/dft.hpp>
35#else
36# define AMREX_USE_MKL_DFTI_2024 1
37# include <oneapi/mkl/dfti.hpp>
38# endif
39#else
40# include <fftw3.h>
41#endif
42
43#include <algorithm>
44#include <complex>
45#include <limits>
46#include <memory>
47#include <tuple>
48#include <utility>
49#include <variant>
50
51namespace amrex::FFT
52{
53
//! Transform direction attached to a plan.  `both` and `none` are presumably
//! used for plans covering both directions or no transform at all -- confirm
//! against the enum's uses elsewhere in amrex::FFT.
enum struct Direction { forward, backward, both, none };
55
57
59
62
//! User-tunable options controlling FFT plan setup and data distribution.
//! All setters return *this so calls can be chained.
//
// NOTE(review): some member declarations (e.g. `pencil_threshold`, used by
// setPencilThreshold below) and their documentation blocks are not visible
// in this rendering; confirm the full member list against upstream.
struct Info
{
    //! Flag selecting the two-dimensional mode (see setTwoDMode).
    bool twod_mode = false;

    //! Flag selecting the one-dimensional mode (see setOneDMode).
    bool oned_mode = false;

    //! Number of transforms batched together.
    int batch_size = 1;

    //! Upper bound on the number of processes used; defaults to "no limit".
    int nprocs = std::numeric_limits<int>::max();

    //! Set the threshold used when deciding on a pencil decomposition.
    Info& setPencilThreshold (int t) { pencil_threshold = t; return *this; }
    //! Enable/disable the two-dimensional mode.
    Info& setTwoDMode (bool x) { twod_mode = x; return *this; }
    //! Enable/disable the one-dimensional mode.
    Info& setOneDMode (bool x) { oned_mode = x; return *this; }
    //! Set the number of batched transforms.
    Info& setBatchSize (int bsize) { batch_size = bsize; return *this; }
    //! Set the maximum number of processes to use.
    Info& setNumProcs (int n) { nprocs = n; return *this; }
};
139
#ifdef AMREX_USE_HIP
//! Execute a rocFFT plan on the given input/output buffers.  Declaration
//! only; the definition lives in a separate translation unit.
namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
#endif
145
#ifdef AMREX_USE_SYCL
namespace detail
{
//! Guard: the SYCL FFT path requires AMReX's own stream/queue.
// NOTE(review): the assertion call carrying this message is truncated in this
// rendering (only the message string survives); confirm the exact macro used
// against upstream.
inline void assert_no_external_stream ()
{
        "SYCL FFT does not support external GPU streams.");
}

//! Run a committed oneMKL DFT descriptor.
//!
//! \tparam T          real scalar type used to type the external workspace
//! \tparam direction  Direction::forward or Direction::backward
//! \param plan  committed oneMKL descriptor
//! \param in    input buffer
//! \param out   output buffer
// NOTE(review): a line between the `if` and the first `if constexpr` is
// truncated in this rendering (possibly an `amrex::ignore_unused(in);`);
// confirm against upstream.
template <typename T, Direction direction, typename P, typename TI, typename TO>
void sycl_execute (P* plan, TI* in, TO* out)
{
    assert_no_external_stream();
    // The 2024 MKL DFTI interface takes std::size_t for WORKSPACE_BYTES;
    // newer oneAPI interfaces take std::int64_t.
#ifndef AMREX_USE_MKL_DFTI_2024
    std::int64_t workspaceSize = 0;
#else
    std::size_t workspaceSize = 0;
#endif
    plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
                    &workspaceSize);
    // Descriptors in this file are committed with WORKSPACE_EXTERNAL, so
    // supply scratch space from AMReX's arena for the duration of the call.
    auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize);
    plan->set_workspace(buffer);
    sycl::event r;
    if (std::is_same_v<TI,TO>) {
        // Same value type: compute on `out` only.
        if constexpr (direction == Direction::forward) {
            r = oneapi::mkl::dft::compute_forward(*plan, out);
        } else {
            r = oneapi::mkl::dft::compute_backward(*plan, out);
        }
    } else {
        if constexpr (direction == Direction::forward) {
            r = oneapi::mkl::dft::compute_forward(*plan, in, out);
        } else {
            r = oneapi::mkl::dft::compute_backward(*plan, in, out);
        }
    }
    // Wait for completion before releasing the workspace it is using.
    r.wait();
    amrex::The_Arena()->free(buffer);
}
}
#endif
191
//! Holds a vendor FFT plan (cuFFT, rocFFT, oneMKL, or FFTW) for scalar type
//! T together with the metadata needed to create and execute it.
//
// NOTE(review): member declarations referenced by methods below (e.g. `kind`,
// `plan`, `plan2`, `r2r_data_is_complex`) are not visible in this rendering;
// confirm against upstream.
template <typename T>
struct Plan
{
#if defined(AMREX_USE_CUDA)
    using VendorPlan = cufftHandle;
    using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
                                             cuComplex, cuDoubleComplex>;
#elif defined(AMREX_USE_HIP)
    using VendorPlan = rocfft_plan;
    using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
                                             float2, double2>;
#elif defined(AMREX_USE_SYCL)
    // oneMKL uses distinct descriptor types for the real and complex
    // domains, so the vendor plan is a variant over the two pointer types.
    using mkl_desc_r = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
                                                    ? oneapi::mkl::dft::precision::SINGLE
                                                    : oneapi::mkl::dft::precision::DOUBLE,
                                                    oneapi::mkl::dft::domain::REAL>;
    using mkl_desc_c = oneapi::mkl::dft::descriptor<std::is_same_v<float,T>
                                                    ? oneapi::mkl::dft::precision::SINGLE
                                                    : oneapi::mkl::dft::precision::DOUBLE,
                                                    oneapi::mkl::dft::domain::COMPLEX>;
    using VendorPlan = std::variant<mkl_desc_r*,mkl_desc_c*>;
    using VendorComplex = std::complex<T>;
#else
    using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
                                          fftwf_plan, fftw_plan>;
    using VendorComplex = std::conditional_t<std::is_same_v<float,T>,
                                             fftwf_complex, fftw_complex>;
#endif

    int n = 0;              //!< transform length (product of lengths in multi-d)
    int howmany = 0;        //!< number of batched transforms
    bool defined = false;   //!< primary plan has been created
    bool defined2 = false;  //!< secondary plan created (FFTW complex-r2r path)
    void* pf = nullptr;     //!< forward-side data pointer
    void* pb = nullptr;     //!< backward-side data pointer
231
#ifdef AMREX_USE_GPU
    //! Set the data pointers used when the plan is executed.
    //! \param p0 forward-side pointer  \param p1 backward-side pointer
    void set_ptrs (void* p0, void* p1) {
        pf = p0;
        pb = p1;
    }
#endif
244
    /**
     * Tear down the plan(s), clearing the defined flags.
     *
     * NOTE(review): in this rendering no vendor-plan destruction call is
     * visible inside either `if` before the flag is cleared; such calls were
     * presumably elided by the scraper -- confirm against upstream.
     */
    void destroy ()
    {
        if (defined) {
            defined = false;
        }
#if !defined(AMREX_USE_GPU)
        if (defined2) {
            defined2 = false;
        }
#endif
    }
261
    /**
     * Create a batched real<->complex plan over the given box.
     *
     * \tparam D  Direction::forward (r2c) or Direction::backward (c2r)
     * \param box  local index box; the first 1 or 2 dimensions are transformed
     * \param pr   real-space data
     * \param pc   complex (spectral) data
     * \param is_2d_transform  transform the first two dimensions instead of one
     * \param ncomp  number of components, multiplied into the batch count
     *
     * NOTE(review): a few lines are truncated in this rendering (e.g. the
     * AMREX_CUFFT_SAFE_CALL macro names preceding the cufftMakePlanMany
     * calls, and presumably a `kind = ...` assignment); confirm upstream.
     */
    template <Direction D>
    void init_r2c (Box const& box, T* pr, VendorComplex* pc, bool is_2d_transform = false, int ncomp = 1)
    {
        static_assert(D == Direction::forward || D == Direction::backward);

        int rank = is_2d_transform ? 2 : 1;

        defined = true;
        pf = (void*)pr;
        pb = (void*)pc;

        int len[2] = {};
        if (rank == 1) {
            len[0] = box.length(0);
            len[1] = box.length(0); // Not used except for HIP. Yes it's `(0)`.
        } else {
            len[0] = box.length(1); // Most FFT libraries assume row-major ordering
            len[1] = box.length(0); // except for rocfft
        }
        // nr/nc: distance between consecutive batched transforms in the real
        // and complex arrays (the complex side holds len/2+1 elements).
        int nr = (rank == 1) ? len[0] : len[0]*len[1];
        n = nr;
        int nc = (rank == 1) ? (len[0]/2+1) : (len[1]/2+1)*len[0];
#if (AMREX_SPACEDIM == 1)
        howmany = 1;
#else
        howmany = (rank == 1) ? AMREX_D_TERM(1, *box.length(1), *box.length(2))
                              : AMREX_D_TERM(1, *1 , *box.length(2));
#endif
        howmany *= ncomp;

#if defined(AMREX_USE_CUDA)

        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
        // Disable auto-allocation; a work area is attached at execute time.
        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
        std::size_t work_size;
        if constexpr (D == Direction::forward) {
            cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
            (cufftMakePlanMany(plan, rank, len, nullptr, 1, nr, nullptr, 1, nc, fwd_type, howmany, &work_size));
        } else {
            cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
            (cufftMakePlanMany(plan, rank, len, nullptr, 1, nc, nullptr, 1, nr, bwd_type, howmany, &work_size));
        }

#elif defined(AMREX_USE_HIP)

        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
        // switch to column-major ordering
        std::size_t length[2] = {std::size_t(len[1]), std::size_t(len[0])};
        if constexpr (D == Direction::forward) {
            AMREX_ROCFFT_SAFE_CALL
                (rocfft_plan_create(&plan, rocfft_placement_notinplace,
                                    rocfft_transform_type_real_forward, prec, rank,
                                    length, howmany, nullptr));
        } else {
            AMREX_ROCFFT_SAFE_CALL
                (rocfft_plan_create(&plan, rocfft_placement_notinplace,
                                    rocfft_transform_type_real_inverse, prec, rank,
                                    length, howmany, nullptr));
        }

#elif defined(AMREX_USE_SYCL)

        mkl_desc_r* pp;
        if (rank == 1) {
            pp = new mkl_desc_r(len[0]);
        } else {
            pp = new mkl_desc_r({std::int64_t(len[0]), std::int64_t(len[1])});
        }
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
                      oneapi::mkl::dft::config_value::NOT_INPLACE);
#else
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
#endif
        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nr);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
        // Row-major strides: strides[0] is the offset, last stride is 1.
        std::vector<std::int64_t> strides;
        strides.push_back(0);
        if (rank == 2) { strides.push_back(len[1]); }
        strides.push_back(1);
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
        // Do not set BWD_STRIDES
#else
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
        // Do not set BWD_STRIDES
#endif
        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
        detail::assert_no_external_stream();
        pp->commit(amrex::Gpu::Device::streamQueue());
        plan = pp;

#else /* FFTW */

        if constexpr (std::is_same_v<float,T>) {
            if constexpr (D == Direction::forward) {
                plan = fftwf_plan_many_dft_r2c
                    (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            } else {
                plan = fftwf_plan_many_dft_c2r
                    (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            }
        } else {
            if constexpr (D == Direction::forward) {
                plan = fftw_plan_many_dft_r2c
                    (rank, len, howmany, pr, nullptr, 1, nr, pc, nullptr, 1, nc,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            } else {
                plan = fftw_plan_many_dft_c2r
                    (rank, len, howmany, pc, nullptr, 1, nc, pr, nullptr, 1, nr,
                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
            }
        }
#endif
    }
395
    //! Overload planning an r2c transform from raw sizes rather than a Box.
    //! Declaration only; the definition is not in this header section.
    template <Direction D, int M>
    void init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache, int ncomp = 1);
407
    /**
     * Create a batched in-place complex-to-complex plan over the given box.
     *
     * \tparam D  Direction::forward or Direction::backward
     * \param box    local index box
     * \param p      complex data, transformed in place
     * \param ncomp  number of components (multiplies the batch count)
     * \param ndims  number of leading dimensions to transform (1, 2 or 3)
     *
     * NOTE(review): a few lines are truncated in this rendering (e.g. the
     * AMREX_CUFFT_SAFE_CALL macro name preceding cufftMakePlanMany, and
     * presumably a `kind = ...` assignment); confirm against upstream.
     */
    template <Direction D>
    void init_c2c (Box const& box, VendorComplex* p, int ncomp = 1, int ndims = 1)
    {
        static_assert(D == Direction::forward || D == Direction::backward);

        defined = true;
        pf = (void*)p;
        pb = (void*)p;

        int len[3] = {};

        if (ndims == 1) {
            n = box.length(0);
            howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));
            howmany *= ncomp;
            len[0] = box.length(0);
        }
#if (AMREX_SPACEDIM >= 2)
        else if (ndims == 2) {
            n = box.length(0) * box.length(1);
#if (AMREX_SPACEDIM == 2)
            howmany = ncomp;
#else
            howmany = box.length(2) * ncomp;
#endif
            // Lengths are reversed into row-major order for the libraries.
            len[0] = box.length(1);
            len[1] = box.length(0);
        }
#if (AMREX_SPACEDIM == 3)
        else if (ndims == 3) {
            n = box.length(0) * box.length(1) * box.length(2);
            howmany = ncomp;
            len[0] = box.length(2);
            len[1] = box.length(1);
            len[2] = box.length(0);
        }
#endif
#endif

#if defined(AMREX_USE_CUDA)
        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));

        cufftType t = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
        std::size_t work_size;
        (cufftMakePlanMany(plan, ndims, len, nullptr, 1, n, nullptr, 1, n, t, howmany, &work_size));

#elif defined(AMREX_USE_HIP)

        auto prec = std::is_same_v<float,T> ? rocfft_precision_single
                                            : rocfft_precision_double;
        auto dir= (D == Direction::forward) ? rocfft_transform_type_complex_forward
                                            : rocfft_transform_type_complex_inverse;
        // rocfft expects column-major ordering, so reverse the lengths.
        std::size_t length[3];
        if (ndims == 1) {
            length[0] = len[0];
        } else if (ndims == 2) {
            length[0] = len[1];
            length[1] = len[0];
        } else {
            length[0] = len[2];
            length[1] = len[1];
            length[2] = len[0];
        }
        AMREX_ROCFFT_SAFE_CALL
            (rocfft_plan_create(&plan, rocfft_placement_inplace, dir, prec, ndims,
                                length, howmany, nullptr));

#elif defined(AMREX_USE_SYCL)

        mkl_desc_c* pp;
        if (ndims == 1) {
            pp = new mkl_desc_c(n);
        } else if (ndims == 2) {
            pp = new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1])});
        } else {
            pp = new mkl_desc_c({std::int64_t(len[0]), std::int64_t(len[1]), std::int64_t(len[2])});
        }
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
                      oneapi::mkl::dft::config_value::INPLACE);
#else
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
#endif
        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
        // Row-major strides: strides[0] is the offset, last stride is 1.
        std::vector<std::int64_t> strides(ndims+1);
        strides[0] = 0;
        strides[ndims] = 1;
        for (int i = ndims-1; i >= 1; --i) {
            strides[i] = strides[i+1] * len[i];
        }
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
#else
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
#endif
        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
        detail::assert_no_external_stream();
        pp->commit(amrex::Gpu::Device::streamQueue());
        plan = pp;

#else /* FFTW */

        if constexpr (std::is_same_v<float,T>) {
            if constexpr (D == Direction::forward) {
                plan = fftwf_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, -1,
                     FFTW_ESTIMATE);
            } else {
                plan = fftwf_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, +1,
                     FFTW_ESTIMATE);
            }
        } else {
            if constexpr (D == Direction::forward) {
                plan = fftw_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, -1,
                     FFTW_ESTIMATE);
            } else {
                plan = fftw_plan_many_dft
                    (ndims, len, howmany, p, nullptr, 1, n, p, nullptr, 1, n, +1,
                     FFTW_ESTIMATE);
            }
        }
#endif
    }
556
557#ifndef AMREX_USE_GPU
558 template <Direction D>
559 fftw_r2r_kind get_fftw_kind (std::pair<Boundary,Boundary> const& bc)
560 {
561 if (bc.first == Boundary::even && bc.second == Boundary::even)
562 {
563 return (D == Direction::forward) ? FFTW_REDFT10 : FFTW_REDFT01;
564 }
565 else if (bc.first == Boundary::even && bc.second == Boundary::odd)
566 {
567 return FFTW_REDFT11;
568 }
569 else if (bc.first == Boundary::odd && bc.second == Boundary::even)
570 {
571 return FFTW_RODFT11;
572 }
573 else if (bc.first == Boundary::odd && bc.second == Boundary::odd)
574 {
575 return (D == Direction::forward) ? FFTW_RODFT10 : FFTW_RODFT01;
576 }
577 else {
578 amrex::Abort("FFT: unsupported BC");
579 return fftw_r2r_kind{};
580 }
581
582 }
583#endif
584
    /**
     * Map a pair of boundary conditions onto the internal r2r transform Kind.
     *
     * NOTE(review): the return statements for the even/even and odd/odd
     * branches are truncated in this rendering (they presumably select a
     * direction-dependent Kind such as r2r_ee_f/r2r_ee_b and
     * r2r_oo_f/r2r_oo_b); confirm against upstream.
     */
    template <Direction D>
    Kind get_r2r_kind (std::pair<Boundary,Boundary> const& bc)
    {
        if (bc.first == Boundary::even && bc.second == Boundary::even)
        {
        }
        else if (bc.first == Boundary::even && bc.second == Boundary::odd)
        {
            return Kind::r2r_eo;
        }
        else if (bc.first == Boundary::odd && bc.second == Boundary::even)
        {
            return Kind::r2r_oe;
        }
        else if (bc.first == Boundary::odd && bc.second == Boundary::odd)
        {
        }
        else {
            amrex::Abort("FFT: unsupported BC");
            return Kind::none;
        }

    }
616
    /**
     * Plan a batched 1D real-to-real (DCT/DST) transform along the first
     * dimension of the box.
     *
     * On GPU back ends the r2r transform is realized as a real-to-complex
     * FFT of an extended sequence of length nex (2n or 4n, depending on the
     * boundary pair and direction); pack_r2r_buffer builds that sequence and
     * unpack_r2r_buffer extracts the result.  The FFTW build plans the r2r
     * transform directly.
     *
     * \param box  local index box; length(0) is the transform size
     * \param p    data pointer
     * \param bc   boundary-condition pair selecting the DCT/DST variant
     * \param howmany_initval  extra batch multiplier (2 for complex data)
     *
     * NOTE(review): the AMREX_CUFFT_SAFE_CALL macro name preceding
     * cufftMakePlanMany is truncated in this rendering; confirm upstream.
     */
    template <Direction D>
    void init_r2r (Box const& box, T* p, std::pair<Boundary,Boundary> const& bc,
                   int howmany_initval = 1)
    {
        static_assert(D == Direction::forward || D == Direction::backward);

        kind = get_r2r_kind<D>(bc);
        defined = true;
        pf = (void*)p;
        pb = (void*)p;

        n = box.length(0);
        howmany = AMREX_D_TERM(howmany_initval, *box.length(1), *box.length(2));

#if defined(AMREX_USE_GPU)
        // Length of the extended sequence whose r2c FFT yields the requested
        // real transform.
        int nex=0;
        if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
            Direction::forward == D) {
            nex = 2*n;
        } else if (bc.first == Boundary::odd && bc.second == Boundary::odd &&
                   Direction::backward == D) {
            nex = 4*n;
        } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
                   Direction::forward == D) {
            nex = 2*n;
        } else if (bc.first == Boundary::even && bc.second == Boundary::even &&
                   Direction::backward == D) {
            nex = 4*n;
        } else if ((bc.first == Boundary::even && bc.second == Boundary::odd) ||
                   (bc.first == Boundary::odd && bc.second == Boundary::even)) {
            nex = 4*n;
        } else {
            amrex::Abort("FFT: unsupported BC");
        }
        int nc = (nex/2) + 1; // complex elements per extended row

#if defined (AMREX_USE_CUDA)

        AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
        AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
        cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
        std::size_t work_size;
        (cufftMakePlanMany(plan, 1, &nex, nullptr, 1, nc*2, nullptr, 1, nc, fwd_type, howmany, &work_size));

#elif defined(AMREX_USE_HIP)

        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
        const std::size_t length = nex;
        AMREX_ROCFFT_SAFE_CALL
            (rocfft_plan_create(&plan, rocfft_placement_inplace,
                                rocfft_transform_type_real_forward, prec, 1,
                                &length, howmany, nullptr));

#elif defined(AMREX_USE_SYCL)

        auto* pp = new mkl_desc_r(nex);
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
                      oneapi::mkl::dft::config_value::INPLACE);
#else
        pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
#endif
        pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
        pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, nc*2);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
        std::vector<std::int64_t> strides = {0,1};
#ifndef AMREX_USE_MKL_DFTI_2024
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
#else
        pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
        pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
#endif
        pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
        detail::assert_no_external_stream();
        pp->commit(amrex::Gpu::Device::streamQueue());
        plan = pp;

#endif

#else /* FFTW */
        auto fftw_kind = get_fftw_kind<D>(bc);
        if constexpr (std::is_same_v<float,T>) {
            plan = fftwf_plan_many_r2r
                (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
                 FFTW_ESTIMATE);
        } else {
            plan = fftw_plan_many_r2r
                (1, &n, howmany, p, nullptr, 1, n, p, nullptr, 1, n, &fftw_kind,
                 FFTW_ESTIMATE);
        }
#endif
    }
721
    /**
     * Plan an r2r transform over data stored as the real and imaginary parts
     * of a complex array.
     *
     * On GPU this defers to the real-pointer overload with a doubled batch
     * multiplier and records that the data are complex.  In the FFTW build
     * two stride-2 plans are created: `plan` transforms the real parts and
     * `plan2` the imaginary parts (p and p+1, stride 2, distance n*2).
     */
    template <Direction D>
    void init_r2r (Box const& box, VendorComplex* pc,
                   std::pair<Boundary,Boundary> const& bc)
    {
        static_assert(D == Direction::forward || D == Direction::backward);

        auto* p = (T*)pc;

#if defined(AMREX_USE_GPU)

        init_r2r<D>(box, p, bc, 2);
        r2r_data_is_complex = true;

#else

        kind = get_r2r_kind<D>(bc);
        defined = true;
        pf = (void*)p;
        pb = (void*)p;

        n = box.length(0);
        howmany = AMREX_D_TERM(1, *box.length(1), *box.length(2));

        defined2 = true;
        auto fftw_kind = get_fftw_kind<D>(bc);
        if constexpr (std::is_same_v<float,T>) {
            plan = fftwf_plan_many_r2r
                (1, &n, howmany, p, nullptr, 2, n*2, p, nullptr, 2, n*2, &fftw_kind,
                 FFTW_ESTIMATE);
            plan2 = fftwf_plan_many_r2r
                (1, &n, howmany, p+1, nullptr, 2, n*2, p+1, nullptr, 2, n*2, &fftw_kind,
                 FFTW_ESTIMATE);
        } else {
            plan = fftw_plan_many_r2r
                (1, &n, howmany, p, nullptr, 2, n*2, p, nullptr, 2, n*2, &fftw_kind,
                 FFTW_ESTIMATE);
            plan2 = fftw_plan_many_r2r
                (1, &n, howmany, p+1, nullptr, 2, n*2, p+1, nullptr, 2, n*2, &fftw_kind,
                 FFTW_ESTIMATE);
        }
#endif
    }
771
    //! Execute the real<->complex plan (forward: pf -> pb; backward: pb -> pf).
    // NOTE(review): the function's declaration line is truncated in this
    // rendering; given the cufftExecR2C/C2R calls below it is presumably
    // `void compute_r2c ()`.  A line between the exec calls and the work-area
    // free is also elided (possibly a stream synchronization).  Confirm both
    // against upstream.
    template <Direction D>
    {
        static_assert(D == Direction::forward || D == Direction::backward);
        if (!defined) { return; }

        // Input/output types depend on direction: forward reads real data at
        // pf and writes complex data at pb; backward is the reverse.
        using TI = std::conditional_t<(D == Direction::forward), T, VendorComplex>;
        using TO = std::conditional_t<(D == Direction::backward), T, VendorComplex>;
        auto* pi = (TI*)((D == Direction::forward) ? pf : pb);
        auto* po = (TO*)((D == Direction::forward) ? pb : pf);

#if defined(AMREX_USE_CUDA)
        AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));

        // Auto-allocation was disabled at plan creation, so attach a work
        // area for this execution and release it afterwards.
        std::size_t work_size = 0;
        AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));

        auto* work_area = The_Arena()->alloc(work_size);
        AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));

        if constexpr (D == Direction::forward) {
            if constexpr (std::is_same_v<float,T>) {
                AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, pi, po));
            } else {
                AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, pi, po));
            }
        } else {
            if constexpr (std::is_same_v<float,T>) {
                AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan, pi, po));
            } else {
                AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan, pi, po));
            }
        }
        The_Arena()->free(work_area);
#elif defined(AMREX_USE_HIP)
        detail::hip_execute(plan, (void**)&pi, (void**)&po);
#elif defined(AMREX_USE_SYCL)
        detail::sycl_execute<T,D>(std::get<0>(plan), pi, po);
#else
        if constexpr (std::is_same_v<float,T>) {
            fftwf_execute(plan);
        } else {
            fftw_execute(plan);
        }
#endif
    }
825
    //! Execute the in-place complex<->complex plan on the data at pf.
    // NOTE(review): the function's declaration line is truncated in this
    // rendering; given the cufftExecC2C/Z2Z calls below it is presumably
    // `void compute_c2c ()`.  A line between the exec calls and the work-area
    // free is also elided.  Confirm both against upstream.
    template <Direction D>
    {
        static_assert(D == Direction::forward || D == Direction::backward);
        if (!defined) { return; }

        auto* p = (VendorComplex*)pf;

#if defined(AMREX_USE_CUDA)
        AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));

        // Auto-allocation was disabled at plan creation, so attach a work
        // area for this execution and release it afterwards.
        std::size_t work_size = 0;
        AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));

        auto* work_area = The_Arena()->alloc(work_size);
        AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));

        auto dir = (D == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
        if constexpr (std::is_same_v<float,T>) {
            AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan, p, p, dir));
        } else {
            AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan, p, p, dir));
        }
        The_Arena()->free(work_area);
#elif defined(AMREX_USE_HIP)
        detail::hip_execute(plan, (void**)&p, (void**)&p);
#elif defined(AMREX_USE_SYCL)
        detail::sycl_execute<T,D>(std::get<1>(plan), p, p);
#else
        if constexpr (std::is_same_v<float,T>) {
            fftwf_execute(plan);
        } else {
            fftw_execute(plan);
        }
#endif
    }
867
868#ifdef AMREX_USE_GPU
    /**
     * Allocate arena scratch space large enough to hold the complex spectrum
     * of the extended sequence used by the GPU r2r implementation
     * (nc complex values per batch).
     *
     * NOTE(review): the second `else if` condition is truncated in this
     * rendering (it presumably also covers the eo/oe kinds); confirm
     * against upstream.
     */
    [[nodiscard]] void* alloc_scratch_space () const
    {
        int nc = 0;
        if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {
            nc = n + 1;
        } else if (kind == Kind::r2r_oo_b || kind == Kind::r2r_ee_b ||
            nc = 2*n+1;
        } else {
            amrex::Abort("FFT: alloc_scratch_space: unsupported kind");
        }
        return The_Arena()->alloc(sizeof(GpuComplex<T>)*nc*howmany);
    }
887
    //! Release scratch space obtained from alloc_scratch_space().
    static void free_scratch_space (void* p) { The_Arena()->free(p); }
892
    /**
     * Expand the r2r input data into the extended sequence whose r2c FFT
     * yields the requested DCT/DST.
     *
     * For each batch the n input values are mirrored and/or negated into a
     * length-nex (2n or 4n) sequence, written with an output stride of
     * (n+1)*2 or (2n+1)*2 reals, i.e. nc complex elements per batch (matching
     * the plan's distances set in init_r2r).  The complex-data variants
     * de-interleave real and imaginary parts into separate batches.
     *
     * NOTE(review): the `if (r2r_data_is_complex) {` guard lines of the first
     * three kind-branches are truncated in this rendering (the matching
     * `} else {` bodies are present, and the later branches show the intact
     * guard); confirm against upstream.
     */
    void pack_r2r_buffer (void* pbuf, T const* psrc) const
    {
        auto* pdst = (T*) pbuf;
        if (kind == Kind::r2r_oo_f || kind == Kind::r2r_ee_f) {
            // 2n extension: mirror the input, negated for the DST case.
            T sign = (kind == Kind::r2r_oo_f) ? T(-1) : T(1);
            int ostride = (n+1)*2;
            int istride = n;
            int nex = 2*n;
            int norig = n;
            Long nelems = Long(nex)*howmany;
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto* po = pdst + (2*batch+ir)*ostride + i;
                        auto const* pi = psrc + 2*batch*istride + ir;
                        if (i < norig) {
                            *po = pi[i*2];
                        } else {
                            *po = sign * pi[(2*norig-1-i)*2];
                        }
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    auto* po = pdst + batch*ostride + i;
                    auto const* pi = psrc + batch*istride;
                    if (i < norig) {
                        *po = pi[i];
                    } else {
                        *po = sign * pi[2*norig-1-i];
                    }
                });
            }
        } else if (kind == Kind::r2r_oo_b) {
            // 4n odd extension with zeros at the symmetry points.
            int ostride = (2*n+1)*2;
            int istride = n;
            int nex = 4*n;
            int norig = n;
            Long nelems = Long(nex)*howmany;
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto* po = pdst + (2*batch+ir)*ostride + i;
                        auto const* pi = psrc + 2*batch*istride + ir;
                        if (i < norig) {
                            *po = pi[i*2];
                        } else if (i < (2*norig-1)) {
                            *po = pi[(2*norig-2-i)*2];
                        } else if (i == (2*norig-1)) {
                            *po = T(0);
                        } else if (i < (3*norig)) {
                            *po = -pi[(i-2*norig)*2];
                        } else if (i < (4*norig-1)) {
                            *po = -pi[(4*norig-2-i)*2];
                        } else {
                            *po = T(0);
                        }
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    auto* po = pdst + batch*ostride + i;
                    auto const* pi = psrc + batch*istride;
                    if (i < norig) {
                        *po = pi[i];
                    } else if (i < (2*norig-1)) {
                        *po = pi[2*norig-2-i];
                    } else if (i == (2*norig-1)) {
                        *po = T(0);
                    } else if (i < (3*norig)) {
                        *po = -pi[i-2*norig];
                    } else if (i < (4*norig-1)) {
                        *po = -pi[4*norig-2-i];
                    } else {
                        *po = T(0);
                    }
                });
            }
        } else if (kind == Kind::r2r_ee_b) {
            // 4n even extension with zeros inserted at the symmetry points.
            int ostride = (2*n+1)*2;
            int istride = n;
            int nex = 4*n;
            int norig = n;
            Long nelems = Long(nex)*howmany;
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto* po = pdst + (2*batch+ir)*ostride + i;
                        auto const* pi = psrc + 2*batch*istride + ir;
                        if (i < norig) {
                            *po = pi[i*2];
                        } else if (i == norig) {
                            *po = T(0);
                        } else if (i < (2*norig+1)) {
                            *po = -pi[(2*norig-i)*2];
                        } else if (i < (3*norig)) {
                            *po = -pi[(i-2*norig)*2];
                        } else if (i == 3*norig) {
                            *po = T(0);
                        } else {
                            *po = pi[(4*norig-i)*2];
                        }
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    auto* po = pdst + batch*ostride + i;
                    auto const* pi = psrc + batch*istride;
                    if (i < norig) {
                        *po = pi[i];
                    } else if (i == norig) {
                        *po = T(0);
                    } else if (i < (2*norig+1)) {
                        *po = -pi[2*norig-i];
                    } else if (i < (3*norig)) {
                        *po = -pi[i-2*norig];
                    } else if (i == 3*norig) {
                        *po = T(0);
                    } else {
                        *po = pi[4*norig-i];
                    }
                });
            }
        } else if (kind == Kind::r2r_eo) {
            // 4n mixed even/odd extension.
            int ostride = (2*n+1)*2;
            int istride = n;
            int nex = 4*n;
            int norig = n;
            Long nelems = Long(nex)*howmany;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto* po = pdst + (2*batch+ir)*ostride + i;
                        auto const* pi = psrc + 2*batch*istride + ir;
                        if (i < norig) {
                            *po = pi[i*2];
                        } else if (i < (2*norig)) {
                            *po = -pi[(2*norig-1-i)*2];
                        } else if (i < (3*norig)) {
                            *po = -pi[(i-2*norig)*2];
                        } else {
                            *po = pi[(4*norig-1-i)*2];
                        }
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    auto* po = pdst + batch*ostride + i;
                    auto const* pi = psrc + batch*istride;
                    if (i < norig) {
                        *po = pi[i];
                    } else if (i < (2*norig)) {
                        *po = -pi[2*norig-1-i];
                    } else if (i < (3*norig)) {
                        *po = -pi[i-2*norig];
                    } else {
                        *po = pi[4*norig-1-i];
                    }
                });
            }
        } else if (kind == Kind::r2r_oe) {
            // 4n mixed odd/even extension.
            int ostride = (2*n+1)*2;
            int istride = n;
            int nex = 4*n;
            int norig = n;
            Long nelems = Long(nex)*howmany;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto* po = pdst + (2*batch+ir)*ostride + i;
                        auto const* pi = psrc + 2*batch*istride + ir;
                        if (i < norig) {
                            *po = pi[i*2];
                        } else if (i < (2*norig)) {
                            *po = pi[(2*norig-1-i)*2];
                        } else if (i < (3*norig)) {
                            *po = -pi[(i-2*norig)*2];
                        } else {
                            *po = -pi[(4*norig-1-i)*2];
                        }
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(nex);
                    auto i = int(ielem - batch*nex);
                    auto* po = pdst + batch*ostride + i;
                    auto const* pi = psrc + batch*istride;
                    if (i < norig) {
                        *po = pi[i];
                    } else if (i < (2*norig)) {
                        *po = pi[2*norig-1-i];
                    } else if (i < (3*norig)) {
                        *po = -pi[i-2*norig];
                    } else {
                        *po = -pi[4*norig-1-i];
                    }
                });
            }
        } else {
            amrex::Abort("FFT: pack_r2r_buffer: unsupported kind");
        }
    }
1130
    /**
     * Recover the DCT/DST coefficients from the complex r2c spectrum of the
     * extended sequence produced by pack_r2r_buffer.
     *
     * Each branch reads the appropriate spectral element per output index k,
     * applies a sin/cos twiddle factor (via Math::sincospi) and, for the
     * 4n-extension kinds, a factor of 1/2.  The complex-data variants write
     * real and imaginary parts back interleaved (stride 2 in the output).
     */
    void unpack_r2r_buffer (T* pdst, void const* pbuf) const
    {
        auto const* psrc = (GpuComplex<T> const*) pbuf;
        int norig = n;
        Long nelems = Long(norig)*howmany;
        int ostride = n;

        if (kind == Kind::r2r_oo_f) {
            // Forward DST from the 2n extension: spectrum rows hold n+1
            // complex values; element k+1 carries coefficient k.
            int istride = n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+k+1];
                        pdst[2*batch*ostride+ir+k*2] = s * yk.real() - c * yk.imag();
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k+1)/T(2*norig));
                    auto const& yk = psrc[batch*istride+k+1];
                    pdst[batch*ostride+k] = s * yk.real() - c * yk.imag();
                });
            }
        } else if (kind == Kind::r2r_oo_b) {
            // Backward DST from the 4n extension: rows hold 2n+1 complex
            // values; only the odd elements (2k+1) contribute.
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5)*(s * yk.real() - c * yk.imag());
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(2*k+1)/T(2*norig));
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5)*(s * yk.real() - c * yk.imag());
                });
            }
        } else if (kind == Kind::r2r_ee_f) {
            // Forward DCT from the 2n extension.
            int istride = n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k)/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+k];
                        pdst[2*batch*ostride+ir+k*2] = c * yk.real() + s * yk.imag();
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi(T(k)/T(2*norig));
                    auto const& yk = psrc[batch*istride+k];
                    pdst[batch*ostride+k] = c * yk.real() + s * yk.imag();
                });
            }
        } else if (kind == Kind::r2r_ee_b) {
            // Backward DCT from the 4n extension: no twiddle needed, only
            // the real part of the odd spectral elements.
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * yk.real();
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5) * yk.real();
                });
            }
        } else if (kind == Kind::r2r_eo) {
            // Mixed even/odd (DCT-IV-like) from the 4n extension.
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * (c * yk.real() + s * yk.imag());
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5) * (c * yk.real() + s * yk.imag());
                });
            }
        } else if (kind == Kind::r2r_oe) {
            // Mixed odd/even (DST-IV-like) from the 4n extension.
            int istride = 2*n+1;
            if (r2r_data_is_complex) {
                ParallelForOMP(nelems/2, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    for (int ir = 0; ir < 2; ++ir) {
                        auto const& yk = psrc[(2*batch+ir)*istride+2*k+1];
                        pdst[2*batch*ostride+ir+k*2] = T(0.5) * (s * yk.real() - c * yk.imag());
                    }
                });
            } else {
                ParallelForOMP(nelems, [=] AMREX_GPU_DEVICE (Long ielem)
                {
                    auto batch = ielem / Long(norig);
                    auto k = int(ielem - batch*norig);
                    auto [s, c] = Math::sincospi((k+T(0.5))/T(2*norig));
                    auto const& yk = psrc[batch*istride+2*k+1];
                    pdst[batch*ostride+k] = T(0.5) * (s * yk.real() - c * yk.imag());
                });
            }
        } else {
            amrex::Abort("FFT: unpack_r2r_buffer: unsupported kind");
        }
    }
1284#endif
1285
1289 template <Direction D>
1291 {
1292 static_assert(D == Direction::forward || D == Direction::backward);
1293 if (!defined) { return; }
1294
1295#if defined(AMREX_USE_GPU)
1296
1297 auto* pscratch = alloc_scratch_space();
1298
1299 pack_r2r_buffer(pscratch, (T*)((D == Direction::forward) ? pf : pb));
1300
1301#if defined(AMREX_USE_CUDA)
1302
1303 AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
1304
1305 std::size_t work_size = 0;
1306 AMREX_CUFFT_SAFE_CALL(cufftGetSize(plan, &work_size));
1307
1308 auto* work_area = The_Arena()->alloc(work_size);
1309 AMREX_CUFFT_SAFE_CALL(cufftSetWorkArea(plan, work_area));
1310
1311 if constexpr (std::is_same_v<float,T>) {
1312 AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan, (T*)pscratch, (VendorComplex*)pscratch));
1313 } else {
1314 AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan, (T*)pscratch, (VendorComplex*)pscratch));
1315 }
1316
1317#elif defined(AMREX_USE_HIP)
1318 detail::hip_execute(plan, (void**)&pscratch, (void**)&pscratch);
1319#elif defined(AMREX_USE_SYCL)
1320 detail::sycl_execute<T,Direction::forward>(std::get<0>(plan), (T*)pscratch, (VendorComplex*)pscratch);
1321#endif
1322
1323 unpack_r2r_buffer((T*)((D == Direction::forward) ? pb : pf), pscratch);
1324
1326 free_scratch_space(pscratch);
1327#if defined(AMREX_USE_CUDA)
1328 The_Arena()->free(work_area);
1329#endif
1330
1331#else /* FFTW */
1332
1333 if constexpr (std::is_same_v<float,T>) {
1334 fftwf_execute(plan);
1335 if (defined2) { fftwf_execute(plan2); }
1336 } else {
1337 fftw_execute(plan);
1338 if (defined2) { fftw_execute(plan2); }
1339 }
1340
1341#endif
1342 }
1343
1348 {
1349#if defined(AMREX_USE_CUDA)
1350 AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan));
1351#elif defined(AMREX_USE_HIP)
1352 AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan));
1353#elif defined(AMREX_USE_SYCL)
1354 std::visit([](auto&& p) { delete p; }, plan);
1355#else
1356 if constexpr (std::is_same_v<float,T>) {
1357 fftwf_destroy_plan(plan);
1358 } else {
1359 fftw_destroy_plan(plan);
1360 }
1361#endif
1362 }
1363};
1364
1366
//! Cache key for vendor plans: (size expanded to 3-D, ncomp, direction, kind).
using Key = std::tuple<IntVectND<3>,int,Direction,Kind>;
using PlanD = typename Plan<double>::VendorPlan;
using PlanF = typename Plan<float>::VendorPlan;

namespace detail {
    //! Look up a cached double/float vendor plan; nullptr on cache miss.
    PlanD* get_vendor_plan_d (Key const& key);
    PlanF* get_vendor_plan_f (Key const& key);

    //! Store a newly created vendor plan under the given key.
    void add_vendor_plan_d (Key const& key, PlanD plan);
    void add_vendor_plan_f (Key const& key, PlanF plan);
}
1379
1380template <typename T>
1381template <Direction D, int M>
1382void Plan<T>::init_r2c (IntVectND<M> const& fft_size, void* pbf, void* pbb, bool cache, int ncomp)
1383{
1384 static_assert(D == Direction::forward || D == Direction::backward);
1385
1386 kind = (D == Direction::forward) ? Kind::r2c_f : Kind::r2c_b;
1387 defined = true;
1388 pf = pbf;
1389 pb = pbb;
1390
1391 n = 1;
1392 for (auto s : fft_size) { n *= s; }
1393 howmany = ncomp;
1394
1395#if defined(AMREX_USE_GPU)
1396 Key key = {fft_size.template expand<3>(), ncomp, D, kind};
1397 if (cache) {
1398 VendorPlan* cached_plan = nullptr;
1399 if constexpr (std::is_same_v<float,T>) {
1400 cached_plan = detail::get_vendor_plan_f(key);
1401 } else {
1402 cached_plan = detail::get_vendor_plan_d(key);
1403 }
1404 if (cached_plan) {
1405 plan = *cached_plan;
1406 return;
1407 }
1408 }
1409#else
1410 amrex::ignore_unused(cache);
1411#endif
1412
1413 int len[M];
1414 for (int i = 0; i < M; ++i) {
1415 len[i] = fft_size[M-1-i];
1416 }
1417
1418 int nc = fft_size[0]/2+1;
1419 for (int i = 1; i < M; ++i) {
1420 nc *= fft_size[i];
1421 }
1422
1423#if defined(AMREX_USE_CUDA)
1424
1425 AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
1426 AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
1427 cufftType type;
1428 int n_in, n_out;
1429 if constexpr (D == Direction::forward) {
1430 type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
1431 n_in = n;
1432 n_out = nc;
1433 } else {
1434 type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
1435 n_in = nc;
1436 n_out = n;
1437 }
1438 std::size_t work_size;
1440 (cufftMakePlanMany(plan, M, len, nullptr, 1, n_in, nullptr, 1, n_out, type, howmany, &work_size));
1441
1442#elif defined(AMREX_USE_HIP)
1443
1444 auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
1445 std::size_t length[M];
1446 for (int idim = 0; idim < M; ++idim) { length[idim] = fft_size[idim]; }
1447 if constexpr (D == Direction::forward) {
1448 AMREX_ROCFFT_SAFE_CALL
1449 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1450 rocfft_transform_type_real_forward, prec, M,
1451 length, howmany, nullptr));
1452 } else {
1453 AMREX_ROCFFT_SAFE_CALL
1454 (rocfft_plan_create(&plan, rocfft_placement_notinplace,
1455 rocfft_transform_type_real_inverse, prec, M,
1456 length, howmany, nullptr));
1457 }
1458
1459#elif defined(AMREX_USE_SYCL)
1460
1461 mkl_desc_r* pp;
1462 if (M == 1) {
1463 pp = new mkl_desc_r(fft_size[0]);
1464 } else {
1465 std::vector<std::int64_t> len64(M);
1466 for (int idim = 0; idim < M; ++idim) {
1467 len64[idim] = len[idim];
1468 }
1469 pp = new mkl_desc_r(len64);
1470 }
1471#ifndef AMREX_USE_MKL_DFTI_2024
1472 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
1473 oneapi::mkl::dft::config_value::NOT_INPLACE);
1474#else
1475 pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
1476#endif
1477 pp->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, howmany);
1478 pp->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
1479 pp->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, nc);
1480 std::vector<std::int64_t> strides(M+1);
1481 strides[0] = 0;
1482 strides[M] = 1;
1483 for (int i = M-1; i >= 1; --i) {
1484 strides[i] = strides[i+1] * fft_size[M-1-i];
1485 }
1486
1487#ifndef AMREX_USE_MKL_DFTI_2024
1488 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
1489 // Do not set BWD_STRIDES
1490#else
1491 pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
1492 // Do not set BWD_STRIDES
1493#endif
1494 pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
1495 oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
1496 detail::assert_no_external_stream();
1497 pp->commit(amrex::Gpu::Device::streamQueue());
1498 plan = pp;
1499
1500#else /* FFTW */
1501
1502 if (pf == nullptr || pb == nullptr) {
1503 defined = false;
1504 return;
1505 }
1506
1507 if constexpr (std::is_same_v<float,T>) {
1508 if constexpr (D == Direction::forward) {
1509 plan = fftwf_plan_many_dft_r2c
1510 (M, len, howmany, (float*)pf, nullptr, 1, n, (fftwf_complex*)pb, nullptr, 1, nc,
1511 FFTW_ESTIMATE);
1512 } else {
1513 plan = fftwf_plan_many_dft_c2r
1514 (M, len, howmany, (fftwf_complex*)pb, nullptr, 1, nc, (float*)pf, nullptr, 1, n,
1515 FFTW_ESTIMATE);
1516 }
1517 } else {
1518 if constexpr (D == Direction::forward) {
1519 plan = fftw_plan_many_dft_r2c
1520 (M, len, howmany, (double*)pf, nullptr, 1, n, (fftw_complex*)pb, nullptr, 1, nc,
1521 FFTW_ESTIMATE);
1522 } else {
1523 plan = fftw_plan_many_dft_c2r
1524 (M, len, howmany, (fftw_complex*)pb, nullptr, 1, nc, (double*)pf, nullptr, 1, n,
1525 FFTW_ESTIMATE);
1526 }
1527 }
1528#endif
1529
1530#if defined(AMREX_USE_GPU)
1531 if (cache) {
1532 if constexpr (std::is_same_v<float,T>) {
1533 detail::add_vendor_plan_f(key, plan);
1534 } else {
1535 detail::add_vendor_plan_d(key, plan);
1536 }
1537 }
1538#endif
1539}
1540
namespace detail
{
    //! Build a DistributionMapping that assigns box i to MPI rank i, for n boxes.
    DistributionMapping make_iota_distromap (Long n);

    //! Return this rank's FAB pointer in fa, or nullptr if this rank owns no box.
    //! Assumes the one-box-per-rank layout produced by make_iota_distromap.
    template <typename FA>
    typename FA::FABType::value_type * get_fab (FA& fa)
    {
        auto myproc = ParallelContext::MyProcSub();
        if (myproc < fa.size()) {
            return fa.fabPtr(myproc);
        } else {
            return nullptr;
        }
    }

    //! Make fa1 and fa2 alias a single arena allocation on this rank.
    //! The returned unique_ptr owns the memory; the FabArrays are given
    //! non-owning FABs over it. Returns nullptr if this rank owns no box.
    template <typename FA1, typename FA2>
    std::unique_ptr<char,DataDeleter> make_mfs_share (FA1& fa1, FA2& fa2)
    {
        // When FA1 and FA2 are the same type, the caller may pass the same
        // object twice; only one FAB must be installed in that case.
        bool not_same_fa = true;
        if constexpr (std::is_same_v<FA1,FA2>) {
            not_same_fa = (&fa1 != &fa2);
        }
        using FAB1 = typename FA1::FABType::value_type;
        using FAB2 = typename FA2::FABType::value_type;
        using T1 = typename FAB1::value_type;
        using T2 = typename FAB2::value_type;
        auto myproc = ParallelContext::MyProcSub();
        bool alloc_1 = (myproc < fa1.size());
        bool alloc_2 = (myproc < fa2.size()) && not_same_fa;
        void* p = nullptr;
        if (alloc_1 && alloc_2) {
            Box const& box1 = fa1.fabbox(myproc);
            Box const& box2 = fa2.fabbox(myproc);
            int ncomp1 = fa1.nComp();
            int ncomp2 = fa2.nComp();
            // One allocation sized for the larger of the two; both FABs view it.
            p = The_Arena()->alloc(std::max(sizeof(T1)*box1.numPts()*ncomp1,
                                            sizeof(T2)*box2.numPts()*ncomp2));
            fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
            fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
        } else if (alloc_1) {
            Box const& box1 = fa1.fabbox(myproc);
            int ncomp1 = fa1.nComp();
            p = The_Arena()->alloc(sizeof(T1)*box1.numPts()*ncomp1);
            fa1.setFab(myproc, FAB1(box1, ncomp1, (T1*)p));
        } else if (alloc_2) {
            Box const& box2 = fa2.fabbox(myproc);
            int ncomp2 = fa2.nComp();
            p = The_Arena()->alloc(sizeof(T2)*box2.numPts()*ncomp2);
            fa2.setFab(myproc, FAB2(box2, ncomp2, (T2*)p));
        } else {
            return nullptr;
        }
        return std::unique_ptr<char,DataDeleter>((char*)p, DataDeleter{The_Arena()});
    }
}
1598
1600
1601struct Swap01
1602{
1603 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1604 {
1605 return {i.y, i.x, i.z};
1606 }
1607
1608 static constexpr Dim3 Inverse (Dim3 i)
1609 {
1610 return {i.y, i.x, i.z};
1611 }
1612
1613 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1614 {
1615 return it;
1616 }
1617
1618 static constexpr IndexType Inverse (IndexType it)
1619 {
1620 return it;
1621 }
1622};
1623
1624struct Swap02
1625{
1626 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1627 {
1628 return {i.z, i.y, i.x};
1629 }
1630
1631 static constexpr Dim3 Inverse (Dim3 i)
1632 {
1633 return {i.z, i.y, i.x};
1634 }
1635
1636 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1637 {
1638 return it;
1639 }
1640
1641 static constexpr IndexType Inverse (IndexType it)
1642 {
1643 return it;
1644 }
1645};
1646
1647struct RotateFwd
1648{
1649 // dest -> src: (x,y,z) -> (y,z,x)
1650 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1651 {
1652 return {i.y, i.z, i.x};
1653 }
1654
1655 // src -> dest: (x,y,z) -> (z,x,y)
1656 static constexpr Dim3 Inverse (Dim3 i)
1657 {
1658 return {i.z, i.x, i.y};
1659 }
1660
1661 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1662 {
1663 return it;
1664 }
1665
1666 static constexpr IndexType Inverse (IndexType it)
1667 {
1668 return it;
1669 }
1670};
1671
1672struct RotateBwd
1673{
1674 // dest -> src: (x,y,z) -> (z,x,y)
1675 [[nodiscard]] constexpr Dim3 operator() (Dim3 i) const noexcept
1676 {
1677 return {i.z, i.x, i.y};
1678 }
1679
1680 // src -> dest: (x,y,z) -> (y,z,x)
1681 static constexpr Dim3 Inverse (Dim3 i)
1682 {
1683 return {i.y, i.z, i.x};
1684 }
1685
1686 [[nodiscard]] constexpr IndexType operator() (IndexType it) const noexcept
1687 {
1688 return it;
1689 }
1690
1691 static constexpr IndexType Inverse (IndexType it)
1692 {
1693 return it;
1694 }
1695};
1696
1698
namespace detail
{
    //! Helper for domains with degenerate (size-1) dimensions: maps between
    //! the user's layout and a compacted layout used internally by the FFT.
    //! The active permutation is selected by m_case, set from the domain
    //! shape in the constructor.
    struct SubHelper
    {
        explicit SubHelper (Box const& domain);

        //! Remap a box into the compacted layout.
        [[nodiscard]] Box make_box (Box const& box) const;

        //! Remap periodicity flags into the compacted layout.
        [[nodiscard]] Periodicity make_periodicity (Periodicity const& period) const;

        //! Whether the given grow vector survives the remapping unharmed.
        [[nodiscard]] bool ghost_safe (IntVect const& ng) const;

        // This rearranges the order.
        [[nodiscard]] IntVect make_iv (IntVect const& iv) const;

        // This keeps the order, but zero out the values in the hidden dimension.
        [[nodiscard]] IntVect make_safe_ghost (IntVect const& ng) const;

        //! Map a BoxArray from the compacted layout back to the original one.
        [[nodiscard]] BoxArray inverse_boxarray (BoxArray const& ba) const;

        //! Map a dimension-order vector back to the original layout.
        [[nodiscard]] IntVect inverse_order (IntVect const& order) const;

        //! Apply the m_case permutation to any indexable fixed-size array.
        template <typename T>
        [[nodiscard]] T make_array (T const& a) const
        {
#if (AMREX_SPACEDIM == 1)
            // Nothing to permute in 1-D.
            return a;
#elif (AMREX_SPACEDIM == 2)
            if (m_case == case_1n) {
                return T{a[1],a[0]};
            } else {
                return a;
            }
#else
            if (m_case == case_11n) {
                return T{a[2],a[0],a[1]};
            } else if (m_case == case_1n1) {
                return T{a[1],a[0],a[2]};
            } else if (m_case == case_1nn) {
                return T{a[1],a[2],a[0]};
            } else if (m_case == case_n1n) {
                return T{a[0],a[2],a[1]};
            } else {
                return a;
            }
#endif
        }

        //! Positions of the original x/y/z dimensions in the compacted layout.
        [[nodiscard]] GpuArray<int,3> xyz_order () const;

        //! Build a non-owning alias of mf whose boxes are remapped by
        //! make_box; the FABs share mf's data pointers (no allocation).
        template <typename FA>
        FA make_alias_mf (FA const& mf)
        {
            BoxList bl = mf.boxArray().boxList();
            for (auto& b : bl) {
                b = make_box(b);
            }
            auto const& ng = make_iv(mf.nGrowVect());
            FA submf(BoxArray(std::move(bl)), mf.DistributionMap(), mf.nComp(), ng, MFInfo{}.SetAlloc(false));
            using FAB = typename FA::fab_type;
            for (MFIter mfi(submf, MFItInfo().DisableDeviceSync()); mfi.isValid(); ++mfi) {
                submf.setFab(mfi, FAB(mfi.fabbox(), mf.nComp(), mf[mfi].dataPtr()));
            }
            return submf;
        }

#if (AMREX_SPACEDIM == 2)
        // case_1n: nx == 1 (the x-dimension is hidden).
        enum Case { case_1n, case_other };
        int m_case = case_other;
#elif (AMREX_SPACEDIM == 3)
        // Naming: '1' marks a size-1 dimension, 'n' a non-trivial one,
        // listed in (x,y,z) order; e.g. case_1n1 means nx==1 && nz==1.
        enum Case { case_11n, case_1n1, case_1nn, case_n1n, case_other };
        int m_case = case_other;
#endif
    };
}
1777
1778}
1779
1780#endif
#define AMREX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition AMReX_BLassert.H:49
#define AMREX_ENUM(CLASS,...)
Definition AMReX_Enum.H:209
#define AMREX_CUFFT_SAFE_CALL(call)
Definition AMReX_GpuError.H:92
#define AMREX_GPU_DEVICE
Definition AMReX_GpuQualifiers.H:18
amrex::ParmParse pp
Input file parser instance for the given namespace.
Definition AMReX_HypreIJIface.cpp:15
Real * pdst
Definition AMReX_HypreMLABecLap.cpp:1140
#define AMREX_D_TERM(a, b, c)
Definition AMReX_SPACE.H:172
virtual void free(void *pt)=0
A pure virtual function for deleting the arena pointed to by pt.
virtual void * alloc(std::size_t sz)=0
__host__ __device__ IntVectND< dim > length() const noexcept
Return the length of the BoxND.
Definition AMReX_Box.H:154
Calculates the distribution of FABs to MPI processes.
Definition AMReX_DistributionMapping.H:43
static bool usingExternalStream() noexcept
Definition AMReX_GpuDevice.cpp:836
An Integer Vector in dim-Dimensional Space.
Definition AMReX_IntVect.H:57
amrex_long Long
Definition AMReX_INT.H:30
void ParallelForOMP(T n, L const &f) noexcept
Performance-portable kernel launch function with optional OpenMP threading.
Definition AMReX_GpuLaunch.H:319
__host__ __device__ Dim3 length(Array4< T > const &a) noexcept
Return the spatial extents of an Array4 in Dim3 form.
Definition AMReX_Array4.H:1345
Arena * The_Arena()
Definition AMReX_Arena.cpp:805
Definition AMReX_FFT_Helper.H:52
Direction
Definition AMReX_FFT_Helper.H:54
Boundary
Definition AMReX_FFT_Helper.H:58
DomainStrategy
Definition AMReX_FFT_Helper.H:56
Kind
Definition AMReX_FFT_Helper.H:60
void streamSynchronize() noexcept
Definition AMReX_GpuDevice.H:310
gpuStream_t gpuStream() noexcept
Definition AMReX_GpuDevice.H:291
__host__ __device__ std::pair< double, double > sincospi(double x)
Return sin(pi*x) and cos(pi*x) given x.
Definition AMReX_Math.H:204
__host__ __device__ void ignore_unused(const Ts &...)
This shuts up the compiler about unused variables.
Definition AMReX.H:139
BoxND< 3 > Box
Box is an alias for amrex::BoxND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:30
IndexTypeND< 3 > IndexType
IndexType is an alias for amrex::IndexTypeND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:36
IntVectND< 3 > IntVect
IntVect is an alias for amrex::IntVectND instantiated with AMREX_SPACEDIM.
Definition AMReX_BaseFwd.H:33
void Abort(const std::string &msg)
Print out message to cerr and exit via abort().
Definition AMReX.cpp:240
const int[]
Definition AMReX_BLProfiler.cpp:1664
Definition AMReX_FFT_Helper.H:64
bool twod_mode
Definition AMReX_FFT_Helper.H:75
Info & setNumProcs(int n)
Cap the number of MPI ranks used by FFT.
Definition AMReX_FFT_Helper.H:137
bool oned_mode
Definition AMReX_FFT_Helper.H:84
int batch_size
Batched FFT size. Only support in R2C, not R2X.
Definition AMReX_FFT_Helper.H:87
Info & setDomainStrategy(DomainStrategy s)
Select how the domain is decomposed across MPI ranks.
Definition AMReX_FFT_Helper.H:98
DomainStrategy domain_strategy
Domain composition strategy.
Definition AMReX_FFT_Helper.H:66
int nprocs
Max number of processes to use.
Definition AMReX_FFT_Helper.H:90
int pencil_threshold
Definition AMReX_FFT_Helper.H:70
Info & setOneDMode(bool x)
Flag the degenerate 2-D mode (nx==1 or ny==1) that still batches along z.
Definition AMReX_FFT_Helper.H:123
Info & setBatchSize(int bsize)
Specify the batch size for FFT.
Definition AMReX_FFT_Helper.H:130
Info & setPencilThreshold(int t)
Override the slab→pencil break-even threshold for the automatic strategy.
Definition AMReX_FFT_Helper.H:105
Info & setTwoDMode(bool x)
Restrict transforms to the first two dimensions (3-D problems only).
Definition AMReX_FFT_Helper.H:112
Definition AMReX_FFT_Helper.H:194
void * pf
Definition AMReX_FFT_Helper.H:229
void unpack_r2r_buffer(T *pdst, void const *pbuf) const
Collapse the spectral R2R result back into the original real layout.
Definition AMReX_FFT_Helper.H:1137
std::conditional_t< std::is_same_v< float, T >, cuComplex, cuDoubleComplex > VendorComplex
Definition AMReX_FFT_Helper.H:198
VendorPlan plan2
Definition AMReX_FFT_Helper.H:228
void init_r2c(IntVectND< M > const &fft_size, void *pbf, void *pbb, bool cache, int ncomp=1)
Initialize an M-dimensional batched real-to-complex plan.
Definition AMReX_FFT_Helper.H:1382
int n
Definition AMReX_FFT_Helper.H:221
void destroy()
Release any vendor FFT plan objects owned by this Plan.
Definition AMReX_FFT_Helper.H:248
bool defined2
Definition AMReX_FFT_Helper.H:226
void init_r2r(Box const &box, VendorComplex *pc, std::pair< Boundary, Boundary > const &bc)
Initialize a real-to-real plan that reads/writes complex storage.
Definition AMReX_FFT_Helper.H:730
void pack_r2r_buffer(void *pbuf, T const *psrc) const
Expand the real R2R input into the symmetry-extended buffer expected by CUFFT/rocFFT.
Definition AMReX_FFT_Helper.H:899
static void free_scratch_space(void *p)
Release GPU scratch allocated via alloc_scratch_space().
Definition AMReX_FFT_Helper.H:891
static void destroy_vendor_plan(VendorPlan plan)
Helper that destroys a vendor plan of the appropriate backend type.
Definition AMReX_FFT_Helper.H:1347
Kind get_r2r_kind(std::pair< Boundary, Boundary > const &bc)
Map boundary conditions to the Plan Kind for real-to-real transforms.
Definition AMReX_FFT_Helper.H:592
cufftHandle VendorPlan
Definition AMReX_FFT_Helper.H:196
Kind kind
Definition AMReX_FFT_Helper.H:223
void init_c2c(Box const &box, VendorComplex *p, int ncomp=1, int ndims=1)
Initialize a complex-to-complex plan across 1/2/3 dimensions.
Definition AMReX_FFT_Helper.H:424
int howmany
Definition AMReX_FFT_Helper.H:222
void * pb
Definition AMReX_FFT_Helper.H:230
void init_r2c(Box const &box, T *pr, VendorComplex *pc, bool is_2d_transform=false, int ncomp=1)
Initialize a 1-D or 2-D real-to-complex plan over the supplied Box.
Definition AMReX_FFT_Helper.H:272
void compute_r2r()
Execute the real-to-real plan, including GPU packing/unpacking.
Definition AMReX_FFT_Helper.H:1290
void compute_c2c()
Execute the complex-to-complex plan in place.
Definition AMReX_FFT_Helper.H:830
bool r2r_data_is_complex
Definition AMReX_FFT_Helper.H:224
void * alloc_scratch_space() const
Allocate GPU scratch space large enough to hold packed R2R data.
Definition AMReX_FFT_Helper.H:874
VendorPlan plan
Definition AMReX_FFT_Helper.H:227
void compute_r2c()
Execute the previously initialized real-to-complex plan.
Definition AMReX_FFT_Helper.H:778
bool defined
Definition AMReX_FFT_Helper.H:225
void set_ptrs(void *p0, void *p1)
Register device pointers used by the forward/backward executions.
Definition AMReX_FFT_Helper.H:239
void init_r2r(Box const &box, T *p, std::pair< Boundary, Boundary > const &bc, int howmany_initval=1)
Initialize a real-to-real (cosine/sine) plan that operates on real buffers.
Definition AMReX_FFT_Helper.H:626
A host / device complex number type, because std::complex doesn't work in device code with Cuda yet.
Definition AMReX_GpuComplex.H:30