diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst index 79dd078913..167da805b0 100644 --- a/docs/source/run/parameters.rst +++ b/docs/source/run/parameters.rst @@ -252,15 +252,31 @@ The default is to use the explicit solver. **We strongly recommend to use the ex Which solver to use. Possible values: ``explicit`` and ``predictor-corrector``. -* ``fields.poisson_solver`` (`string`) optional (default `FFTDirichlet`) +* ``fields.poisson_solver`` (`string`) optional (default CPU: `FFTDirichletDirect`, GPU: `FFTDirichletFast`) Which Poisson solver to use for ``Psi``, ``Ez`` and ``Bz``. The ``predictor-corrector`` BxBy - solver also uses this poisson solver for ``Bx`` and ``By`` internally. Available solvers are - ``FFTDirichlet``, ``FFTPeriodic`` and ``MGDirichlet``. + solver also uses this poisson solver for ``Bx`` and ``By`` internally. Available solvers are: -* ``hipace.use_small_dst`` (`bool`) optional (default `0` or `1`) - Whether to use a large R2C or a small C2R fft in the dst of the Poisson solver. - The small dst is quicker for simulations with :math:`\geq 511` transverse grid points. - The default is set accordingly. + * ``FFTDirichletDirect`` Use the discrete sine transformation that is directly implemented + by FFTW to solve the Poisson equation with Dirichlet boundary conditions. + This option is only available when compiling for CPUs with FFTW. + Preferred resolution: :math:`2^N-1`. + + * ``FFTDirichletExpanded`` Perform the discrete sine transformation by symmetrically + expanding the field to twice its size. + Preferred resolution: :math:`2^N-1`. + + * ``FFTDirichletFast`` Perform the discrete sine transformation using a fast sine transform + algorithm that uses FFTs of the same size as the fields. + Preferred resolution: :math:`2^N-1`. + + * ``MGDirichlet`` Use the HiPACE++ multigrid solver to solve the Poisson equation with + Dirichlet boundary conditions. + Preferred resolution: :math:`2^N` and :math:`2^N-1`. + + * ``FFTPeriodic`` Use FFTs to solve the Poisson equation with Periodic boundary conditions. + Note that this does not work with features that change the boundary values, + like mesh refinement or open boundaries. + Preferred resolution: :math:`2^N`. * ``fields.extended_solve`` (`bool`) optional (default `0`) Extends the area of the FFT Poisson solver to the ghost cells. This can reduce artifacts diff --git a/src/Hipace.H b/src/Hipace.H index 9a67556413..03e60a69ef 100644 --- a/src/Hipace.H +++ b/src/Hipace.H @@ -44,6 +44,9 @@ struct Hipace_early_init * and Parser Constants */ Hipace_early_init (Hipace* instance); + /** Destructor for FFT cleanup */ + ~Hipace_early_init (); + /** Struct containing physical constants (which values depends on the unit system, determined * at runtime): SI or normalized units. */ PhysConst m_phys_const; diff --git a/src/Hipace.cpp b/src/Hipace.cpp index 4a3a8c93bb..5a2ce8e164 100644 --- a/src/Hipace.cpp +++ b/src/Hipace.cpp @@ -17,6 +17,7 @@ #include "utils/GPUUtil.H" #include "particles/pusher/GetAndSetPosition.H" #include "mg_solver/HpMultiGrid.H" +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include #include @@ -52,6 +53,12 @@ Hipace_early_init::Hipace_early_init (Hipace* instance) int max_level = 0; queryWithParser(pp_amr, "max_level", max_level); m_N_level = max_level + 1; + AnyFFT::setup(); +} + +Hipace_early_init::~Hipace_early_init () +{ + AnyFFT::cleanup(); } Hipace& diff --git a/src/fields/Fields.H b/src/fields/Fields.H index 45132492a7..96e6b485a4 100644 --- a/src/fields/Fields.H +++ b/src/fields/Fields.H @@ -507,7 +507,7 @@ private: /** Vector over levels of all required fields to compute current slice */ amrex::Vector m_slices; /** Type of poisson solver to use */ - std::string m_poisson_solver_str = "FFTDirichlet"; + std::string m_poisson_solver_str = ""; /** Class to handle transverse FFT Poisson solver on 1 slice */ amrex::Vector> m_poisson_solver; /** Stores temporary values for z interpolation in Fields::Copy */ diff --git a/src/fields/Fields.cpp b/src/fields/Fields.cpp index 67495c6df1..efcf9d8db3 100644 --- a/src/fields/Fields.cpp +++ b/src/fields/Fields.cpp @@ -8,7 +8,9 @@ */ #include "Fields.H" #include "fft_poisson_solver/FFTPoissonSolverPeriodic.H" -#include "fft_poisson_solver/FFTPoissonSolverDirichlet.H" +#include "fft_poisson_solver/FFTPoissonSolverDirichletDirect.H" +#include "fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H" +#include "fft_poisson_solver/FFTPoissonSolverDirichletFast.H" #include "fft_poisson_solver/MGPoissonSolverDirichlet.H" #include "Hipace.H" #include "OpenBoundary.H" @@ -29,6 +31,12 @@ Fields::Fields (const int nlev) { amrex::ParmParse ppf("fields"); DeprecatedInput("fields", "do_dirichlet_poisson", "poisson_solver", ""); + // set default Poisson solver based on the platform +#ifdef AMREX_USE_GPU + m_poisson_solver_str = "FFTDirichletFast"; +#else + m_poisson_solver_str = "FFTDirichletDirect"; +#endif queryWithParser(ppf, "poisson_solver", m_poisson_solver_str); queryWithParser(ppf, "extended_solve", m_extended_solve); queryWithParser(ppf, "open_boundary", m_open_boundary); @@ -178,11 +186,21 @@ Fields::AllocData ( // The Poisson solver operates on transverse slices only. // The constructor takes the BoxArray and the DistributionMap of a slice, // so the FFTPlans are built on a slice. - if (m_poisson_solver_str == "FFTDirichlet"){ - m_poisson_solver.push_back(std::unique_ptr( - new FFTPoissonSolverDirichlet(getSlices(lev).boxArray(), - getSlices(lev).DistributionMap(), - geom)) ); + if (m_poisson_solver_str == "FFTDirichletDirect"){ + m_poisson_solver.push_back(std::unique_ptr( + new FFTPoissonSolverDirichletDirect(getSlices(lev).boxArray(), + getSlices(lev).DistributionMap(), + geom)) ); + } else if (m_poisson_solver_str == "FFTDirichletExpanded"){ + m_poisson_solver.push_back(std::unique_ptr( + new FFTPoissonSolverDirichletExpanded(getSlices(lev).boxArray(), + getSlices(lev).DistributionMap(), + geom)) ); + } else if (m_poisson_solver_str == "FFTDirichletFast"){ + m_poisson_solver.push_back(std::unique_ptr( + new FFTPoissonSolverDirichletFast(getSlices(lev).boxArray(), + getSlices(lev).DistributionMap(), + geom)) ); } else if (m_poisson_solver_str == "FFTPeriodic") { m_poisson_solver.push_back(std::unique_ptr( new FFTPoissonSolverPeriodic(getSlices(lev).boxArray(), @@ -195,7 +213,8 @@ Fields::AllocData ( geom)) ); } else { amrex::Abort("Unknown poisson solver '" + m_poisson_solver_str + - "', must be 'FFTDirichlet', 'FFTPeriodic' or 'MGDirichlet'"); + "', must be 'FFTDirichletDirect', 'FFTDirichletExpanded', 'FFTDirichletFast', " + + "'FFTPeriodic' or 'MGDirichlet'"); } if (lev == 0 && m_insitu_period > 0) { diff --git a/src/fields/fft_poisson_solver/CMakeLists.txt b/src/fields/fft_poisson_solver/CMakeLists.txt index 847506f194..46e480ec2f 100644 --- a/src/fields/fft_poisson_solver/CMakeLists.txt +++ b/src/fields/fft_poisson_solver/CMakeLists.txt @@ -9,7 +9,9 @@ target_sources(HiPACE PRIVATE FFTPoissonSolver.cpp FFTPoissonSolverPeriodic.cpp - FFTPoissonSolverDirichlet.cpp + FFTPoissonSolverDirichletDirect.cpp + FFTPoissonSolverDirichletExpanded.cpp + FFTPoissonSolverDirichletFast.cpp MGPoissonSolverDirichlet.cpp ) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H similarity index 75% rename from src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H rename to src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H index bc4dc76d1c..78f3d5dc38 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H @@ -5,10 +5,10 @@ * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs * License: BSD-3-Clause-LBNL */ -#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_ -#define FFT_POISSON_SOLVER_DIRICHLET_H_ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_DIRECT_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_DIRECT_H_ -#include "fields/fft_poisson_solver/fft/AnyDST.H" +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "FFTPoissonSolver.H" #include @@ -23,16 +23,16 @@ * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS * in the staging area and return the LHS in mf. */ -class FFTPoissonSolverDirichlet final : public FFTPoissonSolver +class FFTPoissonSolverDirichletDirect final : public FFTPoissonSolver { public: /** Constructor */ - FFTPoissonSolverDirichlet ( amrex::BoxArray const& a_realspace_ba, - amrex::DistributionMapping const& dm, - amrex::Geometry const& gm); + FFTPoissonSolverDirichletDirect ( amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); /** virtual destructor */ - virtual ~FFTPoissonSolverDirichlet () override final {} + virtual ~FFTPoissonSolverDirichletDirect () override final {} /** * \brief Define real space and spectral space boxes and multifabs, Dirichlet @@ -63,8 +63,12 @@ private: amrex::MultiFab m_tmpSpectralField; /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ amrex::MultiFab m_eigenvalue_matrix; - /** DST plans */ - AnyDST::DSTplans m_plan; + /** forward DST plan */ + AnyFFT m_forward_fft; + /** backward DST plan */ + AnyFFT m_backward_fft; + /** work area for both DST plans */ + amrex::Gpu::DeviceVector m_fft_work_area; }; #endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp similarity index 67% rename from src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp rename to src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp index 3fb2bc93aa..70da5ac1d8 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp @@ -6,14 +6,14 @@ * * License: BSD-3-Clause-LBNL */ -#include "FFTPoissonSolverDirichlet.H" -#include "fft/AnyDST.H" +#include "FFTPoissonSolverDirichletDirect.H" +#include "fft/AnyFFT.H" #include "fields/Fields.H" #include "utils/Constants.H" #include "utils/GPUUtil.H" #include "utils/HipaceProfilerWrapper.H" -FFTPoissonSolverDirichlet::FFTPoissonSolverDirichlet ( +FFTPoissonSolverDirichletDirect::FFTPoissonSolverDirichletDirect ( amrex::BoxArray const& realspace_ba, amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) @@ -22,10 +22,11 @@ FFTPoissonSolverDirichlet::FFTPoissonSolverDirichlet ( } void -FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, - amrex::DistributionMapping const& dm, - amrex::Geometry const& gm ) +FFTPoissonSolverDirichletDirect::define (amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) { + HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::define()"); using namespace amrex::literals; // If we are going to support parallel FFT, the constructor needs to take a communicator. @@ -48,16 +49,18 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, "There should be only one box locally."); const amrex::Box fft_box = m_stagingArea[0].box(); + const amrex::IntVect fft_size = fft_box.length(); + const int nx = fft_size[0]; + const int ny = fft_size[1]; const auto dx = gm.CellSizeArray(); const amrex::Real dxsquared = dx[0]*dx[0]; const amrex::Real dysquared = dx[1]*dx[1]; - const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_box.length(0) + 1 )); - const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_box.length(1) + 1 )); + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 )); // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) // This normalization is used regardless of the sine transform library - const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_box.length(0) + 1 ) - *( fft_box.length(1) + 1 ))); + const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); // Calculate the array of m_eigenvalue_matrix for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ @@ -67,9 +70,9 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept { /* fast poisson solver diagonal x coeffs */ - amrex::Real sinex_sq = sin(( i - lo[0] + 1 ) * sine_x_factor) * sin(( i - lo[0] + 1 ) * sine_x_factor); + amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); /* fast poisson solver diagonal y coeffs */ - amrex::Real siney_sq = sin(( j - lo[1] + 1 ) * sine_y_factor) * sin(( j - lo[1] + 1 ) * sine_y_factor); + amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); if ((sinex_sq!=0) && (siney_sq!=0)) { eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); @@ -81,29 +84,25 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba, } // Allocate and initialize the FFT plans - m_plan = AnyDST::DSTplans(a_realspace_ba, dm); - // Loop over boxes and allocate the corresponding plan - // for each box owned by the local MPI proc - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Note: the size of the real-space box and spectral-space box - // differ when using real-to-complex FFT. When initializing - // the FFT plan, the valid dimensions are those of the real-space box. - amrex::IntVect fft_size = fft_box.length(); - m_plan[mfi] = AnyDST::CreatePlan( - fft_size, &m_stagingArea[mfi], &m_tmpSpectralField[mfi]); - } + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]); + + // Allocate work area for both FFTs + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), + m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), + m_fft_work_area.dataPtr()); } void -FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf) +FFTPoissonSolverDirichletDirect::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { - HIPACE_PROFILE("FFTPoissonSolverDirichlet::SolvePoissonEquation()"); + HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::SolvePoissonEquation()"); - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from the staging area to `tmpSpectralField` - AnyDST::Execute(m_plan[mfi], AnyDST::direction::forward); - } + m_forward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -120,10 +119,7 @@ FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf) }); } - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from `tmpSpectralField` to the staging area - AnyDST::Execute(m_plan[mfi], AnyDST::direction::backward); - } + m_backward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H new file mode 100644 index 0000000000..d54393dd72 --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H @@ -0,0 +1,76 @@ +/* Copyright 2020-2021 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs + * License: BSD-3-Clause-LBNL + */ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_EXPANDED_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_EXPANDED_H_ + +#include "fields/fft_poisson_solver/fft/AnyFFT.H" +#include "FFTPoissonSolver.H" + +#include +#include + +/** + * \brief This class handles functions and data to perform transverse Fourier-based Poisson solves. + * + * For a given source S, it solves equation Laplacian(F) = S and returns F. + * Once an instance is created, a typical use consists in: + * 1. Compute S directly in FFTPoissonSolver::m_stagingArea + * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS + * in the staging area and return the LHS in mf. + */ +class FFTPoissonSolverDirichletExpanded final : public FFTPoissonSolver +{ +public: + /** Constructor */ + FFTPoissonSolverDirichletExpanded ( amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** virtual destructor */ + virtual ~FFTPoissonSolverDirichletExpanded () override final {} + + /** + * \brief Define real space and spectral space boxes and multifabs, Dirichlet + * eigenvalue matrix m_eigenvalue_matrix to solve Poisson equation and FFT plans. + * Currently only works with a single box, i.e., serial FFT. + * + * \param[in] realspace_ba BoxArray on which the FFT is executed. + * \param[in] dm DistributionMapping for the BoxArray. + * \param[in] gm Geometry, contains the box dimensions. + */ + void define ( amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** + * Solve Poisson equation. The source term must be stored in the staging area m_stagingArea prior to this call. + * + * \param[in] lhs_mf Destination array, where the result is stored. + */ + virtual void SolvePoissonEquation (amrex::MultiFab& lhs_mf) override final; + + /** Position and relative factor used to apply inhomogeneous Dirichlet boundary conditions */ + virtual amrex::Real BoundaryOffset() override final { return 1.; } + virtual amrex::Real BoundaryFactor() override final { return 1.; } + +private: + /** Spectral fields, contains (real) field in Fourier space */ + amrex::MultiFab m_tmpSpectralField; + /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ + amrex::MultiFab m_eigenvalue_matrix; + /** Twice expanded field that gets filled symmetrically */ + amrex::FArrayBox m_expanded_position_array; + /** Twice expanded Complex field */ + amrex::BaseFab> m_expanded_fourier_array; + /** FFT plan */ + AnyFFT m_fft; + /** work area for FFT plan */ + amrex::Gpu::DeviceVector m_fft_work_area; +}; + +#endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp new file mode 100644 index 0000000000..5b1c3ad258 --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp @@ -0,0 +1,199 @@ +/* Copyright 2020-2022 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs + * + * License: BSD-3-Clause-LBNL + */ +#include "FFTPoissonSolverDirichletExpanded.H" +#include "fft/AnyFFT.H" +#include "fields/Fields.H" +#include "utils/Constants.H" +#include "utils/GPUUtil.H" +#include "utils/HipaceProfilerWrapper.H" + +FFTPoissonSolverDirichletExpanded::FFTPoissonSolverDirichletExpanded ( + amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + define(realspace_ba, dm, gm); +} + +void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src) +{ + const amrex::Box bx = src.box(); + const int nx = bx.length(0); + const int ny = bx.length(1); + const amrex::IntVect lo = bx.smallEnd(); + Array2 const src_array = src.const_array(); + Array2 const dst_array = dst.array(); + + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE(int i, int j, int) + { + /* upper left quadrant */ + dst_array(i+1,j+1) = src_array(i, j); + /* lower left quadrant */ + dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]); + /* upper right quadrant */ + dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j); + /* lower right quadrant */ + dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]); + }); +} + +void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab>& src) +{ + const amrex::Box bx = dst.box(); + Array2 const> const src_array = src.const_array(); + Array2 const dst_array = dst.array(); + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE(int i, int j, int) + { + /* upper left quadrant */ + dst_array(i,j) = -src_array(i+1, j+1).real(); + }); +} + +void +FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::define()"); + using namespace amrex::literals; + + // If we are going to support parallel FFT, the constructor needs to take a communicator. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_realspace_ba.size() == 1, "Parallel FFT not supported yet"); + + // Allocate temporary arrays - in real space and spectral space + // These arrays will store the data just before/after the FFT + // The stagingArea is also created from 0 to nx, because the real space array may have + // an offset for levels > 0 + m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required + m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards); + + // This must be true even for parallel FFT. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1, + "There should be only one box locally."); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1, + "There should be only one box locally."); + + const amrex::Box fft_box = m_stagingArea[0].box(); + const amrex::IntVect fft_size = fft_box.length(); + const int nx = fft_size[0]; + const int ny = fft_size[1]; + const auto dx = gm.CellSizeArray(); + const amrex::Real dxsquared = dx[0]*dx[0]; + const amrex::Real dysquared = dx[1]*dx[1]; + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 )); + + // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) + // This normalization is used regardless of the sine transform library + const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); + + // Calculate the array of m_eigenvalue_matrix + for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + amrex::IntVect lo = fft_box.smallEnd(); + amrex::ParallelFor( + fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept + { + /* fast poisson solver diagonal x coeffs */ + amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); + /* fast poisson solver diagonal y coeffs */ + amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); + + if ((sinex_sq!=0) && (siney_sq!=0)) { + eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); + } else { + // Avoid division by 0 + eigenvalue_matrix(i,j) = 0._rt; + } + }); + } + + // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2) + // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2) + amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}}; + amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}}; + // shift box to match rest of fields + expanded_position_box += fft_box.smallEnd(); + expanded_fourier_box += fft_box.smallEnd(); + + m_expanded_position_array.resize(expanded_position_box); + m_expanded_fourier_array.resize(expanded_fourier_box); + + m_expanded_position_array.setVal(0._rt); + + // Allocate and initialize the FFT plan + std::size_t wrok_size = m_fft.Initialize(FFTType::R2C_2D, expanded_position_box.length(0), + expanded_position_box.length(1)); + + // Allocate work area for the FFT + m_fft_work_area.resize(wrok_size); + + m_fft.SetBuffers(m_expanded_position_array.dataPtr(), m_expanded_fourier_array.dataPtr(), + m_fft_work_area.dataPtr()); +} + + +void +FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf) +{ + HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::SolvePoissonEquation()"); + using namespace amrex::literals; + + m_expanded_position_array.setVal(0._rt); + + ExpandR2R(m_expanded_position_array, m_stagingArea[0]); + + m_fft.Execute(); + + ShrinkC2R(m_tmpSpectralField[0], m_expanded_fourier_array); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Solve Poisson equation in Fourier space: + // Multiply `tmpSpectralField` by eigenvalue_matrix + Array2 tmp_cmplx_arr = m_tmpSpectralField.array(mfi); + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + + amrex::ParallelFor( mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j); + }); + } + + m_expanded_position_array.setVal(0._rt); + + ExpandR2R(m_expanded_position_array, m_tmpSpectralField[0]); + + m_fft.Execute(); + + ShrinkC2R(m_stagingArea[0], m_expanded_fourier_array); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Copy from the staging area to output array (and normalize) + Array2 tmp_real_arr = m_stagingArea.array(mfi); + Array2 lhs_arr = lhs_mf.array(mfi); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1, + "Slice MFs must be defined on one box only"); + amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + // Copy field + lhs_arr(i,j) = tmp_real_arr(i,j); + }); + } +} diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H new file mode 100644 index 0000000000..c981181c7f --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H @@ -0,0 +1,78 @@ +/* Copyright 2020-2021 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs + * License: BSD-3-Clause-LBNL + */ +#ifndef FFT_POISSON_SOLVER_DIRICHLET_FAST_H_ +#define FFT_POISSON_SOLVER_DIRICHLET_FAST_H_ + +#include "fields/fft_poisson_solver/fft/AnyFFT.H" +#include "FFTPoissonSolver.H" + +#include +#include + +/** + * \brief This class handles functions and data to perform transverse Fourier-based Poisson solves. + * + * For a given source S, it solves equation Laplacian(F) = S and returns F. + * Once an instance is created, a typical use consists in: + * 1. Compute S directly in FFTPoissonSolver::m_stagingArea + * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS + * in the staging area and return the LHS in mf. + */ +class FFTPoissonSolverDirichletFast final : public FFTPoissonSolver +{ +public: + /** Constructor */ + FFTPoissonSolverDirichletFast ( amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** virtual destructor */ + virtual ~FFTPoissonSolverDirichletFast () override final {} + + /** + * \brief Define real space and spectral space boxes and multifabs, Dirichlet + * eigenvalue matrix m_eigenvalue_matrix to solve Poisson equation and FFT plans. + * Currently only works with a single box, i.e., serial FFT. + * + * \param[in] realspace_ba BoxArray on which the FFT is executed. + * \param[in] dm DistributionMapping for the BoxArray. + * \param[in] gm Geometry, contains the box dimensions. + */ + void define ( amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm); + + /** + * Solve Poisson equation. The source term must be stored in the staging area m_stagingArea prior to this call. + * + * \param[in] lhs_mf Destination array, where the result is stored. + */ + virtual void SolvePoissonEquation (amrex::MultiFab& lhs_mf) override final; + + /** Position and relative factor used to apply inhomogeneous Dirichlet boundary conditions */ + virtual amrex::Real BoundaryOffset() override final { return 1.; } + virtual amrex::Real BoundaryFactor() override final { return 1.; } + +private: + /** Spectral fields, contains (real) field in Fourier space */ + amrex::MultiFab m_tmpSpectralField; + /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ + amrex::MultiFab m_eigenvalue_matrix; + /** Real array for the FFTs */ + amrex::Gpu::DeviceVector m_position_array; + /** Complex array for the FFTs */ + amrex::Gpu::DeviceVector> m_fourier_array; + /** FFT plan in x direction */ + AnyFFT m_x_fft; + /** FFT plan in y direction */ + AnyFFT m_y_fft; + /** work area for both DST plans */ + amrex::Gpu::DeviceVector m_fft_work_area; +}; + +#endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp new file mode 100644 index 0000000000..50f1547bca --- /dev/null +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp @@ -0,0 +1,315 @@ +/* Copyright 2020-2022 + * + * This file is part of HiPACE++. + * + * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs + * + * License: BSD-3-Clause-LBNL + */ +#include "FFTPoissonSolverDirichletFast.H" +#include "fft/AnyFFT.H" +#include "fields/Fields.H" +#include "utils/Constants.H" +#include "utils/GPUUtil.H" +#include "utils/HipaceProfilerWrapper.H" + +FFTPoissonSolverDirichletFast::FFTPoissonSolverDirichletFast ( + amrex::BoxArray const& realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + define(realspace_ba, dm, gm); +} + +/** \brief Make Complex array out of Real array to prepare for fft. + * out[idx] = -in[2*idx-2] + in[2*idx] + i*in[2*idx-1] for each column with + * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1] + * + * \param[in] in input real array + * \param[out] out output complex array + * \param[in] n_data number of (contiguous) rows in position matrix + * \param[in] n_batch number of (strided) columns in position matrix + */ +void ToComplex (const amrex::Real* const AMREX_RESTRICT in, + amrex::GpuComplex* const AMREX_RESTRICT out, + const int n_data, const int n_batch) +{ + const int n_half = (n_data+1)/2; + if((n_data%2 == 1)) { + amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + const int stride_in = n_data*j; + const int i_is_zero = (i==0); + const int i_is_n_half = (i==n_half); + const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) + +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); + const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] + *!i_is_zero*!i_is_n_half; + out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); + }); + } else { + amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + const int stride_in = n_data*j; + const int i_is_zero = (i==0); + const int i_is_n_half = (i==n_half); + const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) + +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; + const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; + out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); + }); + } +} + +/** \brief Make Sine-space Real array out of array from fft. + * out[idx] = 0.5 *(in[n_data-idx] - in[idx+1] + (in[n_data-idx] + in[idx+1])/ + * (2*sin((idx+1)*pi/(n_data+1)))) for each column + * + * \param[in] in input real array + * \param[out] out output real array + * \param[in] n_data number of (contiguous) rows in position matrix + * \param[in] n_batch number of (strided) columns in position matrix + */ +void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, + const int n_data, const int n_batch) +{ + using namespace amrex::literals; + + const amrex::Real n_1_real_inv = 1._rt / (n_data+1._rt); + const int n_1 = n_data+1; + amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + const int i_rev = n_1-i; + const int stride_in = n_1*j; + const int stride_out = n_data*j; + const amrex::Real in_a = in[i + stride_in]; + const amrex::Real in_b = in[i_rev + stride_in]; + out[i - 1 + stride_out] = + 0.5_rt*(in_b - in_a + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i * n_1_real_inv))); + out[i_rev - 1 + stride_out] = + 0.5_rt*(in_a - in_b + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i_rev * n_1_real_inv))); + }); +} + +/** \brief Transpose input matrix + * out[idy][idx] = in[idx][idy] + * + * \param[in] in input real array + * \param[out] out output real array + * \param[in] n_data number of (contiguous) rows in input matrix + * \param[in] n_batch number of (strided) columns in input matrix + */ +void Transpose (const amrex::Real* const AMREX_RESTRICT in, + amrex::Real* const AMREX_RESTRICT out, + const int n_data, const int n_batch) +{ +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + constexpr int tile_dim = 32; //must be power of 2 + constexpr int block_rows = 8; + const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; + const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; + amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, + tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE() noexcept + { + amrex::Gpu::SharedMemory gsm; + amrex::Real* const tile = gsm.dataPtr(); + + const int thread_x = threadIdx.x&(tile_dim-1); + const int thread_y = threadIdx.x/tile_dim; + const int block_y = blockIdx.x/num_blocks_x; + const int block_x = blockIdx.x - block_y*num_blocks_x; + int mat_x = block_x * tile_dim + thread_x; + int mat_y = block_y * tile_dim + thread_y; + + for (int i = 0; i < tile_dim; i += block_rows) { + if(mat_x < n_data && (mat_y+i) < n_batch) { + tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; + } + } + + __syncthreads(); + + mat_x = block_y * tile_dim + thread_x; + mat_y = block_x * tile_dim + thread_y; + + for (int i = 0; i < tile_dim; i += block_rows) { + if(mat_x < n_batch && (mat_y+i) < n_data) { + out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; + } + } + }); +#else + amrex::ParallelFor({{0,0,0}, {n_batch-1, n_data-1, 0}}, + [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept + { + out[i + n_batch*j] = in[j + n_data*i]; + }); +#endif +} + +void +FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, + amrex::DistributionMapping const& dm, + amrex::Geometry const& gm ) +{ + HIPACE_PROFILE("FFTPoissonSolverDirichletFast::define()"); + using namespace amrex::literals; + + // If we are going to support parallel FFT, the constructor needs to take a communicator. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_realspace_ba.size() == 1, "Parallel FFT not supported yet"); + + // Allocate temporary arrays - in real space and spectral space + // These arrays will store the data just before/after the FFT + // The stagingArea is also created from 0 to nx, because the real space array may have + // an offset for levels > 0 + m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); + m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required + m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards); + + // This must be true even for parallel FFT. + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1, + "There should be only one box locally."); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1, + "There should be only one box locally."); + + const amrex::Box fft_box = m_stagingArea[0].box(); + const amrex::IntVect fft_size = fft_box.length(); + const int nx = fft_size[0]; + const int ny = fft_size[1]; + const auto dx = gm.CellSizeArray(); + const amrex::Real dxsquared = dx[0]*dx[0]; + const amrex::Real dysquared = dx[1]*dx[1]; + const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 )); + const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 )); + + // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00) + // This normalization is used regardless of the sine transform library + const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); + + // Calculate the array of m_eigenvalue_matrix + for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + amrex::IntVect lo = fft_box.smallEnd(); + amrex::ParallelFor( + fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept + { + /* fast poisson solver diagonal x coeffs */ + amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); + /* fast poisson solver diagonal y coeffs */ + amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); + + if ((sinex_sq!=0) && (siney_sq!=0)) { + eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); + } else { + // Avoid division by 0 + eigenvalue_matrix(i,j) = 0._rt; + } + }); + } + + // Allocate 1d Array for 2d data or 2d transpose data + const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); + const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx); + m_position_array.resize(real_1d_size); + m_fourier_array.resize(complex_1d_size); + + // Allocate and initialize the FFT plans + std::size_t fft_x_area = m_x_fft.Initialize(FFTType::C2R_1D_batched, nx+1, ny); + std::size_t fft_y_area = m_y_fft.Initialize(FFTType::C2R_1D_batched, ny+1, nx); + + // Allocate work area for both FFTs + m_fft_work_area.resize(std::max(fft_x_area, fft_y_area)); + + m_x_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(), + m_fft_work_area.dataPtr()); + m_y_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(), + m_fft_work_area.dataPtr()); +} + + +void +FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) +{ + HIPACE_PROFILE("FFTPoissonSolverDirichletFast::SolvePoissonEquation()"); + + const int nx = m_stagingArea[0].box().length(0); // initially contiguous + const int ny = m_stagingArea[0].box().length(1); // contiguous after transpose + + amrex::Real* const pos_arr = m_stagingArea[0].dataPtr(); + amrex::Real* const fourier_arr = m_tmpSpectralField[0].dataPtr(); + amrex::Real* const real_arr = m_position_array.dataPtr(); + amrex::GpuComplex* comp_arr = m_fourier_array.dataPtr(); + + // 1D DST in x + ToComplex(pos_arr, comp_arr, nx, ny); + + m_x_fft.Execute(); + + ToSine(real_arr, pos_arr, nx, ny); + + Transpose(pos_arr, fourier_arr, nx, ny); + + // 1D DST in y + ToComplex(fourier_arr, comp_arr, ny, nx); + + m_y_fft.Execute(); + + ToSine(real_arr, pos_arr, ny, nx); + + Transpose(pos_arr, fourier_arr, ny, nx); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Solve Poisson equation in Fourier space: + // Multiply `tmpSpectralField` by eigenvalue_matrix + Array2 tmp_cmplx_arr = m_tmpSpectralField.array(mfi); + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); + + amrex::ParallelFor( mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j); + }); + } + + // 1D DST in x + ToComplex(fourier_arr, comp_arr, nx, ny); + + m_x_fft.Execute(); + + ToSine(real_arr, fourier_arr, nx, ny); + + Transpose(fourier_arr, pos_arr, nx, ny); + + // 1D DST in y + ToComplex(pos_arr, comp_arr, ny, nx); + + m_y_fft.Execute(); + + ToSine(real_arr, fourier_arr, ny, nx); + + Transpose(fourier_arr, pos_arr, ny, nx); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ + // Copy from the staging area to output array (and normalize) + Array2 tmp_real_arr = m_stagingArea.array(mfi); + Array2 lhs_arr = lhs_mf.array(mfi); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1, + "Slice MFs must be defined on one box only"); + amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(), + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { + // Copy field + lhs_arr(i,j) = tmp_real_arr(i,j); + }); + } +} diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H index e570e70d15..e18098764f 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H @@ -69,8 +69,12 @@ private: SpectralField m_tmpSpectralField; /** Multifab containing 1/(kx^2 + ky^2), to solve Poisson equation. */ amrex::MultiFab m_inv_k2; - /** FFT plans */ - AnyFFT::FFTplans m_forward_plan, m_backward_plan; + /** forward FFT plan */ + AnyFFT m_forward_fft; + /** backward FFT plan */ + AnyFFT m_backward_fft; + /** work area for both FFT plans */ + amrex::Gpu::DeviceVector m_fft_work_area; }; #endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp index 863a41427a..c83e4af64c 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp @@ -26,6 +26,7 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba, amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) { + HIPACE_PROFILE("FFTPoissonSolverPeriodic::define()"); using namespace amrex::literals; // If we are going to support parallel FFT, the constructor needs to take a communicator. @@ -93,25 +94,17 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba, } // Allocate and initialize the FFT plans - m_forward_plan = AnyFFT::FFTplans(spectralspace_ba, dm); - m_backward_plan = AnyFFT::FFTplans(spectralspace_ba, dm); - // Loop over boxes and allocate the corresponding plan - // for each box owned by the local MPI proc - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Note: the size of the real-space box and spectral-space box - // differ when using real-to-complex FFT. When initializing - // the FFT plan, the valid dimensions are those of the real-space box. - amrex::IntVect fft_size = m_stagingArea[mfi].box().length(); - m_forward_plan[mfi] = AnyFFT::CreatePlan( - fft_size, m_stagingArea[mfi].dataPtr(), - reinterpret_cast( m_tmpSpectralField[mfi].dataPtr()), - AnyFFT::direction::R2C); - - m_backward_plan[mfi] = AnyFFT::CreatePlan( - fft_size, m_stagingArea[mfi].dataPtr(), - reinterpret_cast( m_tmpSpectralField[mfi].dataPtr()), - AnyFFT::direction::C2R); - } + amrex::IntVect fft_size = m_stagingArea[0].box().length(); + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2C_2D, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2R_2D, fft_size[0], fft_size[1]); + + // Allocate work area for both FFTs + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(), + m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(), + m_fft_work_area.dataPtr()); } @@ -120,10 +113,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf) { HIPACE_PROFILE("FFTPoissonSolverPeriodic::SolvePoissonEquation()"); - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from the staging area to `tmpSpectralField` - AnyFFT::Execute(m_forward_plan[mfi]); - } + m_forward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -139,10 +129,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf) }); } - for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){ - // Perform Fourier transform from `tmpSpectralField` to the staging area - AnyFFT::Execute(m_backward_plan[mfi]); - } + m_backward_fft.Execute(); #ifdef AMREX_USE_OMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) diff --git a/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp index c8aa30d600..9ce7aa34b2 100644 --- a/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp +++ b/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp @@ -17,6 +17,7 @@ MGPoissonSolverDirichlet::MGPoissonSolverDirichlet ( amrex::DistributionMapping const& dm, amrex::Geometry const& gm ) { + HIPACE_PROFILE("MGPoissonSolverDirichlet::define()"); // need extra ghost cell for 2^n-1 HPMG m_stagingArea = amrex::MultiFab(ba, dm, 1, Fields::m_poisson_nguards + amrex::IntVect{1, 1, 0}); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(ba.size() == 1, "Parallel MG not supported"); diff --git a/src/fields/fft_poisson_solver/fft/AnyDST.H b/src/fields/fft_poisson_solver/fft/AnyDST.H deleted file mode 100644 index 7997e7c3a6..0000000000 --- a/src/fields/fft_poisson_solver/fft/AnyDST.H +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, MaxThevenet - * License: BSD-3-Clause-LBNL - */ -#ifndef ANYDST_H_ -#define ANYDST_H_ - -#include "AnyFFT.H" - -#include -#include - -#include - -/** - * \brief Wrapper around multiple FFT libraries. - * - * The header file defines the API and the base types - * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is - * done in different cpp files. This wrapper only depends on the underlying FFT library - * AND on AMReX (There is no dependence on WarpX). - */ -namespace AnyDST -{ - - /** Direction in which the FFT is performed. */ - enum struct direction {forward, backward}; - - /** \brief This struct contains the vendor FFT plan and additional metadata - */ - struct DSTplan - { - - /** pointer to array in position space */ - amrex::FArrayBox* m_position_array; - /** pointer to array in Fourier space */ - amrex::FArrayBox* m_fourier_array; - - /** Expanded data in position space, only for Cuda */ - std::unique_ptr m_expanded_position_array; - /** Expanded data in Fourier, only for Cuda */ - std::unique_ptr>> m_expanded_fourier_array; - - /** Vendor FFT plan */ - AnyFFT::VendorFFTPlan m_plan; - /** Vendor FFT plan for the transpose transform. Used for use_small_dst and DSTW */ - AnyFFT::VendorFFTPlan m_plan_b; - - /** Use large R2C or small C2R dst */ - bool use_small_dst; - -#if defined(AMREX_USE_HIP) - /** execution info for rocFFT */ - rocfft_execution_info m_execinfo; - /** work buffer for rocFFT */ - void* m_buffer; -#endif - }; - - /** Collection of FFT plans, one FFTplan per box */ - using DSTplans = amrex::LayoutData; - - /** \brief create FFT plan for the backend FFT library. - * \param[in] real_size Size of the real array, along each dimension. - * \param[out] position_array Real array from/to where R2R DST is performed - * \param[out] fourier_array Real array to/from where R2R DST is performed - */ - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array); - - /** \brief Destroy library FFT plan. - * \param[out] dst_plan plan to destroy - */ - void DestroyPlan (DSTplan& dst_plan); - - /** \brief Perform FFT with backend library. - * \param[out] dst_plan plan for which the FFT is performed - * \param[in] d direction of the FFT - */ - void Execute (DSTplan& dst_plan, AnyDST::direction d); -} - -#endif // ANYDST_H_ diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H index cbbd077954..eb07cc1cb4 100644 --- a/src/fields/fft_poisson_solver/fft/AnyFFT.H +++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H @@ -1,8 +1,8 @@ -/* Copyright 2020-2021 +/* Copyright 2020-2024 * * This file is part of HiPACE++. * - * Authors: Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs + * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs * WeiqunZhang * License: BSD-3-Clause-LBNL */ @@ -15,106 +15,56 @@ #ifndef ANYFFT_H_ #define ANYFFT_H_ -#include +#include -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif +struct VendorPlan; -#include +enum struct FFTType { + C2C_2D_fwd, + C2C_2D_bkw, + C2R_2D, + R2C_2D, + R2R_2D, + C2R_1D_batched +}; -/** - * \brief Wrapper around multiple FFT libraries. - * - * The header file defines the API and the base types - * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is - * done in different cpp files. This wrapper only depends on the underlying FFT library - * AND on AMReX (There is no dependence on WarpX). - */ -namespace AnyFFT -{ - // First, define library-dependent types (complex, FFT plan) - - /** Complex type for FFT, depends on FFT library */ -#ifdef AMREX_USE_CUDA -# ifdef AMREX_USE_FLOAT - using Complex = cuComplex; -# else - using Complex = cuDoubleComplex; -# endif -#elif defined(AMREX_USE_HIP) -# ifdef AMREX_USE_FLOAT - using Complex = float2; -# else - using Complex = double2; -# endif -#else -# ifdef AMREX_USE_FLOAT - using Complex = fftwf_complex; -# else - using Complex = fftw_complex; -# endif -#endif +struct AnyFFT { - /** Library-dependent FFT plans type, which holds one fft plan per box - * (plans are only initialized for the boxes that are owned by the local MPI rank). + /** \brief Initialize an FFT plan for the requested transform type using a Vendor FFT library. + * For 1D batched transforms, ny represents the number of batches to calculate at once. + * This function returns the number of bytes of the work area needed for the FFT. The work area + * has to be allocated by the function that uses the FFT and passed into SetBuffers. + * + * \param[in] type Type of FFT to perform + * \param[in] nx Size of the contiguous dimension of the FFT + * \param[in] ny Size of the second dimension of the FFT */ -#ifdef AMREX_USE_CUDA - using VendorFFTPlan = cufftHandle; -#elif defined(AMREX_USE_HIP) - using VendorFFTPlan = rocfft_plan; -#else -# ifdef AMREX_USE_FLOAT - using VendorFFTPlan = fftwf_plan; -# else - using VendorFFTPlan = fftw_plan; -# endif -#endif - - // Second, define library-independent API - - /** Direction in which the FFT is performed. */ - enum struct direction {R2C, C2R}; + std::size_t Initialize (FFTType type, int nx, int ny); - /** \brief This struct contains the vendor FFT plan and additional metadata + /** \brief Set the pointers to the input, output and work area of the FFT. + * This function has to be called after Initialize and before Execute. + * + * \param[in] in Pointer to the input of the FFT + * \param[in] out Pointer to the output of the FFT + * \param[in] work_area Pointer to the work area for the FFT */ - struct FFTplan - { - amrex::Real* m_real_array; /**< pointer to real array */ - Complex* m_complex_array; /**< pointer to complex array */ - VendorFFTPlan m_plan; /**< Vendor FFT plan */ - direction m_dir; /**< direction (C2R or R2C) */ - }; + void SetBuffers (void* in, void* out, void* work_area); - /** Collection of FFT plans, one FFTplan per box */ - using FFTplans = amrex::LayoutData; + /** \brief Perform the initialized FFT */ + void Execute (); - /** \brief create FFT plan for the backend FFT library. - * \param[in] real_size Size of the real array, along each dimension. - * \param[out] real_array Real array from/to where R2C/C2R FFT is performed - * \param[out] complex_array Complex array to/from where R2C/C2R FFT is performed - * \param[in] dir direction, either R2C or C2R - */ - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir); + /** \brief Destructor to destroy the FFT plan */ + ~AnyFFT (); - /** \brief Destroy library FFT plan. - * \param[out] fft_plan plan to destroy - */ - void DestroyPlan (FFTplan& fft_plan); + /** \brief Setup function that has to be called before any FFT plan is initialized. */ + static void setup (); - /** \brief Perform FFT with backend library. - * \param[out] fft_plan plan for which the FFT is performed - */ - void Execute (FFTplan& fft_plan); -} + /** \brief Cleanup function that has to be called at the end of the program. */ + static void cleanup (); + +private: + /** Vendor specific data for the FFT */ + VendorPlan* m_plan = nullptr; +}; #endif // ANYFFT_H_ diff --git a/src/fields/fft_poisson_solver/fft/CMakeLists.txt b/src/fields/fft_poisson_solver/fft/CMakeLists.txt index e89ecd84f3..e8ef1c1fd7 100644 --- a/src/fields/fft_poisson_solver/fft/CMakeLists.txt +++ b/src/fields/fft_poisson_solver/fft/CMakeLists.txt @@ -9,20 +9,15 @@ if (HiPACE_COMPUTE STREQUAL CUDA) target_sources(HiPACE PRIVATE WrapCuFFT.cpp - WrapCuDST.cpp - CuFFTUtils.cpp ) elseif(HiPACE_COMPUTE STREQUAL HIP) target_sources(HiPACE PRIVATE WrapRocFFT.cpp - WrapRocDST.cpp - RocFFTUtils.cpp ) else() target_sources(HiPACE PRIVATE WrapFFTW.cpp - WrapDSTW.cpp ) endif() diff --git a/src/fields/fft_poisson_solver/fft/CuFFTUtils.H b/src/fields/fft_poisson_solver/fft/CuFFTUtils.H deleted file mode 100644 index 01daf5c2b9..0000000000 --- a/src/fields/fft_poisson_solver/fft/CuFFTUtils.H +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2020-2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl, MaxThevenet - * License: BSD-3-Clause-LBNL - */ -#ifndef CUFFTUTILS_H_ -#define CUFFTUTILS_H_ - -#include - -#include - - -namespace CuFFTUtils -{ - /** \brief This method converts a cufftResult - * into the corresponding string - * - * @param[in] err a cufftResult - * @return an std::string - */ - std::string cufftErrorToString (const cufftResult& err); -} - -#endif // CUFFTUTILS_H_ diff --git a/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp b/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp deleted file mode 100644 index 090dcf2b9c..0000000000 --- a/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2020 - * - * This file is part of HiPACE++. - * - * Authors: MaxThevenet - * License: BSD-3-Clause-LBNL - */ -#include "CuFFTUtils.H" - -#include - -namespace CuFFTUtils -{ - std::string cufftErrorToString (const cufftResult& err) - { - const auto res2string = std::map{ - {CUFFT_SUCCESS, "CUFFT_SUCCESS"}, - {CUFFT_INVALID_PLAN,"CUFFT_INVALID_PLAN"}, - {CUFFT_ALLOC_FAILED,"CUFFT_ALLOC_FAILED"}, - {CUFFT_INVALID_TYPE,"CUFFT_INVALID_TYPE"}, - {CUFFT_INVALID_VALUE,"CUFFT_INVALID_VALUE"}, - {CUFFT_INTERNAL_ERROR,"CUFFT_INTERNAL_ERROR"}, - {CUFFT_EXEC_FAILED,"CUFFT_EXEC_FAILED"}, - {CUFFT_SETUP_FAILED,"CUFFT_SETUP_FAILED"}, - {CUFFT_INVALID_SIZE,"CUFFT_INVALID_SIZE"}, - {CUFFT_UNALIGNED_DATA,"CUFFT_UNALIGNED_DATA"}}; - - const auto it = res2string.find(err); - if(it != res2string.end()){ - return it->second; - } - else{ - return std::to_string(err) + - " (unknown error code)"; - } - } -} diff --git a/src/fields/fft_poisson_solver/fft/RocFFTUtils.H b/src/fields/fft_poisson_solver/fft/RocFFTUtils.H deleted file mode 100644 index 0180b0a97f..0000000000 --- a/src/fields/fft_poisson_solver/fft/RocFFTUtils.H +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright 2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl - * License: BSD-3-Clause-LBNL - */ -#ifndef ROCFFTUTILS_H_ -#define ROCFFTUTILS_H_ - -#if __has_include() // ROCm 5.3+ -# include -#else -# include -#endif - -#include - - -namespace RocFFTUtils -{ - void assert_rocfft_status (std::string const& name, rocfft_status status); - - /** \brief This method converts a cufftResult - * into the corresponding string - * - * @param[in] err a cufftResult - * @return an std::string - */ - std::string rocfftErrorToString (const rocfft_status err); -} - -#endif // ROCFFTUTILS_H_ diff --git a/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp b/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp deleted file mode 100644 index 5c6f3d68ed..0000000000 --- a/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2021 - * - * This file is part of HiPACE++. - * - * Authors: Axel Huebl - * License: BSD-3-Clause-LBNL - */ -#include "RocFFTUtils.H" -#include - -#include -#include - -namespace RocFFTUtils -{ - void assert_rocfft_status (std::string const& name, rocfft_status status) - { - if (status != rocfft_status_success) { - amrex::Abort(name + " failed! Error: " + rocfftErrorToString(status)); - } - } - - std::string rocfftErrorToString (const rocfft_status err) - { - if (err == rocfft_status_success) { - return std::string("rocfft_status_success"); - } else if (err == rocfft_status_failure) { - return std::string("rocfft_status_failure"); - } else if (err == rocfft_status_invalid_arg_value) { - return std::string("rocfft_status_invalid_arg_value"); - } else if (err == rocfft_status_invalid_dimensions) { - return std::string("rocfft_status_invalid_dimensions"); - } else if (err == rocfft_status_invalid_array_type) { - return std::string("rocfft_status_invalid_array_type"); - } else if (err == rocfft_status_invalid_strides) { - return std::string("rocfft_status_invalid_strides"); - } else if (err == rocfft_status_invalid_distance) { - return std::string("rocfft_status_invalid_distance"); - } else if (err == rocfft_status_invalid_offset) { - return std::string("rocfft_status_invalid_offset"); - } else { - return std::to_string(err) + " (unknown error code)"; - } - } -} diff --git a/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp b/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp deleted file mode 100644 index 571e402010..0000000000 --- a/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp +++ /dev/null @@ -1,429 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs - * License: BSD-3-Clause-LBNL - */ -#include "AnyDST.H" -#include "CuFFTUtils.H" -#include "utils/GPUUtil.H" -#include "utils/HipaceProfilerWrapper.H" -#include "utils/Parser.H" - -#include - -namespace AnyDST -{ -#ifdef AMREX_USE_FLOAT - cufftType VendorR2C = CUFFT_R2C; - cufftType VendorC2R = CUFFT_C2R; -#else - cufftType VendorR2C = CUFFT_D2Z; - cufftType VendorC2R = CUFFT_Z2D; -#endif - - /** \brief Extend src into a symmetrized larger array dst - * - * \param[in,out] dst destination array, odd symmetry around 0 and the middle points in x and y - * \param[in] src source array - */ - void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = src.box(); - const int nx = bx.length(0); - const int ny = bx.length(1); - const amrex::IntVect lo = bx.smallEnd(); - Array2 const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i+1,j+1) = src_array(i, j); - /* lower left quadrant */ - dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]); - /* upper right quadrant */ - dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j); - /* lower right quadrant */ - dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]); - } - ); - }; - - /** \brief Extract symmetrical src array into smaller array dst - * - * \param[in,out] dst destination array - * \param[in] src destination array, symmetric in x and y - */ - void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab>& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = dst.box(); - Array2 const> const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i,j) = -src_array(i+1, j+1).real(); - } - ); - }; - - /** \brief Make Complex array out of Real array to prepare for fft. - * out[idx] = -in[2*idx-2] + in[2*idx] + i*in[2*idx-1] for each column with - * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1] - * - * \param[in] in input real array - * \param[out] out output complex array - * \param[in] n_data number of (contiguous) rows in position matrix - * \param[in] n_batch number of (strided) columns in position matrix - */ - void ToComplex (const amrex::Real* const AMREX_RESTRICT in, - amrex::GpuComplex* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - const int n_half = (n_data+1)/2; - if((n_data%2 == 1)) { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); - const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] - *!i_is_zero*!i_is_n_half; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } else { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; - const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } - }; - - /** \brief Complex to Real fft for every column of the input matrix. - * The output Matrix has its indexes reversed compared to some other libraries - * - * \param[in] plan cuda fft plan for transformation - * \param[in] in input complex array - * \param[out] out output real array - */ - void C2Rfft (AnyFFT::VendorFFTPlan& plan, amrex::GpuComplex* AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out) - { - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream(plan, stream); - cufftResult result; - -#ifdef AMREX_USE_FLOAT - result = cufftExecC2R(plan, reinterpret_cast(in), out); -#else - result = cufftExecZ2D(plan, reinterpret_cast(in), out); -#endif - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - }; - - /** \brief Make Sine-space Real array out of array from fft. - * out[idx] = 0.5 *(in[n_data-idx] - in[idx+1] + (in[n_data-idx] + in[idx+1])/ - * (2*sin((idx+1)*pi/(n_data+1)))) for each column - * - * \param[in] in input real array - * \param[out] out output real array - * \param[in] n_data number of (contiguous) rows in position matrix - * \param[in] n_batch number of (strided) columns in position matrix - */ - void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - using namespace amrex::literals; - - const amrex::Real n_1_real = n_data+1._rt; - const int n_1 = n_data+1; - amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int i_rev = n_1-i; - const int stride_in = n_1*j; - const int stride_out = n_data*j; - const amrex::Real in_a = in[i+stride_in]; - const amrex::Real in_b = in[i_rev+stride_in]; -#ifdef AMREX_USE_FLOAT - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpif(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpif(i_rev/n_1_real))); -#else - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpi(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpi(i_rev/n_1_real))); -#endif - }); - }; - - /** \brief Transpose input matrix - * out[idy][idx] = in[idx][idy] - * - * \param[in] in input real array - * \param[out] out output real array - * \param[in] n_data number of (contiguous) rows in input matrix - * \param[in] n_batch number of (strided) columns in input matrix - */ - void Transpose (const amrex::Real* const AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - constexpr int tile_dim = 32; //must be power of 2 - constexpr int block_rows = 8; - const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; - const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; - amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, - tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE() noexcept - { - amrex::Gpu::SharedMemory gsm; - amrex::Real* const tile = gsm.dataPtr(); - - const int thread_x = threadIdx.x&(tile_dim-1); - const int thread_y = threadIdx.x/tile_dim; - const int block_y = blockIdx.x/num_blocks_x; - const int block_x = blockIdx.x - block_y*num_blocks_x; - int mat_x = block_x * tile_dim + thread_x; - int mat_y = block_y * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_data && (mat_y+i) < n_batch) { - tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; - } - } - - __syncthreads(); - - mat_x = block_y * tile_dim + thread_x; - mat_y = block_x * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_batch && (mat_y+i) < n_data) { - out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; - } - } - }); - }; - - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array) - { - HIPACE_PROFILE("AnyDST::CreatePlan()"); - DSTplan dst_plan; - - amrex::ParmParse pp("hipace"); - dst_plan.use_small_dst = (std::max(real_size[0], real_size[1]) >= 511); - queryWithParser(pp, "use_small_dst", dst_plan.use_small_dst); - - if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) { - AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024), - "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA " - "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes."); - } - - if(!dst_plan.use_small_dst) { - const int nx = real_size[0]; - const int ny = real_size[1]; - - // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2) - // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2) - amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}}; - amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}}; - // shift box to match rest of fields - expanded_position_box += position_array->box().smallEnd(); - expanded_fourier_box += fourier_array->box().smallEnd(); - dst_plan.m_expanded_position_array = - std::make_unique( - expanded_position_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - expanded_fourier_box, 1); - - // setting the initial values to 0 - // we don't set the expanded Fourier array, because it will be initialized by the FFT - dst_plan.m_expanded_position_array->setVal(0., - dst_plan.m_expanded_position_array->box(), 0, - dst_plan.m_expanded_position_array->nComp()); - - const amrex::IntVect& expanded_size = expanded_position_box.length(); - - // Initialize fft_plan.m_plan with the vendor fft plan. - cufftResult result; - result = cufftPlan2d( - &(dst_plan.m_plan), expanded_size[1], expanded_size[0], VendorR2C); - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - - std::size_t buffersize = 0; - cufftGetSize(dst_plan.m_plan, &buffersize); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using R2C cuFFT of size " << expanded_size[0] << " * " - << expanded_size[1] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - else { - const int nx = real_size[0]; // contiguous - const int ny = real_size[1]; // not contiguous - - // Allocate 1d Array for 2d data or 2d transpose data - const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); - const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx); - amrex::Box real_box {{0, 0, 0}, {real_1d_size-1, 0, 0}}; - amrex::Box complex_box {{0, 0, 0}, {complex_1d_size-1, 0, 0}}; - dst_plan.m_expanded_position_array = - std::make_unique( - real_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - complex_box, 1); - - // Initialize fft_plan.m_plan with the vendor fft plan. - int s_1 = nx+1; - cufftResult result; - result = cufftPlanMany( - &(dst_plan.m_plan), 1, &s_1, NULL, 1, (nx+1)/2+1, NULL, 1, nx+1, VendorC2R, ny); - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - - // Initialize transposed fft_plan.m_plan_b with the vendor fft plan. - int s_2 = ny+1; - cufftResult resultb; - resultb = cufftPlanMany( - &(dst_plan.m_plan_b), 1, &s_2, NULL, 1, (ny+1)/2+1, NULL, 1, ny+1, VendorC2R, nx); - - if ( resultb != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(resultb) << "\n"; - } - - std::size_t buffersize = 0; - std::size_t buffersize_b = 0; - cufftGetSize(dst_plan.m_plan, &buffersize); - cufftGetSize(dst_plan.m_plan_b, &buffersize_b); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using C2R cuFFT of sizes " << s_1 << " and " - << s_2 << " with " << (buffersize+buffersize_b+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - } - - void DestroyPlan (DSTplan& dst_plan) - { - cufftDestroy( dst_plan.m_plan ); - cufftDestroy( dst_plan.m_plan_b ); - } - - void Execute (DSTplan& dst_plan, direction d){ - HIPACE_PROFILE("AnyDST::Execute()"); - - if(!dst_plan.use_small_dst) { - // Swap position and fourier space based on execute direction - amrex::FArrayBox* position_array = - (d == direction::forward) ? dst_plan.m_position_array : dst_plan.m_fourier_array; - amrex::FArrayBox* fourier_array = - (d == direction::forward) ? dst_plan.m_fourier_array : dst_plan.m_position_array; - - // Expand in position space m_position_array -> m_expanded_position_array - ExpandR2R(*dst_plan.m_expanded_position_array, *position_array); - - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( dst_plan.m_plan, stream); - cufftResult result; - - // R2C FFT m_expanded_position_array -> m_expanded_fourier_array -#ifdef AMREX_USE_FLOAT - result = cufftExecR2C( - dst_plan.m_plan, dst_plan.m_expanded_position_array->dataPtr(), - reinterpret_cast(dst_plan.m_expanded_fourier_array->dataPtr())); -#else - result = cufftExecD2Z( - dst_plan.m_plan, dst_plan.m_expanded_position_array->dataPtr(), - reinterpret_cast(dst_plan.m_expanded_fourier_array->dataPtr())); -#endif - // Shrink in Fourier space m_expanded_fourier_array -> m_fourier_array - ShrinkC2R(*fourier_array, *dst_plan.m_expanded_fourier_array); - - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - } - else { - const int nx = dst_plan.m_position_array->box().length(0); // initially contiguous - const int ny = dst_plan.m_position_array->box().length(1); // contiguous after transpose - - amrex::Real* const tmp_pos_arr = dst_plan.m_position_array->dataPtr(); - amrex::Real* const tmp_fourier_arr = dst_plan.m_fourier_array->dataPtr(); - amrex::GpuComplex* comp_arr = dst_plan.m_expanded_fourier_array->dataPtr(); - amrex::Real* const real_arr = dst_plan.m_expanded_position_array->dataPtr(); - - // Swap position and fourier space based on execute direction - amrex::Real* const pos_arr = - (d == direction::forward) ? tmp_pos_arr : tmp_fourier_arr; - amrex::Real* const fourier_arr = - (d == direction::forward) ? tmp_fourier_arr : tmp_pos_arr; - - ToComplex(pos_arr, comp_arr, nx, ny); - - C2Rfft(dst_plan.m_plan, comp_arr, real_arr); - - ToSine(real_arr, pos_arr, nx, ny); - - Transpose(pos_arr, fourier_arr, nx, ny); - - ToComplex(fourier_arr, comp_arr, ny, nx); - - C2Rfft(dst_plan.m_plan_b, comp_arr, real_arr); - - ToSine(real_arr, pos_arr, ny, nx); - - Transpose(pos_arr, fourier_arr, ny, nx); - } - } -} diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp index 739ab2cdcb..84471c665c 100644 --- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp @@ -1,8 +1,8 @@ -/* Copyright 2020-2022 +/* Copyright 2020-2024 * * This file is part of HiPACE++. * - * Authors: MaxThevenet, Remi Lehe, Severin Diederichs + * Authors: AlexanderSinn, MaxThevenet, Remi Lehe, Severin Diederichs * License: BSD-3-Clause-LBNL */ /* Copyright 2019-2020 @@ -12,84 +12,239 @@ * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" -#include "CuFFTUtils.H" -#include "utils/HipaceProfilerWrapper.H" -namespace AnyFFT -{ +#include +#include + +#include + +#include +#include #ifdef AMREX_USE_FLOAT - cufftType VendorR2C = CUFFT_R2C; - cufftType VendorC2R = CUFFT_C2R; +static constexpr bool use_float = true; #else - cufftType VendorR2C = CUFFT_D2Z; - cufftType VendorC2R = CUFFT_Z2D; +static constexpr bool use_float = false; #endif - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - FFTplan fft_plan; +struct VendorPlan { + cufftHandle m_cufftplan; + FFTType m_type; + void* m_in; + void* m_out; +}; - if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) { - AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024), - "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA " - "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes."); - } +std::string cufftErrorToString (const cufftResult& err) { + const auto res2string = std::map{ + {CUFFT_SUCCESS, "CUFFT_SUCCESS"}, + {CUFFT_INVALID_PLAN,"CUFFT_INVALID_PLAN"}, + {CUFFT_ALLOC_FAILED,"CUFFT_ALLOC_FAILED"}, + {CUFFT_INVALID_TYPE,"CUFFT_INVALID_TYPE"}, + {CUFFT_INVALID_VALUE,"CUFFT_INVALID_VALUE"}, + {CUFFT_INTERNAL_ERROR,"CUFFT_INTERNAL_ERROR"}, + {CUFFT_EXEC_FAILED,"CUFFT_EXEC_FAILED"}, + {CUFFT_SETUP_FAILED,"CUFFT_SETUP_FAILED"}, + {CUFFT_INVALID_SIZE,"CUFFT_INVALID_SIZE"}, + {CUFFT_UNALIGNED_DATA,"CUFFT_UNALIGNED_DATA"} + }; - // Initialize fft_plan.m_plan with the vendor fft plan. - cufftResult result; - if (dir == direction::R2C){ - result = cufftPlan2d( - &(fft_plan.m_plan), real_size[1], real_size[0], VendorR2C); - } else { - result = cufftPlan2d( - &(fft_plan.m_plan), real_size[1], real_size[0], VendorC2R); - } + const auto it = res2string.find(err); + if(it != res2string.end()){ + return it->second; + } else { + return std::to_string(err) + " (unknown error code)"; + } +} - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } +void assert_cufft_status (std::string const& name, const cufftResult& status) { + if (status != CUFFT_SUCCESS) { + amrex::Abort(name + " failed! Error: " + cufftErrorToString(status)); + } +} - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; +std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { + // https://docs.nvidia.com/cuda/cufft/index.html#cufft-api-reference + m_plan = new VendorPlan; - return fft_plan; - } + m_plan->m_type = type; + cufftType transform_type; + int rank = 0; + // n is in C order + long long int n[2] = {0, 0}; + long long int batch = 0; - void DestroyPlan (FFTplan& fft_plan) - { - cufftDestroy( fft_plan.m_plan ); + switch (type) { + case FFTType::C2C_2D_fwd: + transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; + rank = 2; + n[0] = ny; + n[1] = nx; + batch = 1; + break; + case FFTType::C2C_2D_bkw: + transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z; + rank = 2; + n[0] = ny; + n[1] = nx; + batch = 1; + break; + case FFTType::C2R_2D: + transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D; + rank = 2; + n[0] = ny; + n[1] = nx; + batch = 1; + break; + case FFTType::R2C_2D: + transform_type = use_float ? CUFFT_R2C : CUFFT_D2Z; + rank = 2; + n[0] = ny; + n[1] = nx; + batch = 1; + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by cufft"); + return 0; + case FFTType::C2R_1D_batched: + transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D; + rank = 1; + n[0] = nx; + batch = ny; + break; } - void Execute (FFTplan& fft_plan){ - HIPACE_PROFILE("AnyFFT::Execute()"); - // make sure that this is done on the same GPU stream as the above copy - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( fft_plan.m_plan, stream); - cufftResult result; - if (fft_plan.m_dir == direction::R2C){ -#ifdef AMREX_USE_FLOAT - result = cufftExecR2C(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); -#else - result = cufftExecD2Z(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array); -#endif - } else if (fft_plan.m_dir == direction::C2R){ -#ifdef AMREX_USE_FLOAT - result = cufftExecC2R(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); -#else - result = cufftExecZ2D(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array); -#endif - } else { - amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); + cufftResult result; + + result = cufftCreate(&(m_plan->m_cufftplan)); + assert_cufft_status("cufftCreate", result); + + result = cufftSetAutoAllocation(m_plan->m_cufftplan, 0); + assert_cufft_status("cufftSetAutoAllocation", result); + + std::size_t workSize = 0; + + result = cufftMakePlanMany64( + m_plan->m_cufftplan, + rank, + n, + nullptr, 0, 0, + nullptr, 0, 0, + transform_type, + batch, + &workSize); + assert_cufft_status("cufftMakePlanMany64", result); + + result = cufftSetStream(m_plan->m_cufftplan, amrex::Gpu::Device::cudaStream()); + assert_cufft_status("cufftSetStream", result); + + return workSize; +} + +void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { + m_plan->m_in = in; + m_plan->m_out = out; + + cufftResult result; + + result = cufftSetWorkArea(m_plan->m_cufftplan, work_area); + assert_cufft_status("cufftSetWorkArea", result); +} + +void AnyFFT::Execute () { + cufftResult result; + + // There is also cufftXtExec that could replace all of these specific Exec calls, + // however in testing it doesn't work + if constexpr (use_float) { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + result = cufftExecC2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_FORWARD); + assert_cufft_status("cufftExecC2C", result); + break; + case FFTType::C2C_2D_bkw: + result = cufftExecC2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_INVERSE); + assert_cufft_status("cufftExecC2C", result); + break; + case FFTType::C2R_2D: + result = cufftExecC2R(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecC2R", result); + break; + case FFTType::R2C_2D: + result = cufftExecR2C(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecR2C", result); + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by cufft"); + break; + case FFTType::C2R_1D_batched: + result = cufftExecC2R(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecC2R", result); + break; } - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; + } else { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + result = cufftExecZ2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_FORWARD); + assert_cufft_status("cufftExecZ2Z", result); + break; + case FFTType::C2C_2D_bkw: + result = cufftExecZ2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out), + CUFFT_INVERSE); + assert_cufft_status("cufftExecZ2Z", result); + break; + case FFTType::C2R_2D: + result = cufftExecZ2D(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecZ2D", result); + break; + case FFTType::R2C_2D: + result = cufftExecD2Z(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecD2Z", result); + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by cufft"); + break; + case FFTType::C2R_1D_batched: + result = cufftExecZ2D(m_plan->m_cufftplan, + reinterpret_cast(m_plan->m_in), + reinterpret_cast(m_plan->m_out)); + assert_cufft_status("cufftExecZ2D", result); + break; } } } + +AnyFFT::~AnyFFT () { + if (m_plan) { + cufftResult result; + + result = cufftDestroy(m_plan->m_cufftplan); + assert_cufft_status("cufftDestroy", result); + + delete m_plan; + } +} + +void AnyFFT::setup () {} + +void AnyFFT::cleanup () {} diff --git a/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp b/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp deleted file mode 100644 index ca14a5e358..0000000000 --- a/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright 2020-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs - * License: BSD-3-Clause-LBNL - */ -#include "AnyDST.H" -#include "utils/HipaceProfilerWrapper.H" - -#ifdef AMREX_USE_OMP -# include -#endif - -namespace AnyDST -{ -#ifdef AMREX_USE_FLOAT - const auto VendorCreatePlanR2R2D = fftwf_plan_r2r_2d; -#else - const auto VendorCreatePlanR2R2D = fftw_plan_r2r_2d; -#endif - - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array) - { - DSTplan dst_plan; - const int nx = real_size[0]; - const int ny = real_size[1]; - -#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP) - if (nx > 32 && ny > 32) { -# ifdef AMREX_USE_FLOAT - fftwf_init_threads(); - fftwf_plan_with_nthreads(omp_get_max_threads()); -# else - fftw_init_threads(); - fftw_plan_with_nthreads(omp_get_max_threads()); -# endif - } -#endif - - // Initialize fft_plan.m_plan with the vendor fft plan. - // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order - dst_plan.m_plan = VendorCreatePlanR2R2D( - ny, nx, position_array->dataPtr(), fourier_array->dataPtr(), - FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); - - // Initialize fft_plan.m_plan_b with the vendor fft plan. - // Swap arrays: now for backward direction. - dst_plan.m_plan_b = VendorCreatePlanR2R2D( - ny, nx, fourier_array->dataPtr(), position_array->dataPtr(), - FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); - - amrex::Print() << "using R2R FFTW of size " << nx << " * " << ny << "\n"; - - // Store meta-data in fft_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - - void DestroyPlan (DSTplan& dst_plan) - { -# ifdef AMREX_USE_FLOAT - fftwf_destroy_plan( dst_plan.m_plan ); - fftwf_destroy_plan( dst_plan.m_plan_b ); -# else - fftw_destroy_plan( dst_plan.m_plan ); - fftw_destroy_plan( dst_plan.m_plan_b ); -# endif - } - - void Execute (DSTplan& dst_plan, direction d){ - HIPACE_PROFILE("AnyDST::Execute()"); - // Swap position and fourier space based on execute direction - AnyFFT::VendorFFTPlan& plan = (d==direction::forward) ? dst_plan.m_plan : dst_plan.m_plan_b; -# ifdef AMREX_USE_FLOAT - fftwf_execute( plan ); -# else - fftw_execute( plan ); -# endif - } -} diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp index 8a6565dbed..e3dc010111 100644 --- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp @@ -1,8 +1,8 @@ -/* Copyright 2020 +/* Copyright 2020-2024 * * This file is part of HiPACE++. * - * Authors: MaxThevenet, Remi Lehe + * Authors: AlexanderSinn, MaxThevenet, Remi Lehe * License: BSD-3-Clause-LBNL */ /* Copyright 2019-2020 @@ -12,61 +12,168 @@ * License: BSD-3-Clause-LBNL */ #include "AnyFFT.H" -#include "utils/HipaceProfilerWrapper.H" -namespace AnyFFT -{ +#include + +#ifdef AMREX_USE_OMP +#include +#endif + +#include + #ifdef AMREX_USE_FLOAT - const auto VendorCreatePlanR2C3D = fftwf_plan_dft_r2c_3d; - const auto VendorCreatePlanC2R3D = fftwf_plan_dft_c2r_3d; - const auto VendorCreatePlanR2C2D = fftwf_plan_dft_r2c_2d; - const auto VendorCreatePlanC2R2D = fftwf_plan_dft_c2r_2d; +static constexpr bool use_float = true; #else - const auto VendorCreatePlanR2C3D = fftw_plan_dft_r2c_3d; - const auto VendorCreatePlanC2R3D = fftw_plan_dft_c2r_3d; - const auto VendorCreatePlanR2C2D = fftw_plan_dft_r2c_2d; - const auto VendorCreatePlanC2R2D = fftw_plan_dft_c2r_2d; +static constexpr bool use_float = false; #endif - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - FFTplan fft_plan; +struct VendorPlan { + fftwf_plan m_fftwf_plan; + fftw_plan m_fftw_plan; + FFTType m_type; + int m_nx; + int m_ny; +}; + +std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { + // https://www.fftw.org/fftw3_doc/FFTW-Reference.html + m_plan = new VendorPlan; - // Initialize fft_plan.m_plan with the vendor fft plan. - // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order - if (dir == direction::R2C){ - fft_plan.m_plan = VendorCreatePlanR2C2D( - real_size[1], real_size[0], real_array, complex_array, FFTW_ESTIMATE); - } else if (dir == direction::C2R){ - fft_plan.m_plan = VendorCreatePlanC2R2D( - real_size[1], real_size[0], complex_array, real_array, FFTW_ESTIMATE); + m_plan->m_type = type; + m_plan->m_nx = nx; + m_plan->m_ny = ny; + // fftw doesn't allow for the manual allocation of work area, additionally the input and output + // arrays have to be provided when planing, so we do all the work in the SetBuffers function. + return 0; +} + +void AnyFFT::SetBuffers (void* in, void* out, [[maybe_unused]] void* work_area) { + if constexpr (use_float) { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + m_plan->m_fftwf_plan = fftwf_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_FORWARD, FFTW_MEASURE); + break; + case FFTType::C2C_2D_bkw: + m_plan->m_fftwf_plan = fftwf_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_BACKWARD, FFTW_MEASURE); + break; + case FFTType::C2R_2D: + m_plan->m_fftwf_plan = fftwf_plan_dft_c2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); + break; + case FFTType::R2C_2D: + m_plan->m_fftwf_plan = fftwf_plan_dft_r2c_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); + break; + case FFTType::R2R_2D: + m_plan->m_fftwf_plan = fftwf_plan_r2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); + break; + case FFTType::C2R_1D_batched: + { + int n[1] = {m_plan->m_nx}; + m_plan->m_fftwf_plan = fftwf_plan_many_dft_c2r( + 1, n, m_plan->m_ny, + reinterpret_cast(in), nullptr, 1, m_plan->m_nx/2+1, + reinterpret_cast(out), nullptr, 1, m_plan->m_nx, + FFTW_MEASURE); + } + break; } + } else { + switch (m_plan->m_type) { + case FFTType::C2C_2D_fwd: + m_plan->m_fftw_plan = fftw_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_FORWARD, FFTW_MEASURE); + break; + case FFTType::C2C_2D_bkw: + m_plan->m_fftw_plan = fftw_plan_dft_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_BACKWARD, FFTW_MEASURE); + break; + case FFTType::C2R_2D: + m_plan->m_fftw_plan = fftw_plan_dft_c2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); + break; + case FFTType::R2C_2D: + m_plan->m_fftw_plan = fftw_plan_dft_r2c_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_MEASURE); + break; + case FFTType::R2R_2D: + m_plan->m_fftw_plan = fftw_plan_r2r_2d( + m_plan->m_ny, m_plan->m_nx, + reinterpret_cast(in), reinterpret_cast(out), + FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE); + break; + case FFTType::C2R_1D_batched: + { + int n[1] = {m_plan->m_nx}; + m_plan->m_fftw_plan = fftw_plan_many_dft_c2r( + 1, n, m_plan->m_ny, + reinterpret_cast(in), nullptr, 1, m_plan->m_nx/2+1, + reinterpret_cast(out), nullptr, 1, m_plan->m_nx, + FFTW_MEASURE); + } + break; + } + } +} - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; +void AnyFFT::Execute () { + if constexpr (use_float) { + fftwf_execute(m_plan->m_fftwf_plan); + } else { + fftw_execute(m_plan->m_fftw_plan); + } +} - return fft_plan; +AnyFFT::~AnyFFT () { + if (m_plan) { + if constexpr (use_float) { + fftwf_destroy_plan(m_plan->m_fftwf_plan); + } else { + fftw_destroy_plan(m_plan->m_fftw_plan); + } + delete m_plan; } +} - void DestroyPlan (FFTplan& fft_plan) - { -# ifdef AMREX_USE_FLOAT - fftwf_destroy_plan( fft_plan.m_plan ); -# else - fftw_destroy_plan( fft_plan.m_plan ); -# endif +void AnyFFT::setup () { +#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP) + if constexpr (use_float) { + fftwf_init_threads(); + fftwf_plan_with_nthreads(omp_get_max_threads()); + } else { + fftw_init_threads(); + fftw_plan_with_nthreads(omp_get_max_threads()); } +#endif +} - void Execute (FFTplan& fft_plan){ - HIPACE_PROFILE("Execute_FFTplan()"); -# ifdef AMREX_USE_FLOAT - fftwf_execute( fft_plan.m_plan ); -# else - fftw_execute( fft_plan.m_plan ); -# endif +void AnyFFT::cleanup () { +#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP) + if constexpr (use_float) { + fftwf_cleanup_threads(); + } else { + fftw_cleanup_threads(); } +#endif } diff --git a/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp b/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp deleted file mode 100644 index ae96334494..0000000000 --- a/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/* Copyright 2021-2022 - * - * This file is part of HiPACE++. - * - * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs - * - * License: BSD-3-Clause-LBNL - */ -#include "AnyDST.H" -#include "RocFFTUtils.H" -#include "utils/GPUUtil.H" -#include "utils/HipaceProfilerWrapper.H" -#include "utils/Parser.H" - -#include - -namespace AnyDST -{ - // see WrapCuDST for documentation - void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = src.box(); - const int nx = bx.length(0); - const int ny = bx.length(1); - const amrex::IntVect lo = bx.smallEnd(); - Array2 const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i+1,j+1) = src_array(i, j); - /* lower left quadrant */ - dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]); - /* upper right quadrant */ - dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j); - /* lower right quadrant */ - dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]); - } - ); - } - - // see WrapCuDST for documentation - void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab>& src) - { - constexpr int scomp = 0; - constexpr int dcomp = 0; - - const amrex::Box bx = dst.box(); - Array2 const> const src_array = src.const_array(scomp); - Array2 const dst_array = dst.array(dcomp); - amrex::ParallelFor( - bx, - [=] AMREX_GPU_DEVICE(int i, int j, int) - { - /* upper left quadrant */ - dst_array(i,j) = -src_array(i+1, j+1).real(); - } - ); - } - - // see WrapCuDST for documentation - void ToComplex (const amrex::Real* const AMREX_RESTRICT in, - amrex::GpuComplex* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - const int n_half = (n_data+1)/2; - if((n_data%2 == 1)) { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); - const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] - *!i_is_zero*!i_is_n_half; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } else { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; - const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); - } - } - - // see WrapCuDST for documentation - void C2Rfft (AnyFFT::VendorFFTPlan& plan, amrex::GpuComplex* AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, rocfft_execution_info execinfo) - { - rocfft_status result; - - void* in_arr[2] = {in, nullptr}; - void* out_arr[2] = {out, nullptr}; - result = rocfft_execute(plan, in_arr, out_arr, execinfo); - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - } - - // see WrapCuDST for documentation - void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - using namespace amrex::literals; - - const amrex::Real n_1_real = n_data+1._rt; - const int n_1 = n_data+1; - amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int i_rev = n_1-i; - const int stride_in = n_1*j; - const int stride_out = n_data*j; - const amrex::Real in_a = in[i+stride_in]; - const amrex::Real in_b = in[i_rev+stride_in]; -#ifdef AMREX_USE_FLOAT - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpif(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpif(i_rev/n_1_real))); -#else - out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpi(i/n_1_real))); - out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt - *sinpi(i_rev/n_1_real))); -#endif - }); - } - - // see WrapCuDST for documentation - void Transpose (const amrex::Real* const AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) - { - constexpr int tile_dim = 32; //must be power of 2 - constexpr int block_rows = 8; - const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; - const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; - amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, - tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE() noexcept - { - amrex::Gpu::SharedMemory gsm; - amrex::Real* const tile = gsm.dataPtr(); - - const int thread_x = threadIdx.x&(tile_dim-1); - const int thread_y = threadIdx.x/tile_dim; - const int block_y = blockIdx.x/num_blocks_x; - const int block_x = blockIdx.x - block_y*num_blocks_x; - int mat_x = block_x * tile_dim + thread_x; - int mat_y = block_y * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_data && (mat_y+i) < n_batch) { - tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; - } - } - - __syncthreads(); - - mat_x = block_y * tile_dim + thread_x; - mat_y = block_x * tile_dim + thread_y; - - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_batch && (mat_y+i) < n_data) { - out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; - } - } - }); - } - - DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array, - amrex::FArrayBox* fourier_array) - { - HIPACE_PROFILE("AnyDST::CreatePlan()"); - DSTplan dst_plan; - - amrex::ParmParse pp("hipace"); - dst_plan.use_small_dst = (std::max(real_size[0], real_size[1]) >= 511); - queryWithParser(pp, "use_small_dst", dst_plan.use_small_dst); - - if(!dst_plan.use_small_dst) { - const int nx = real_size[0]; - const int ny = real_size[1]; - int dim = 2; - - // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2) - // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2) - amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}}; - amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}}; - // shift box to match rest of fields - expanded_position_box += position_array->box().smallEnd(); - expanded_fourier_box += fourier_array->box().smallEnd(); - dst_plan.m_expanded_position_array = - std::make_unique( - expanded_position_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - expanded_fourier_box, 1); - - // setting the initial values to 0 - // we don't set the expanded Fourier array, because it will be initialized by the FFT - dst_plan.m_expanded_position_array->setVal(0., - dst_plan.m_expanded_position_array->box(), 0, - dst_plan.m_expanded_position_array->nComp()); - - // check for type of expanded size, should be const size_t * - const amrex::IntVect& expanded_size = expanded_position_box.length(); - const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(expanded_size[0]), - std::size_t(expanded_size[1]), - std::size_t(expanded_size[2]))}; - - #ifdef AMREX_USE_FLOAT - rocfft_precision precision = rocfft_precision_single; - #else - rocfft_precision precision = rocfft_precision_double; - #endif - - // Initialize fft_plan.m_plan with the vendor fft plan. - rocfft_status result; - result = rocfft_plan_create(&(dst_plan.m_plan), - rocfft_placement_notinplace, - rocfft_transform_type_real_forward, - precision, - dim, - lengths, - 1, - nullptr); - - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - std::size_t buffersize = 0; - result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan, &buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - result = rocfft_execution_info_create(&(dst_plan.m_execinfo)); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - dst_plan.m_buffer = amrex::The_Arena()->alloc(buffersize); - result = rocfft_execution_info_set_work_buffer(dst_plan.m_execinfo, dst_plan.m_buffer, - buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(dst_plan.m_execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using R2C rocFFT of size " << expanded_size[0] << " * " - << expanded_size[1] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - else { - const int nx = real_size[0]; // contiguous - const int ny = real_size[1]; // not contiguous - - // Allocate 1d Array for 2d data or 2d transpose data - const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); - const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx); - amrex::Box real_box {{0, 0, 0}, {real_1d_size-1, 0, 0}}; - amrex::Box complex_box {{0, 0, 0}, {complex_1d_size-1, 0, 0}}; - dst_plan.m_expanded_position_array = - std::make_unique( - real_box, 1); - dst_plan.m_expanded_fourier_array = - std::make_unique>>( - complex_box, 1); - -#ifdef AMREX_USE_FLOAT - rocfft_precision precision = rocfft_precision_single; -#else - rocfft_precision precision = rocfft_precision_double; -#endif - - rocfft_status result; - - const std::size_t s_1[3] = {std::size_t(nx+1) ,0u ,0u}; - - result = rocfft_plan_create(&(dst_plan.m_plan), - rocfft_placement_notinplace, - rocfft_transform_type_real_inverse, - precision, - 1, - s_1, - ny, - nullptr); - - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - const std::size_t s_2[3] = {std::size_t(ny+1) ,0u ,0u}; - - result = rocfft_plan_create(&(dst_plan.m_plan_b), - rocfft_placement_notinplace, - rocfft_transform_type_real_inverse, - precision, - 1, - s_2, - nx, - nullptr); - - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - std::size_t work_size = 0; - std::size_t work_size_b = 0; - - result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan, &work_size); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan_b, &work_size_b); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - result = rocfft_execution_info_create(&(dst_plan.m_execinfo)); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - std::size_t buffersize = std::max(work_size, work_size_b); - dst_plan.m_buffer = amrex::The_Arena()->alloc(buffersize); - - result = rocfft_execution_info_set_work_buffer(dst_plan.m_execinfo, dst_plan.m_buffer, - buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(dst_plan.m_execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - std::size_t mb = 1024*1024; - - amrex::Print() << "using C2R rocFFT of sizes " << s_1[0] << " and " - << s_2[0] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n"; - - // Store meta-data in dst_plan - dst_plan.m_position_array = position_array; - dst_plan.m_fourier_array = fourier_array; - - return dst_plan; - } - } - - void DestroyPlan (DSTplan& dst_plan) - { - rocfft_plan_destroy( dst_plan.m_plan ); - rocfft_plan_destroy( dst_plan.m_plan_b ); - - amrex::The_Arena()->free(dst_plan.m_buffer); - rocfft_execution_info_destroy(dst_plan.m_execinfo); - } - - void Execute (DSTplan& dst_plan, direction d){ - HIPACE_PROFILE("AnyDST::Execute()"); - - if(!dst_plan.use_small_dst) { - // Swap position and fourier space based on execute direction - amrex::FArrayBox* position_array = - (d == direction::forward) ? dst_plan.m_position_array : dst_plan.m_fourier_array; - amrex::FArrayBox* fourier_array = - (d == direction::forward) ? dst_plan.m_fourier_array : dst_plan.m_position_array; - - // Expand in position space m_position_array -> m_expanded_position_array - ExpandR2R(*dst_plan.m_expanded_position_array, *position_array); - - rocfft_status result; - - // R2C FFT m_expanded_position_array -> m_expanded_fourier_array - void* in[2] = {dst_plan.m_expanded_position_array->dataPtr(), nullptr}; - void* out[2] = {dst_plan.m_expanded_fourier_array->dataPtr(), nullptr}; - result = rocfft_execute(dst_plan.m_plan, in, out, dst_plan.m_execinfo); - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - - // Shrink in Fourier space m_expanded_fourier_array -> m_fourier_array - ShrinkC2R(*fourier_array, *dst_plan.m_expanded_fourier_array); - } - else { - const int nx = dst_plan.m_position_array->box().length(0); // initially contiguous - const int ny = dst_plan.m_position_array->box().length(1); // contiguous after transpose - - amrex::Real* const tmp_pos_arr = dst_plan.m_position_array->dataPtr(); - amrex::Real* const tmp_fourier_arr = dst_plan.m_fourier_array->dataPtr(); - amrex::GpuComplex* comp_arr = dst_plan.m_expanded_fourier_array->dataPtr(); - amrex::Real* const real_arr = dst_plan.m_expanded_position_array->dataPtr(); - - // Swap position and fourier space based on execute direction - amrex::Real* const pos_arr = - (d == direction::forward) ? tmp_pos_arr : tmp_fourier_arr; - amrex::Real* const fourier_arr = - (d == direction::forward) ? tmp_fourier_arr : tmp_pos_arr; - - ToComplex(pos_arr, comp_arr, nx, ny); - - C2Rfft(dst_plan.m_plan, comp_arr, real_arr, dst_plan.m_execinfo); - - ToSine(real_arr, pos_arr, nx, ny); - - Transpose(pos_arr, fourier_arr, nx, ny); - - ToComplex(fourier_arr, comp_arr, ny, nx); - - C2Rfft(dst_plan.m_plan_b, comp_arr, real_arr, dst_plan.m_execinfo); - - ToSine(real_arr, pos_arr, ny, nx); - - Transpose(pos_arr, fourier_arr, ny, nx); - } - } -} diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp index 0ba7a2c7db..18cd7ca0d9 100644 --- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp +++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp @@ -1,8 +1,8 @@ -/* Copyright 2021 +/* Copyright 2021-2024 * * This file is part of HiPACE++. * - * Authors: Axel Huebl + * Authors: AlexanderSinn, Axel Huebl * License: BSD-3-Clause-LBNL */ /* Copyright 2019-2020 @@ -11,92 +11,185 @@ * * License: BSD-3-Clause-LBNL */ - #include "AnyFFT.H" -#include "RocFFTUtils.H" -#include "utils/HipaceProfilerWrapper.H" - -namespace AnyFFT -{ - FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array, - Complex * const complex_array, const direction dir) - { - HIPACE_PROFILE("AnyFFT::CreatePlan()"); - const int dim = 2; - FFTplan fft_plan; - - const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(real_size[0]), - std::size_t(real_size[1]), - std::size_t(real_size[2]))}; - // Initialize fft_plan.m_plan with the vendor fft plan. - rocfft_status result = rocfft_plan_create(&(fft_plan.m_plan), - rocfft_placement_notinplace, - (dir == direction::R2C) - ? rocfft_transform_type_real_forward - : rocfft_transform_type_real_inverse, + +#include +#include + +#if __has_include() // ROCm 5.3+ +#include +#else +#include +#endif + +#include + #ifdef AMREX_USE_FLOAT - rocfft_precision_single, +static constexpr bool use_float = true; #else - rocfft_precision_double, +static constexpr bool use_float = false; #endif - dim, lengths, - 1, // number of transforms, - nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - - // Store meta-data in fft_plan - fft_plan.m_real_array = real_array; - fft_plan.m_complex_array = complex_array; - fft_plan.m_dir = dir; - //fft_plan.m_dim = dim; - - return fft_plan; + +struct VendorPlan { + rocfft_plan m_rocfftplan; + rocfft_execution_info m_execinfo; + std::size_t m_work_size = 0; + void* m_in; + void* m_out; +}; + +std::string rocfftErrorToString (const rocfft_status& err) { + if (err == rocfft_status_success) { + return std::string("rocfft_status_success"); + } else if (err == rocfft_status_failure) { + return std::string("rocfft_status_failure"); + } else if (err == rocfft_status_invalid_arg_value) { + return std::string("rocfft_status_invalid_arg_value"); + } else if (err == rocfft_status_invalid_dimensions) { + return std::string("rocfft_status_invalid_dimensions"); + } else if (err == rocfft_status_invalid_array_type) { + return std::string("rocfft_status_invalid_array_type"); + } else if (err == rocfft_status_invalid_strides) { + return std::string("rocfft_status_invalid_strides"); + } else if (err == rocfft_status_invalid_distance) { + return std::string("rocfft_status_invalid_distance"); + } else if (err == rocfft_status_invalid_offset) { + return std::string("rocfft_status_invalid_offset"); + } else if (err == rocfft_status_invalid_work_buffer) { + return std::string("rocfft_status_invalid_work_buffer"); + }else { + return std::to_string(err) + " (unknown error code)"; + } +} + +void assert_rocfft_status (std::string const& name, const rocfft_status& status) { + if (status != rocfft_status_success) { + amrex::Abort(name + " failed! Error: " + rocfftErrorToString(status)); + } +} + +std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) { + // https://rocm.docs.amd.com/projects/rocFFT/en/latest/reference/allapi.html# + m_plan = new VendorPlan; + + rocfft_transform_type transform_type; + std::size_t dimensions = 0; + // lengths is in FORTRAN order + std::size_t lengths[2] = {0, 0}; + std::size_t number_of_transforms = 0; + + switch (type) { + case FFTType::C2C_2D_fwd: + transform_type = rocfft_transform_type_complex_forward; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::C2C_2D_bkw: + transform_type = rocfft_transform_type_complex_inverse; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::C2R_2D: + transform_type = rocfft_transform_type_real_inverse; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::R2C_2D: + transform_type = rocfft_transform_type_real_forward; + dimensions = 2; + lengths[0] = nx; + lengths[1] = ny; + number_of_transforms = 1; + break; + case FFTType::R2R_2D: + amrex::Abort("R2R FFT not supported by rocfft"); + return 0; + case FFTType::C2R_1D_batched: + transform_type = rocfft_transform_type_real_inverse; + dimensions = 1; + lengths[0] = nx; + number_of_transforms = ny; + break; } - void DestroyPlan (FFTplan& fft_plan) - { - rocfft_plan_destroy( fft_plan.m_plan ); + rocfft_status status; + + status = rocfft_plan_create( + &(m_plan->m_rocfftplan), + rocfft_placement_notinplace, + transform_type, + use_float ? rocfft_precision_single : rocfft_precision_double, + dimensions, + lengths, + number_of_transforms, + nullptr); + assert_rocfft_status("rocfft_plan_create", status); + + status = rocfft_plan_get_work_buffer_size(m_plan->m_rocfftplan, &(m_plan->m_work_size)); + assert_rocfft_status("rocfft_plan_get_work_buffer_size", status); + + return m_plan->m_work_size; +} + +void AnyFFT::SetBuffers (void* in, void* out, void* work_area) { + m_plan->m_in = in; + m_plan->m_out = out; + + rocfft_status status; + + status = rocfft_execution_info_create(&(m_plan->m_execinfo)); + assert_rocfft_status("rocfft_execution_info_create", status); + + if (m_plan->m_work_size > 0) { + status = rocfft_execution_info_set_work_buffer( + m_plan->m_execinfo, work_area, m_plan->m_work_size); + assert_rocfft_status("rocfft_execution_info_set_work_buffer", status); } - void Execute (FFTplan& fft_plan) - { - HIPACE_PROFILE("AnyFFT::Execute()"); - rocfft_execution_info execinfo = NULL; - rocfft_status result = rocfft_execution_info_create(&execinfo); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - - std::size_t buffersize = 0; - result = rocfft_plan_get_work_buffer_size(fft_plan.m_plan, &buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result); - - void* buffer = amrex::The_Arena()->alloc(buffersize); - result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result); - - result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - if (fft_plan.m_dir == direction::R2C) { - result = rocfft_execute(fft_plan.m_plan, - (void**)&(fft_plan.m_real_array), // in - (void**)&(fft_plan.m_complex_array), // out - execinfo); - } else if (fft_plan.m_dir == direction::C2R) { - result = rocfft_execute(fft_plan.m_plan, - (void**)&(fft_plan.m_complex_array), // in - (void**)&(fft_plan.m_real_array), // out - execinfo); - } else { - amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R"); - } - - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - - amrex::Gpu::streamSynchronize(); - - amrex::The_Arena()->free(buffer); - - result = rocfft_execution_info_destroy(execinfo); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); + status = rocfft_execution_info_set_stream(m_plan->m_execinfo, amrex::Gpu::gpuStream()); + assert_rocfft_status("rocfft_execution_info_set_stream", status); +} + +void AnyFFT::Execute () { + rocfft_status status; + + void* in_arr[2] = {m_plan->m_in, nullptr}; + void* out_arr[2] = {m_plan->m_out, nullptr}; + + status = rocfft_execute(m_plan->m_rocfftplan, in_arr, out_arr, m_plan->m_execinfo); + assert_rocfft_status("rocfft_execute", status); +} + +AnyFFT::~AnyFFT () { + if (m_plan) { + rocfft_status status; + + status = rocfft_execution_info_destroy(m_plan->m_execinfo); + assert_rocfft_status("rocfft_execution_info_destroy", status); + + status = rocfft_plan_destroy(m_plan->m_rocfftplan); + assert_rocfft_status("rocfft_plan_destroy", status); + + delete m_plan; } } + +void AnyFFT::setup () { + rocfft_status status; + + status = rocfft_setup(); + assert_rocfft_status("rocfft_setup", status); +} + +void AnyFFT::cleanup () { + rocfft_status status; + + status = rocfft_cleanup(); + assert_rocfft_status("rocfft_cleanup", status); +} diff --git a/src/laser/MultiLaser.H b/src/laser/MultiLaser.H index e0619d331a..e8d516ec1b 100644 --- a/src/laser/MultiLaser.H +++ b/src/laser/MultiLaser.H @@ -13,59 +13,13 @@ #include "Laser.H" #include "fields/Fields.H" #include "mg_solver/HpMultiGrid.H" +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include #include #include #include -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif - -namespace LaserFFT { -#ifdef AMREX_USE_CUDA - using VendorFFT = cufftHandle; - const auto VendorCreate = cufftPlan2d; - const auto VendorDestroy = cufftDestroy; -# ifdef AMREX_USE_FLOAT - const auto VendorExecute = cufftExecC2C; - const auto cufft_type = CUFFT_C2C; - using cufftComplex = cuComplex; -# else - const auto VendorExecute = cufftExecZ2Z; - const auto cufft_type = CUFFT_Z2Z; - using cufftComplex = cuDoubleComplex; -# endif -#elif defined(AMREX_USE_HIP) - using VendorFFT = rocfft_plan; - const auto VendorDestroy = rocfft_plan_destroy; -// TODO -#else -# ifdef AMREX_USE_FLOAT - using VendorFFT = fftwf_plan; - using FFTWComplex = fftwf_complex; - const auto VendorCreate = fftwf_plan_dft_2d; - const auto VendorExecute = fftwf_execute; - const auto VendorDestroy = fftwf_destroy_plan; -# else - using VendorFFT = fftw_plan; - using FFTWComplex = fftw_complex; - const auto VendorCreate = fftw_plan_dft_2d; - const auto VendorExecute = fftw_execute; - const auto VendorDestroy = fftw_destroy_plan; -# endif -#endif -} - /** \brief describes which slice with respect to the currently calculated is used */ namespace WhichLaserSlice { // n stands for the time step, j for the longitudinal slice. @@ -108,15 +62,6 @@ public: ReadParameters(); } - ~MultiLaser () - { - if (!m_use_laser) return; - if (m_solver_type == "fft") { - LaserFFT::VendorDestroy( m_plan_fwd ); - LaserFFT::VendorDestroy( m_plan_bkw ); - } - } - void ReadParameters (); /** get function for the 2D slices */ @@ -268,26 +213,18 @@ private: /** hpmg solver for the envelope solver */ std::unique_ptr m_mg; - // Elements for the FFT-based laser envelope solver - // This could belong to AnyFFT etc., with 2 caveats: - // - This solver operates on a FArrayBox instead of a MultiFab, which is more adequate - // - The array in position space must be Complex rather than real, which takes up quite some - // rewriting, see https://github.com/MaxThevenet/hipace/tree/laser_solve, - // not sure what the best way to proceed is. /** FFTW plan for forward C2C transform to solve Complex Poisson equation */ - LaserFFT::VendorFFT m_plan_fwd; + AnyFFT m_forward_fft; /** FFTW plan for backward C2C transform to solve Complex Poisson equation */ - LaserFFT::VendorFFT m_plan_bkw; + AnyFFT m_backward_fft; + /** work area for both FFT plans */ + amrex::Gpu::DeviceVector m_fft_work_area; /** Complex FAB to store the solution (e.g. laser envelope on current slice) */ SpectralFieldLoc m_sol; /** Complex FAB to store the RHS in position space */ SpectralFieldLoc m_rhs; /** Complex FAB to store the RHS in Fourier space */ SpectralFieldLoc m_rhs_fourier; -#ifdef AMREX_USE_CUDA - cufftResult m_result_fwd; - cufftResult m_result_bkw; -#endif // Data for in-situ diagnostics: /** Number of real laser properties for in-situ per-slice reduced diagnostics. */ diff --git a/src/laser/MultiLaser.cpp b/src/laser/MultiLaser.cpp index 2d2b008673..ba77cbcdc5 100644 --- a/src/laser/MultiLaser.cpp +++ b/src/laser/MultiLaser.cpp @@ -13,11 +13,7 @@ #include "utils/HipaceProfilerWrapper.H" #include "utils/DeprecatedInput.H" #include "utils/InsituUtil.H" -#ifdef AMREX_USE_CUDA -# include "fields/fft_poisson_solver/fft/CuFFTUtils.H" -#elif defined(AMREX_USE_HIP) -# include "fields/fft_poisson_solver/fft/RocFFTUtils.H" -#endif +#include "fields/fft_poisson_solver/fft/AnyFFT.H" #include "particles/particles_utils/ShapeFactors.H" #ifdef HIPACE_USE_OPENPMD # include @@ -25,18 +21,6 @@ #include -#ifdef AMREX_USE_CUDA -# include -#elif defined(AMREX_USE_HIP) -# if __has_include() // ROCm 5.3+ -# include -# else -# include -# endif -#else -# include -#endif - void MultiLaser::ReadParameters () { @@ -111,62 +95,14 @@ MultiLaser::InitData (const amrex::BoxArray& slice_ba, // Create FFT plans amrex::IntVect fft_size = m_slice_box.length(); -#ifdef AMREX_USE_CUDA - cufftResult result; - // Forward FFT plan - result = LaserFFT::VendorCreate( - &(m_plan_fwd), fft_size[1], fft_size[0], LaserFFT::cufft_type); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } - // Backward FFT plan - result = LaserFFT::VendorCreate( - &(m_plan_bkw), fft_size[1], fft_size[0], LaserFFT::cufft_type); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " cufftplan failed! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } -#elif defined(AMREX_USE_HIP) - const std::size_t lengths[] = { - static_cast(fft_size[0]), static_cast(fft_size[1]) - }; - - rocfft_status result; - // Forward FFT plan - result = rocfft_plan_create(&(m_plan_fwd), rocfft_placement_notinplace, - rocfft_transform_type_complex_forward, -#ifdef AMREX_USE_FLOAT - rocfft_precision_single, -#else - rocfft_precision_double, -#endif - 2, lengths, 1, nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); - // Backward FFT plan - result = rocfft_plan_create(&(m_plan_bkw), rocfft_placement_notinplace, - rocfft_transform_type_complex_inverse, -#ifdef AMREX_USE_FLOAT - rocfft_precision_single, -#else - rocfft_precision_double, -#endif - 2, lengths, 1, nullptr); - RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result); -#else - // Forward FFT plan - m_plan_fwd = LaserFFT::VendorCreate( - fft_size[1], fft_size[0], - reinterpret_cast(m_rhs.dataPtr()), - reinterpret_cast(m_rhs_fourier.dataPtr()), - FFTW_FORWARD, FFTW_ESTIMATE); - // Backward FFT plan - m_plan_bkw = LaserFFT::VendorCreate( - fft_size[1], fft_size[0], - reinterpret_cast(m_rhs_fourier.dataPtr()), - reinterpret_cast(m_sol.dataPtr()), - FFTW_BACKWARD, FFTW_ESTIMATE); -#endif + std::size_t fwd_area = m_forward_fft.Initialize(FFTType::C2C_2D_fwd, fft_size[0], fft_size[1]); + std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2C_2D_bkw, fft_size[0], fft_size[1]); + + // Allocate work area for both FFTs + m_fft_work_area.resize(std::max(fwd_area, bkw_area)); + + m_forward_fft.SetBuffers(m_rhs.dataPtr(), m_rhs_fourier.dataPtr(), m_fft_work_area.dataPtr()); + m_backward_fft.SetBuffers(m_rhs_fourier.dataPtr(), m_sol.dataPtr(), m_fft_work_area.dataPtr()); } if (m_laser_from_file) { @@ -922,36 +858,7 @@ MultiLaser::AdvanceSliceFFT (const Fields& fields, const amrex::Real dt, int ste }); // Transform rhs to Fourier space -#ifdef AMREX_USE_CUDA - cudaStream_t stream = amrex::Gpu::Device::cudaStream(); - cufftSetStream(m_plan_fwd, stream); - cufftResult result; - result = LaserFFT::VendorExecute( - m_plan_fwd, - reinterpret_cast( m_rhs.dataPtr() ), - reinterpret_cast( m_rhs_fourier.dataPtr() ), - CUFFT_FORWARD); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } -#elif defined(AMREX_USE_HIP) - rocfft_execution_info execinfo_fwd = nullptr; - rocfft_status result = rocfft_execution_info_create(&execinfo_fwd); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - result = rocfft_execution_info_set_stream(execinfo_fwd, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - void* in_buffer_fwd[2] = {(void*)m_rhs.dataPtr(), nullptr}; - void* out_bufferfwd[2] = {(void*)m_rhs_fourier.dataPtr(), nullptr}; - - result = rocfft_execute(m_plan_fwd, in_buffer_fwd, out_bufferfwd, execinfo_fwd); - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - result = rocfft_execution_info_destroy(execinfo_fwd); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); -#else - LaserFFT::VendorExecute( m_plan_fwd ); -#endif + m_forward_fft.Execute(); // Multiply by appropriate factors in Fourier space amrex::Real dkx = 2.*MathConst::pi/m_laser_geom_3D.ProbLength(0); @@ -973,35 +880,7 @@ MultiLaser::AdvanceSliceFFT (const Fields& fields, const amrex::Real dt, int ste }); // Transform rhs to Fourier space to get solution in sol -#ifdef AMREX_USE_CUDA - cudaStream_t stream_bkw = amrex::Gpu::Device::cudaStream(); - cufftSetStream ( m_plan_bkw, stream_bkw); - result = LaserFFT::VendorExecute( - m_plan_bkw, - reinterpret_cast( m_rhs_fourier.dataPtr() ), - reinterpret_cast( m_sol.dataPtr() ), - CUFFT_INVERSE); - if ( result != CUFFT_SUCCESS ) { - amrex::Print() << " forward transform using cufftExec failed ! Error: " << - CuFFTUtils::cufftErrorToString(result) << "\n"; - } -#elif defined(AMREX_USE_HIP) - rocfft_execution_info execinfo_bkw = nullptr; - result = rocfft_execution_info_create(&execinfo_bkw); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result); - result = rocfft_execution_info_set_stream(execinfo_bkw, amrex::Gpu::gpuStream()); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result); - - void* in_buffer_bkw[2] = {(void*)m_rhs_fourier.dataPtr(), nullptr}; - void* out_buffer_bkw[2] = {(void*)m_sol.dataPtr(), nullptr}; - - result = rocfft_execute(m_plan_bkw, in_buffer_bkw, out_buffer_bkw, execinfo_bkw); - RocFFTUtils::assert_rocfft_status("rocfft_execute", result); - result = rocfft_execution_info_destroy(execinfo_bkw); - RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result); -#else - LaserFFT::VendorExecute( m_plan_bkw ); -#endif + m_backward_fft.Execute(); // Normalize and store solution in np1j00[0]. Guard cells are filled with 0s. amrex::Box grown_bx = bx;