diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index 79dd078913..167da805b0 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -252,15 +252,31 @@ The default is to use the explicit solver. **We strongly recommend to use the ex
     Which solver to use.
     Possible values: ``explicit`` and ``predictor-corrector``.
 
-* ``fields.poisson_solver`` (`string`) optional (default `FFTDirichlet`)
+* ``fields.poisson_solver`` (`string`) optional (default CPU: `FFTDirichletDirect`, GPU: `FFTDirichletFast`)
     Which Poisson solver to use for ``Psi``, ``Ez`` and ``Bz``. The ``predictor-corrector`` BxBy
-    solver also uses this poisson solver for ``Bx`` and ``By`` internally. Available solvers are
-    ``FFTDirichlet``, ``FFTPeriodic`` and ``MGDirichlet``.
+    solver also uses this poisson solver for ``Bx`` and ``By`` internally. Available solvers are:
 
-* ``hipace.use_small_dst`` (`bool`) optional (default `0` or `1`)
-    Whether to use a large R2C or a small C2R fft in the dst of the Poisson solver.
-    The small dst is quicker for simulations with :math:`\geq 511` transverse grid points.
-    The default is set accordingly.
+      * ``FFTDirichletDirect`` Use the discrete sine transformation that is directly implemented
+        by FFTW to solve the Poisson equation with Dirichlet boundary conditions.
+        This option is only available when compiling for CPUs with FFTW.
+        Preferred resolution: :math:`2^N-1`.
+
+      * ``FFTDirichletExpanded`` Perform the discrete sine transformation by symmetrically
+        expanding the field to twice its size.
+        Preferred resolution: :math:`2^N-1`.
+
+      * ``FFTDirichletFast`` Perform the discrete sine transformation using a fast sine transform
+        algorithm that uses FFTs of the same size as the fields.
+        Preferred resolution: :math:`2^N-1`.
+
+      * ``MGDirichlet`` Use the HiPACE++ multigrid solver to solve the Poisson equation with
+        Dirichlet boundary conditions.
+        Preferred resolution: :math:`2^N` and :math:`2^N-1`.
+
+      * ``FFTPeriodic`` Use FFTs to solve the Poisson equation with Periodic boundary conditions.
+        Note that this does not work with features that change the boundary values,
+        like mesh refinement or open boundaries.
+        Preferred resolution: :math:`2^N`.
 
 * ``fields.extended_solve`` (`bool`) optional (default `0`)
     Extends the area of the FFT Poisson solver to the ghost cells. This can reduce artifacts
diff --git a/src/Hipace.H b/src/Hipace.H
index 9a67556413..03e60a69ef 100644
--- a/src/Hipace.H
+++ b/src/Hipace.H
@@ -44,6 +44,9 @@ struct Hipace_early_init
      * and Parser Constants */
     Hipace_early_init (Hipace* instance);
 
+    /** Destructor for FFT cleanup */
+    ~Hipace_early_init ();
+
     /** Struct containing physical constants (which values depends on the unit system, determined
      * at runtime): SI or normalized units. */
     PhysConst m_phys_const;
diff --git a/src/Hipace.cpp b/src/Hipace.cpp
index 4a3a8c93bb..5a2ce8e164 100644
--- a/src/Hipace.cpp
+++ b/src/Hipace.cpp
@@ -17,6 +17,7 @@
 #include "utils/GPUUtil.H"
 #include "particles/pusher/GetAndSetPosition.H"
 #include "mg_solver/HpMultiGrid.H"
+#include "fields/fft_poisson_solver/fft/AnyFFT.H"
 
 #include <AMReX_ParmParse.H>
 #include <AMReX_IntVect.H>
@@ -52,6 +53,12 @@ Hipace_early_init::Hipace_early_init (Hipace* instance)
     int max_level = 0;
     queryWithParser(pp_amr, "max_level", max_level);
     m_N_level = max_level + 1;
+    AnyFFT::setup();
+}
+
+Hipace_early_init::~Hipace_early_init ()
+{
+    AnyFFT::cleanup();
 }
 
 Hipace&
diff --git a/src/fields/Fields.H b/src/fields/Fields.H
index 45132492a7..96e6b485a4 100644
--- a/src/fields/Fields.H
+++ b/src/fields/Fields.H
@@ -507,7 +507,7 @@ private:
     /** Vector over levels of all required fields to compute current slice */
     amrex::Vector<amrex::MultiFab> m_slices;
     /** Type of poisson solver to use */
-    std::string m_poisson_solver_str = "FFTDirichlet";
+    std::string m_poisson_solver_str = "";
     /** Class to handle transverse FFT Poisson solver on 1 slice */
     amrex::Vector<std::unique_ptr<FFTPoissonSolver>> m_poisson_solver;
     /** Stores temporary values for z interpolation in Fields::Copy */
diff --git a/src/fields/Fields.cpp b/src/fields/Fields.cpp
index 67495c6df1..efcf9d8db3 100644
--- a/src/fields/Fields.cpp
+++ b/src/fields/Fields.cpp
@@ -8,7 +8,9 @@
  */
 #include "Fields.H"
 #include "fft_poisson_solver/FFTPoissonSolverPeriodic.H"
-#include "fft_poisson_solver/FFTPoissonSolverDirichlet.H"
+#include "fft_poisson_solver/FFTPoissonSolverDirichletDirect.H"
+#include "fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H"
+#include "fft_poisson_solver/FFTPoissonSolverDirichletFast.H"
 #include "fft_poisson_solver/MGPoissonSolverDirichlet.H"
 #include "Hipace.H"
 #include "OpenBoundary.H"
@@ -29,6 +31,12 @@ Fields::Fields (const int nlev)
 {
     amrex::ParmParse ppf("fields");
     DeprecatedInput("fields", "do_dirichlet_poisson", "poisson_solver", "");
+    // set default Poisson solver based on the platform
+#ifdef AMREX_USE_GPU
+    m_poisson_solver_str = "FFTDirichletFast";
+#else
+    m_poisson_solver_str = "FFTDirichletDirect";
+#endif
     queryWithParser(ppf, "poisson_solver", m_poisson_solver_str);
     queryWithParser(ppf, "extended_solve", m_extended_solve);
     queryWithParser(ppf, "open_boundary", m_open_boundary);
@@ -178,11 +186,21 @@ Fields::AllocData (
     // The Poisson solver operates on transverse slices only.
     // The constructor takes the BoxArray and the DistributionMap of a slice,
     // so the FFTPlans are built on a slice.
-    if (m_poisson_solver_str == "FFTDirichlet"){
-        m_poisson_solver.push_back(std::unique_ptr<FFTPoissonSolverDirichlet>(
-            new FFTPoissonSolverDirichlet(getSlices(lev).boxArray(),
-                                          getSlices(lev).DistributionMap(),
-                                          geom)) );
+    if (m_poisson_solver_str == "FFTDirichletDirect"){
+        m_poisson_solver.push_back(std::unique_ptr<FFTPoissonSolverDirichletDirect>(
+            new FFTPoissonSolverDirichletDirect(getSlices(lev).boxArray(),
+                                                getSlices(lev).DistributionMap(),
+                                                geom)) );
+    } else if (m_poisson_solver_str == "FFTDirichletExpanded"){
+        m_poisson_solver.push_back(std::unique_ptr<FFTPoissonSolverDirichletExpanded>(
+            new FFTPoissonSolverDirichletExpanded(getSlices(lev).boxArray(),
+                                                  getSlices(lev).DistributionMap(),
+                                                  geom)) );
+    } else if (m_poisson_solver_str == "FFTDirichletFast"){
+        m_poisson_solver.push_back(std::unique_ptr<FFTPoissonSolverDirichletFast>(
+            new FFTPoissonSolverDirichletFast(getSlices(lev).boxArray(),
+                                              getSlices(lev).DistributionMap(),
+                                              geom)) );
     } else if (m_poisson_solver_str == "FFTPeriodic") {
         m_poisson_solver.push_back(std::unique_ptr<FFTPoissonSolverPeriodic>(
             new FFTPoissonSolverPeriodic(getSlices(lev).boxArray(),
@@ -195,7 +213,8 @@ Fields::AllocData (
                                          geom))  );
     } else {
         amrex::Abort("Unknown poisson solver '" + m_poisson_solver_str +
-            "', must be 'FFTDirichlet', 'FFTPeriodic' or 'MGDirichlet'");
+            "', must be 'FFTDirichletDirect', 'FFTDirichletExpanded', 'FFTDirichletFast', " +
+            "'FFTPeriodic' or 'MGDirichlet'");
     }
 
     if (lev == 0 && m_insitu_period > 0) {
diff --git a/src/fields/fft_poisson_solver/CMakeLists.txt b/src/fields/fft_poisson_solver/CMakeLists.txt
index 847506f194..46e480ec2f 100644
--- a/src/fields/fft_poisson_solver/CMakeLists.txt
+++ b/src/fields/fft_poisson_solver/CMakeLists.txt
@@ -9,7 +9,9 @@ target_sources(HiPACE
   PRIVATE
     FFTPoissonSolver.cpp
     FFTPoissonSolverPeriodic.cpp
-    FFTPoissonSolverDirichlet.cpp
+    FFTPoissonSolverDirichletDirect.cpp
+    FFTPoissonSolverDirichletExpanded.cpp
+    FFTPoissonSolverDirichletFast.cpp
     MGPoissonSolverDirichlet.cpp
 )
 
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H
similarity index 75%
rename from src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H
rename to src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H
index bc4dc76d1c..78f3d5dc38 100644
--- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.H
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.H
@@ -5,10 +5,10 @@
  * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs
  * License: BSD-3-Clause-LBNL
  */
-#ifndef FFT_POISSON_SOLVER_DIRICHLET_H_
-#define FFT_POISSON_SOLVER_DIRICHLET_H_
+#ifndef FFT_POISSON_SOLVER_DIRICHLET_DIRECT_H_
+#define FFT_POISSON_SOLVER_DIRICHLET_DIRECT_H_
 
-#include "fields/fft_poisson_solver/fft/AnyDST.H"
+#include "fields/fft_poisson_solver/fft/AnyFFT.H"
 #include "FFTPoissonSolver.H"
 
 #include <AMReX_MultiFab.H>
@@ -23,16 +23,16 @@
  * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS
  *    in the staging area and return the LHS in mf.
  */
-class FFTPoissonSolverDirichlet final : public FFTPoissonSolver
+class FFTPoissonSolverDirichletDirect final : public FFTPoissonSolver
 {
 public:
     /** Constructor */
-    FFTPoissonSolverDirichlet ( amrex::BoxArray const& a_realspace_ba,
-                                amrex::DistributionMapping const& dm,
-                                amrex::Geometry const& gm);
+    FFTPoissonSolverDirichletDirect ( amrex::BoxArray const& a_realspace_ba,
+                                      amrex::DistributionMapping const& dm,
+                                      amrex::Geometry const& gm);
 
     /** virtual destructor */
-    virtual ~FFTPoissonSolverDirichlet () override final {}
+    virtual ~FFTPoissonSolverDirichletDirect () override final {}
 
     /**
      * \brief Define real space and spectral space boxes and multifabs, Dirichlet
@@ -63,8 +63,12 @@ private:
     amrex::MultiFab m_tmpSpectralField;
     /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */
     amrex::MultiFab m_eigenvalue_matrix;
-    /** DST plans */
-    AnyDST::DSTplans m_plan;
+    /** forward DST plan */
+    AnyFFT m_forward_fft;
+    /** backward DST plan */
+    AnyFFT m_backward_fft;
+    /** work area for both DST plans */
+    amrex::Gpu::DeviceVector<char> m_fft_work_area;
 };
 
 #endif
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp
similarity index 67%
rename from src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp
rename to src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp
index 3fb2bc93aa..70da5ac1d8 100644
--- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichlet.cpp
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletDirect.cpp
@@ -6,14 +6,14 @@
  *
  * License: BSD-3-Clause-LBNL
  */
-#include "FFTPoissonSolverDirichlet.H"
-#include "fft/AnyDST.H"
+#include "FFTPoissonSolverDirichletDirect.H"
+#include "fft/AnyFFT.H"
 #include "fields/Fields.H"
 #include "utils/Constants.H"
 #include "utils/GPUUtil.H"
 #include "utils/HipaceProfilerWrapper.H"
 
-FFTPoissonSolverDirichlet::FFTPoissonSolverDirichlet (
+FFTPoissonSolverDirichletDirect::FFTPoissonSolverDirichletDirect (
     amrex::BoxArray const& realspace_ba,
     amrex::DistributionMapping const& dm,
     amrex::Geometry const& gm )
@@ -22,10 +22,11 @@ FFTPoissonSolverDirichlet::FFTPoissonSolverDirichlet (
 }
 
 void
-FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba,
-                                   amrex::DistributionMapping const& dm,
-                                   amrex::Geometry const& gm )
+FFTPoissonSolverDirichletDirect::define (amrex::BoxArray const& a_realspace_ba,
+                                         amrex::DistributionMapping const& dm,
+                                         amrex::Geometry const& gm )
 {
+    HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::define()");
     using namespace amrex::literals;
 
     // If we are going to support parallel FFT, the constructor needs to take a communicator.
@@ -48,16 +49,18 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba,
                                      "There should be only one box locally.");
 
     const amrex::Box fft_box = m_stagingArea[0].box();
+    const amrex::IntVect fft_size = fft_box.length();
+    const int nx = fft_size[0];
+    const int ny = fft_size[1];
     const auto dx = gm.CellSizeArray();
     const amrex::Real dxsquared = dx[0]*dx[0];
     const amrex::Real dysquared = dx[1]*dx[1];
-    const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( fft_box.length(0) + 1 ));
-    const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( fft_box.length(1) + 1 ));
+    const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 ));
+    const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 ));
 
     // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00)
     // This normalization is used regardless of the sine transform library
-    const amrex::Real norm_fac = 0.5 / ( 2 * (( fft_box.length(0) + 1 )
-                                             *( fft_box.length(1) + 1 )));
+    const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 )));
 
     // Calculate the array of m_eigenvalue_matrix
     for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){
@@ -67,9 +70,9 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba,
             fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept
                 {
                     /* fast poisson solver diagonal x coeffs */
-                    amrex::Real sinex_sq = sin(( i - lo[0] + 1 ) * sine_x_factor) * sin(( i - lo[0] + 1 ) * sine_x_factor);
+                    amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor);
                     /* fast poisson solver diagonal y coeffs */
-                    amrex::Real siney_sq = sin(( j - lo[1] + 1 ) * sine_y_factor) * sin(( j - lo[1] + 1 ) * sine_y_factor);
+                    amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor);
 
                     if ((sinex_sq!=0) && (siney_sq!=0)) {
                         eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared ));
@@ -81,29 +84,25 @@ FFTPoissonSolverDirichlet::define (amrex::BoxArray const& a_realspace_ba,
     }
 
     // Allocate and initialize the FFT plans
-    m_plan = AnyDST::DSTplans(a_realspace_ba, dm);
-    // Loop over boxes and allocate the corresponding plan
-    // for each box owned by the local MPI proc
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){
-        // Note: the size of the real-space box and spectral-space box
-        // differ when using real-to-complex FFT. When initializing
-        // the FFT plan, the valid dimensions are those of the real-space box.
-        amrex::IntVect fft_size = fft_box.length();
-        m_plan[mfi] = AnyDST::CreatePlan(
-            fft_size, &m_stagingArea[mfi], &m_tmpSpectralField[mfi]);
-    }
+    std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]);
+    std::size_t bkw_area = m_backward_fft.Initialize(FFTType::R2R_2D, fft_size[0], fft_size[1]);
+
+    // Allocate work area for both FFTs
+    m_fft_work_area.resize(std::max(fwd_area, bkw_area));
+
+    m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(),
+                             m_fft_work_area.dataPtr());
+    m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(),
+                              m_fft_work_area.dataPtr());
 }
 
 
 void
-FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
+FFTPoissonSolverDirichletDirect::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
 {
-    HIPACE_PROFILE("FFTPoissonSolverDirichlet::SolvePoissonEquation()");
+    HIPACE_PROFILE("FFTPoissonSolverDirichletDirect::SolvePoissonEquation()");
 
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){
-        // Perform Fourier transform from the staging area to `tmpSpectralField`
-        AnyDST::Execute(m_plan[mfi], AnyDST::direction::forward);
-    }
+    m_forward_fft.Execute();
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
@@ -120,10 +119,7 @@ FFTPoissonSolverDirichlet::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
             });
     }
 
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){
-        // Perform Fourier transform from `tmpSpectralField` to the staging area
-        AnyDST::Execute(m_plan[mfi], AnyDST::direction::backward);
-    }
+    m_backward_fft.Execute();
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H
new file mode 100644
index 0000000000..d54393dd72
--- /dev/null
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.H
@@ -0,0 +1,76 @@
+/* Copyright 2020-2021
+ *
+ * This file is part of HiPACE++.
+ *
+ * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs
+ * License: BSD-3-Clause-LBNL
+ */
+#ifndef FFT_POISSON_SOLVER_DIRICHLET_EXPANDED_H_
+#define FFT_POISSON_SOLVER_DIRICHLET_EXPANDED_H_
+
+#include "fields/fft_poisson_solver/fft/AnyFFT.H"
+#include "FFTPoissonSolver.H"
+
+#include <AMReX_MultiFab.H>
+#include <AMReX_GpuComplex.H>
+
+/**
+ * \brief This class handles functions and data to perform transverse Fourier-based Poisson solves.
+ *
+ * For a given source S, it solves equation Laplacian(F) = S and returns F.
+ * Once an instance is created, a typical use consists in:
+ * 1. Compute S directly in FFTPoissonSolver::m_stagingArea
+ * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS
+ *    in the staging area and return the LHS in mf.
+ */
+class FFTPoissonSolverDirichletExpanded final : public FFTPoissonSolver
+{
+public:
+    /** Constructor */
+    FFTPoissonSolverDirichletExpanded ( amrex::BoxArray const& a_realspace_ba,
+                                        amrex::DistributionMapping const& dm,
+                                        amrex::Geometry const& gm);
+
+    /** virtual destructor */
+    virtual ~FFTPoissonSolverDirichletExpanded () override final {}
+
+    /**
+     * \brief Define real space and spectral space boxes and multifabs, Dirichlet
+     * eigenvalue matrix m_eigenvalue_matrix to solve Poisson equation and FFT plans.
+     * Currently only works with a single box, i.e., serial FFT.
+     *
+     * \param[in] realspace_ba BoxArray on which the FFT is executed.
+     * \param[in] dm DistributionMapping for the BoxArray.
+     * \param[in] gm Geometry, contains the box dimensions.
+     */
+    void define ( amrex::BoxArray const& realspace_ba,
+                  amrex::DistributionMapping const& dm,
+                  amrex::Geometry const& gm);
+
+    /**
+     * Solve Poisson equation. The source term must be stored in the staging area m_stagingArea prior to this call.
+     *
+     * \param[in] lhs_mf Destination array, where the result is stored.
+     */
+    virtual void SolvePoissonEquation (amrex::MultiFab& lhs_mf) override final;
+
+    /** Position and relative factor used to apply inhomogeneous Dirichlet boundary conditions */
+    virtual amrex::Real BoundaryOffset() override final { return 1.; }
+    virtual amrex::Real BoundaryFactor() override final { return 1.; }
+
+private:
+    /** Spectral fields, contains (real) field in Fourier space */
+    amrex::MultiFab m_tmpSpectralField;
+    /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */
+    amrex::MultiFab m_eigenvalue_matrix;
+    /** Twice expanded field that gets filled symmetrically */
+    amrex::FArrayBox m_expanded_position_array;
+    /** Twice expanded Complex field */
+    amrex::BaseFab<amrex::GpuComplex<amrex::Real>> m_expanded_fourier_array;
+    /** FFT plan */
+    AnyFFT m_fft;
+    /** work area for FFT plan */
+    amrex::Gpu::DeviceVector<char> m_fft_work_area;
+};
+
+#endif
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp
new file mode 100644
index 0000000000..5b1c3ad258
--- /dev/null
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletExpanded.cpp
@@ -0,0 +1,199 @@
+/* Copyright 2020-2022
+ *
+ * This file is part of HiPACE++.
+ *
+ * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+#include "FFTPoissonSolverDirichletExpanded.H"
+#include "fft/AnyFFT.H"
+#include "fields/Fields.H"
+#include "utils/Constants.H"
+#include "utils/GPUUtil.H"
+#include "utils/HipaceProfilerWrapper.H"
+
+FFTPoissonSolverDirichletExpanded::FFTPoissonSolverDirichletExpanded (
+    amrex::BoxArray const& realspace_ba,
+    amrex::DistributionMapping const& dm,
+    amrex::Geometry const& gm )
+{
+    define(realspace_ba, dm, gm);
+}
+
+void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src)
+{
+    const amrex::Box bx = src.box();
+    const int nx = bx.length(0);
+    const int ny = bx.length(1);
+    const amrex::IntVect lo = bx.smallEnd();
+    Array2<amrex::Real const> const src_array = src.const_array();
+    Array2<amrex::Real> const dst_array = dst.array();
+
+    amrex::ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int)
+        {
+            /* upper left quadrant */
+            dst_array(i+1,j+1) = src_array(i, j);
+            /* lower left quadrant */
+            dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]);
+            /* upper right quadrant */
+            dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j);
+            /* lower right quadrant */
+            dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]);
+        });
+}
+
+void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab<amrex::GpuComplex<amrex::Real>>& src)
+{
+    const amrex::Box bx = dst.box();
+    Array2<amrex::GpuComplex<amrex::Real> const> const src_array = src.const_array();
+    Array2<amrex::Real> const dst_array = dst.array();
+    amrex::ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int)
+        {
+            /* upper left quadrant */
+            dst_array(i,j) = -src_array(i+1, j+1).real();
+        });
+}
+
+void
+FFTPoissonSolverDirichletExpanded::define (amrex::BoxArray const& a_realspace_ba,
+                                           amrex::DistributionMapping const& dm,
+                                           amrex::Geometry const& gm )
+{
+    HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::define()");
+    using namespace amrex::literals;
+
+    // If we are going to support parallel FFT, the constructor needs to take a communicator.
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_realspace_ba.size() == 1, "Parallel FFT not supported yet");
+
+    // Allocate temporary arrays - in real space and spectral space
+    // These arrays will store the data just before/after the FFT
+    // The stagingArea is also created from 0 to nx, because the real space array may have
+    // an offset for levels > 0
+    m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
+    m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
+    m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
+    m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required
+    m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards);
+
+    // This must be true even for parallel FFT.
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1,
+                                     "There should be only one box locally.");
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1,
+                                     "There should be only one box locally.");
+
+    const amrex::Box fft_box = m_stagingArea[0].box();
+    const amrex::IntVect fft_size = fft_box.length();
+    const int nx = fft_size[0];
+    const int ny = fft_size[1];
+    const auto dx = gm.CellSizeArray();
+    const amrex::Real dxsquared = dx[0]*dx[0];
+    const amrex::Real dysquared = dx[1]*dx[1];
+    const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 ));
+    const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 ));
+
+    // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00)
+    // This normalization is used regardless of the sine transform library
+    const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 )));
+
+    // Calculate the array of m_eigenvalue_matrix
+    for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){
+        Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array(mfi);
+        amrex::IntVect lo = fft_box.smallEnd();
+        amrex::ParallelFor(
+            fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept
+                {
+                    /* fast poisson solver diagonal x coeffs */
+                    amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor);
+                    /* fast poisson solver diagonal y coeffs */
+                    amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor);
+
+                    if ((sinex_sq!=0) && (siney_sq!=0)) {
+                        eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared ));
+                    } else {
+                        // Avoid division by 0
+                        eigenvalue_matrix(i,j) = 0._rt;
+                    }
+                });
+    }
+
+    // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2)
+    // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2)
+    amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}};
+    amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}};
+    // shift box to match rest of fields
+    expanded_position_box += fft_box.smallEnd();
+    expanded_fourier_box += fft_box.smallEnd();
+
+    m_expanded_position_array.resize(expanded_position_box);
+    m_expanded_fourier_array.resize(expanded_fourier_box);
+
+    m_expanded_position_array.setVal<amrex::RunOn::Device>(0._rt);
+
+    // Allocate and initialize the FFT plan
+    std::size_t wrok_size = m_fft.Initialize(FFTType::R2C_2D, expanded_position_box.length(0),
+                                             expanded_position_box.length(1));
+
+    // Allocate work area for the FFT
+    m_fft_work_area.resize(wrok_size);
+
+    m_fft.SetBuffers(m_expanded_position_array.dataPtr(), m_expanded_fourier_array.dataPtr(),
+                     m_fft_work_area.dataPtr());
+}
+
+
+void
+FFTPoissonSolverDirichletExpanded::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
+{
+    HIPACE_PROFILE("FFTPoissonSolverDirichletExpanded::SolvePoissonEquation()");
+    using namespace amrex::literals;
+
+    m_expanded_position_array.setVal<amrex::RunOn::Device>(0._rt);
+
+    ExpandR2R(m_expanded_position_array, m_stagingArea[0]);
+
+    m_fft.Execute();
+
+    ShrinkC2R(m_tmpSpectralField[0], m_expanded_fourier_array);
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
+#endif
+    for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){
+        // Solve Poisson equation in Fourier space:
+        // Multiply `tmpSpectralField` by eigenvalue_matrix
+        Array2<amrex::Real> tmp_cmplx_arr = m_tmpSpectralField.array(mfi);
+        Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array(mfi);
+
+        amrex::ParallelFor( mfi.growntilebox(),
+            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept {
+                tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j);
+            });
+    }
+
+    m_expanded_position_array.setVal<amrex::RunOn::Device>(0._rt);
+
+    ExpandR2R(m_expanded_position_array, m_tmpSpectralField[0]);
+
+    m_fft.Execute();
+
+    ShrinkC2R(m_stagingArea[0], m_expanded_fourier_array);
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
+#endif
+    for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){
+        // Copy from the staging area to output array (and normalize)
+        Array2<amrex::Real> tmp_real_arr = m_stagingArea.array(mfi);
+        Array2<amrex::Real> lhs_arr = lhs_mf.array(mfi);
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1,
+                                         "Slice MFs must be defined on one box only");
+        amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(),
+            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept {
+                // Copy field
+                lhs_arr(i,j) = tmp_real_arr(i,j);
+            });
+    }
+}
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H
new file mode 100644
index 0000000000..c981181c7f
--- /dev/null
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H
@@ -0,0 +1,78 @@
+/* Copyright 2020-2021
+ *
+ * This file is part of HiPACE++.
+ *
+ * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs
+ * License: BSD-3-Clause-LBNL
+ */
+#ifndef FFT_POISSON_SOLVER_DIRICHLET_FAST_H_
+#define FFT_POISSON_SOLVER_DIRICHLET_FAST_H_
+
+#include "fields/fft_poisson_solver/fft/AnyFFT.H"
+#include "FFTPoissonSolver.H"
+
+#include <AMReX_MultiFab.H>
+#include <AMReX_GpuComplex.H>
+
+/**
+ * \brief This class handles functions and data to perform transverse Fourier-based Poisson solves.
+ *
+ * For a given source S, it solves equation Laplacian(F) = S and returns F.
+ * Once an instance is created, a typical use consists in:
+ * 1. Compute S directly in FFTPoissonSolver::m_stagingArea
+ * 2. Call FFTPoissonSolver::SolvePoissonEquation(mf), which will solve Poisson equation with RHS
+ *    in the staging area and return the LHS in mf.
+ */
+class FFTPoissonSolverDirichletFast final : public FFTPoissonSolver
+{
+public:
+    /** Constructor */
+    FFTPoissonSolverDirichletFast ( amrex::BoxArray const& a_realspace_ba,
+                                    amrex::DistributionMapping const& dm,
+                                    amrex::Geometry const& gm);
+
+    /** virtual destructor */
+    virtual ~FFTPoissonSolverDirichletFast () override final {}
+
+    /**
+     * \brief Define real space and spectral space boxes and multifabs, Dirichlet
+     * eigenvalue matrix m_eigenvalue_matrix to solve Poisson equation and FFT plans.
+     * Currently only works with a single box, i.e., serial FFT.
+     *
+     * \param[in] realspace_ba BoxArray on which the FFT is executed.
+     * \param[in] dm DistributionMapping for the BoxArray.
+     * \param[in] gm Geometry, contains the box dimensions.
+     */
+    void define ( amrex::BoxArray const& realspace_ba,
+                  amrex::DistributionMapping const& dm,
+                  amrex::Geometry const& gm);
+
+    /**
+     * Solve Poisson equation. The source term must be stored in the staging area m_stagingArea prior to this call.
+     *
+     * \param[in] lhs_mf Destination array, where the result is stored.
+     */
+    virtual void SolvePoissonEquation (amrex::MultiFab& lhs_mf) override final;
+
+    /** Position and relative factor used to apply inhomogeneous Dirichlet boundary conditions */
+    virtual amrex::Real BoundaryOffset() override final { return 1.; }
+    virtual amrex::Real BoundaryFactor() override final { return 1.; }
+
+private:
+    /** Spectral fields, contains (real) field in Fourier space */
+    amrex::MultiFab m_tmpSpectralField;
+    /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */
+    amrex::MultiFab m_eigenvalue_matrix;
+    /** Real array for the FFTs */
+    amrex::Gpu::DeviceVector<amrex::Real> m_position_array;
+    /** Complex array for the FFTs */
+    amrex::Gpu::DeviceVector<amrex::GpuComplex<amrex::Real>> m_fourier_array;
+    /** FFT plan in x direction */
+    AnyFFT m_x_fft;
+    /** FFT plan in y direction */
+    AnyFFT m_y_fft;
+    /** work area for both DST plans */
+    amrex::Gpu::DeviceVector<char> m_fft_work_area;
+};
+
+#endif
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp
new file mode 100644
index 0000000000..50f1547bca
--- /dev/null
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp
@@ -0,0 +1,315 @@
+/* Copyright 2020-2022
+ *
+ * This file is part of HiPACE++.
+ *
+ * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+#include "FFTPoissonSolverDirichletFast.H"
+#include "fft/AnyFFT.H"
+#include "fields/Fields.H"
+#include "utils/Constants.H"
+#include "utils/GPUUtil.H"
+#include "utils/HipaceProfilerWrapper.H"
+
+FFTPoissonSolverDirichletFast::FFTPoissonSolverDirichletFast (
+    amrex::BoxArray const& realspace_ba,
+    amrex::DistributionMapping const& dm,
+    amrex::Geometry const& gm )
+{
+    define(realspace_ba, dm, gm);
+}
+
+/** \brief Make Complex array out of Real array to prepare for fft.
+ * out[idx] = -in[2*idx-2] + in[2*idx] + i*in[2*idx-1] for each column with
+ * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1]
+ *
+ * \param[in] in input real array
+ * \param[out] out output complex array
+ * \param[in] n_data number of (contiguous) rows in position matrix
+ * \param[in] n_batch number of (strided) columns in position matrix
+ */
+void ToComplex (const amrex::Real* const AMREX_RESTRICT in,
+                amrex::GpuComplex<amrex::Real>* const AMREX_RESTRICT out,
+                const int n_data, const int n_batch)
+{
+    const int n_half = (n_data+1)/2;
+    if((n_data%2 == 1)) {
+        amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
+            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
+            {
+                const int stride_in = n_data*j;
+                const int i_is_zero = (i==0);
+                const int i_is_n_half = (i==n_half);
+                const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
+                                            +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half);
+                const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in]
+                                            *!i_is_zero*!i_is_n_half;
+                out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
+            });
+    } else {
+        amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
+            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
+            {
+                const int stride_in = n_data*j;
+                const int i_is_zero = (i==0);
+                const int i_is_n_half = (i==n_half);
+                const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
+                                            +in[2*i-i_is_n_half+stride_in]*!i_is_n_half;
+                const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero;
+                out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
+            });
+    }
+}
+
+/** \brief Make Sine-space Real array out of array from fft.
+ * out[idx] = 0.5 *(in[n_data-idx] - in[idx+1] + (in[n_data-idx] + in[idx+1])/
+ * (2*sin((idx+1)*pi/(n_data+1)))) for each column
+ *
+ * \param[in] in input real array
+ * \param[out] out output real array
+ * \param[in] n_data number of (contiguous) rows in position matrix
+ * \param[in] n_batch number of (strided) columns in position matrix
+ */
+void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out,
+             const int n_data, const int n_batch)
+{
+    using namespace amrex::literals;
+
+    const amrex::Real n_1_real_inv = 1._rt / (n_data+1._rt);
+    const int n_1 = n_data+1;
+    amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}},
+        [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
+        {
+            const int i_rev = n_1-i;
+            const int stride_in = n_1*j;
+            const int stride_out = n_data*j;
+            const amrex::Real in_a = in[i + stride_in];
+            const amrex::Real in_b = in[i_rev + stride_in];
+            out[i - 1 + stride_out] =
+                0.5_rt*(in_b - in_a + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i * n_1_real_inv)));
+            out[i_rev - 1 + stride_out] =
+                0.5_rt*(in_a - in_b + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i_rev * n_1_real_inv)));
+        });
+}
+
+/** \brief Transpose input matrix
+ * out[idy][idx] = in[idx][idy]
+ *
+ * \param[in] in input real array
+ * \param[out] out output real array
+ * \param[in] n_data number of (contiguous) rows in input matrix
+ * \param[in] n_batch number of (strided) columns in input matrix
+ */
+void Transpose (const amrex::Real* const AMREX_RESTRICT in,
+                amrex::Real* const AMREX_RESTRICT out,
+                const int n_data, const int n_batch)
+{
+#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+    constexpr int tile_dim = 32; //must be power of 2
+    constexpr int block_rows = 8;
+    const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim;
+    const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim;
+    amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows,
+        tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE() noexcept
+        {
+            amrex::Gpu::SharedMemory<amrex::Real> gsm;
+            amrex::Real* const tile = gsm.dataPtr();
+
+            const int thread_x = threadIdx.x&(tile_dim-1);
+            const int thread_y = threadIdx.x/tile_dim;
+            const int block_y = blockIdx.x/num_blocks_x;
+            const int block_x = blockIdx.x - block_y*num_blocks_x;
+            int mat_x = block_x * tile_dim + thread_x;
+            int mat_y = block_y * tile_dim + thread_y;
+
+            for (int i = 0; i < tile_dim; i += block_rows) {
+                if(mat_x < n_data && (mat_y+i) < n_batch) {
+                    tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x];
+                }
+            }
+
+            __syncthreads();
+
+            mat_x = block_y * tile_dim + thread_x;
+            mat_y = block_x * tile_dim + thread_y;
+
+            for (int i = 0; i < tile_dim; i += block_rows) {
+                if(mat_x < n_batch && (mat_y+i) < n_data) {
+                    out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i];
+                }
+            }
+        });
+#else
+    amrex::ParallelFor({{0,0,0}, {n_batch-1, n_data-1, 0}},
+        [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept
+        {
+            out[i + n_batch*j] = in[j + n_data*i];
+        });
+#endif
+}
+
+void
+FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba,
+                                       amrex::DistributionMapping const& dm,
+                                       amrex::Geometry const& gm )
+{
+    HIPACE_PROFILE("FFTPoissonSolverDirichletFast::define()");
+    using namespace amrex::literals;
+
+    // If we are going to support parallel FFT, the constructor needs to take a communicator.
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_realspace_ba.size() == 1, "Parallel FFT not supported yet");
+
+    // Allocate temporary arrays - in real space and spectral space
+    // These arrays will store the data just before/after the FFT
+    // The stagingArea is also created from 0 to nx, because the real space array may have
+    // an offset for levels > 0
+    m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
+    m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
+    m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
+    m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required
+    m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards);
+
+    // This must be true even for parallel FFT.
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1,
+                                     "There should be only one box locally.");
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1,
+                                     "There should be only one box locally.");
+
+    const amrex::Box fft_box = m_stagingArea[0].box();
+    const amrex::IntVect fft_size = fft_box.length();
+    const int nx = fft_size[0];
+    const int ny = fft_size[1];
+    const auto dx = gm.CellSizeArray();
+    const amrex::Real dxsquared = dx[0]*dx[0];
+    const amrex::Real dysquared = dx[1]*dx[1];
+    const amrex::Real sine_x_factor = MathConst::pi / ( 2. * ( nx + 1 ));
+    const amrex::Real sine_y_factor = MathConst::pi / ( 2. * ( ny + 1 ));
+
+    // Normalization of FFTW's 'DST-I' discrete sine transform (FFTW_RODFT00)
+    // This normalization is used regardless of the sine transform library
+    const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 )));
+
+    // Calculate the array of m_eigenvalue_matrix
+    for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){
+        Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array(mfi);
+        amrex::IntVect lo = fft_box.smallEnd();
+        amrex::ParallelFor(
+            fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept
+                {
+                    /* fast poisson solver diagonal x coeffs */
+                    amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor);
+                    /* fast poisson solver diagonal y coeffs */
+                    amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor);
+
+                    if ((sinex_sq!=0) && (siney_sq!=0)) {
+                        eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared ));
+                    } else {
+                        // Avoid division by 0
+                        eigenvalue_matrix(i,j) = 0._rt;
+                    }
+                });
+    }
+
+    // Allocate 1d Array for 2d data or 2d transpose data
+    const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx);
+    const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx);
+    m_position_array.resize(real_1d_size);
+    m_fourier_array.resize(complex_1d_size);
+
+    // Allocate and initialize the FFT plans
+    std::size_t fft_x_area = m_x_fft.Initialize(FFTType::C2R_1D_batched, nx+1, ny);
+    std::size_t fft_y_area = m_y_fft.Initialize(FFTType::C2R_1D_batched, ny+1, nx);
+
+    // Allocate work area for both FFTs
+    m_fft_work_area.resize(std::max(fft_x_area, fft_y_area));
+
+    m_x_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(),
+                       m_fft_work_area.dataPtr());
+    m_y_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(),
+                       m_fft_work_area.dataPtr());
+}
+
+
+void
+FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
+{
+    HIPACE_PROFILE("FFTPoissonSolverDirichletFast::SolvePoissonEquation()");
+
+    const int nx = m_stagingArea[0].box().length(0); // initially contiguous
+    const int ny = m_stagingArea[0].box().length(1); // contiguous after transpose
+
+    amrex::Real* const pos_arr = m_stagingArea[0].dataPtr();
+    amrex::Real* const fourier_arr = m_tmpSpectralField[0].dataPtr();
+    amrex::Real* const real_arr = m_position_array.dataPtr();
+    amrex::GpuComplex<amrex::Real>* comp_arr = m_fourier_array.dataPtr();
+
+    // 1D DST in x
+    ToComplex(pos_arr, comp_arr, nx, ny);
+
+    m_x_fft.Execute();
+
+    ToSine(real_arr, pos_arr, nx, ny);
+
+    Transpose(pos_arr, fourier_arr, nx, ny);
+
+    // 1D DST in y
+    ToComplex(fourier_arr, comp_arr, ny, nx);
+
+    m_y_fft.Execute();
+
+    ToSine(real_arr, pos_arr, ny, nx);
+
+    Transpose(pos_arr, fourier_arr, ny, nx);
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
+#endif
+    for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){
+        // Solve Poisson equation in Fourier space:
+        // Multiply `tmpSpectralField` by eigenvalue_matrix
+        Array2<amrex::Real> tmp_cmplx_arr = m_tmpSpectralField.array(mfi);
+        Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array(mfi);
+
+        amrex::ParallelFor( mfi.growntilebox(),
+            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept {
+                tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j);
+            });
+    }
+
+    // 1D DST in x
+    ToComplex(fourier_arr, comp_arr, nx, ny);
+
+    m_x_fft.Execute();
+
+    ToSine(real_arr, fourier_arr, nx, ny);
+
+    Transpose(fourier_arr, pos_arr, nx, ny);
+
+    // 1D DST in y
+    ToComplex(pos_arr, comp_arr, ny, nx);
+
+    m_y_fft.Execute();
+
+    ToSine(real_arr, fourier_arr, ny, nx);
+
+    Transpose(fourier_arr, pos_arr, ny, nx);
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
+#endif
+    for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){
+        // Copy from the staging area to output array (and normalize)
+        Array2<amrex::Real> tmp_real_arr = m_stagingArea.array(mfi);
+        Array2<amrex::Real> lhs_arr = lhs_mf.array(mfi);
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1,
+                                         "Slice MFs must be defined on one box only");
+        amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(),
+            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept {
+                // Copy field
+                lhs_arr(i,j) = tmp_real_arr(i,j);
+            });
+    }
+}
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H
index e570e70d15..e18098764f 100644
--- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.H
@@ -69,8 +69,12 @@ private:
     SpectralField m_tmpSpectralField;
     /** Multifab containing 1/(kx^2 + ky^2), to solve Poisson equation. */
     amrex::MultiFab m_inv_k2;
-    /** FFT plans */
-    AnyFFT::FFTplans m_forward_plan, m_backward_plan;
+    /** forward FFT plan */
+    AnyFFT m_forward_fft;
+    /** backward FFT plan */
+    AnyFFT m_backward_fft;
+    /** work area for both FFT plans */
+    amrex::Gpu::DeviceVector<char> m_fft_work_area;
 };
 
 #endif
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp
index 863a41427a..c83e4af64c 100644
--- a/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverPeriodic.cpp
@@ -26,6 +26,7 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba,
                                    amrex::DistributionMapping const& dm,
                                    amrex::Geometry const& gm )
 {
+    HIPACE_PROFILE("FFTPoissonSolverPeriodic::define()");
     using namespace amrex::literals;
 
     // If we are going to support parallel FFT, the constructor needs to take a communicator.
@@ -93,25 +94,17 @@ FFTPoissonSolverPeriodic::define ( amrex::BoxArray const& realspace_ba,
     }
 
     // Allocate and initialize the FFT plans
-    m_forward_plan = AnyFFT::FFTplans(spectralspace_ba, dm);
-    m_backward_plan = AnyFFT::FFTplans(spectralspace_ba, dm);
-    // Loop over boxes and allocate the corresponding plan
-    // for each box owned by the local MPI proc
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){
-        // Note: the size of the real-space box and spectral-space box
-        // differ when using real-to-complex FFT. When initializing
-        // the FFT plan, the valid dimensions are those of the real-space box.
-        amrex::IntVect fft_size = m_stagingArea[mfi].box().length();
-        m_forward_plan[mfi] = AnyFFT::CreatePlan(
-            fft_size, m_stagingArea[mfi].dataPtr(),
-            reinterpret_cast<AnyFFT::Complex*>( m_tmpSpectralField[mfi].dataPtr()),
-            AnyFFT::direction::R2C);
-
-        m_backward_plan[mfi] = AnyFFT::CreatePlan(
-            fft_size, m_stagingArea[mfi].dataPtr(),
-            reinterpret_cast<AnyFFT::Complex*>( m_tmpSpectralField[mfi].dataPtr()),
-            AnyFFT::direction::C2R);
-    }
+    amrex::IntVect fft_size = m_stagingArea[0].box().length();
+    std::size_t fwd_area = m_forward_fft.Initialize(FFTType::R2C_2D, fft_size[0], fft_size[1]);
+    std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2R_2D, fft_size[0], fft_size[1]);
+
+    // Allocate work area for both FFTs
+    m_fft_work_area.resize(std::max(fwd_area, bkw_area));
+
+    m_forward_fft.SetBuffers(m_stagingArea[0].dataPtr(), m_tmpSpectralField[0].dataPtr(),
+                             m_fft_work_area.dataPtr());
+    m_backward_fft.SetBuffers(m_tmpSpectralField[0].dataPtr(), m_stagingArea[0].dataPtr(),
+                              m_fft_work_area.dataPtr());
 }
 
 
@@ -120,10 +113,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
 {
     HIPACE_PROFILE("FFTPoissonSolverPeriodic::SolvePoissonEquation()");
 
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){
-        // Perform Fourier transform from the staging area to `tmpSpectralField`
-        AnyFFT::Execute(m_forward_plan[mfi]);
-    }
+    m_forward_fft.Execute();
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
@@ -139,10 +129,7 @@ FFTPoissonSolverPeriodic::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
             });
     }
 
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfi); mfi.isValid(); ++mfi ){
-        // Perform Fourier transform from `tmpSpectralField` to the staging area
-        AnyFFT::Execute(m_backward_plan[mfi]);
-    }
+    m_backward_fft.Execute();
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
diff --git a/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp b/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp
index c8aa30d600..9ce7aa34b2 100644
--- a/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp
+++ b/src/fields/fft_poisson_solver/MGPoissonSolverDirichlet.cpp
@@ -17,6 +17,7 @@ MGPoissonSolverDirichlet::MGPoissonSolverDirichlet (
     amrex::DistributionMapping const& dm,
     amrex::Geometry const& gm )
 {
+    HIPACE_PROFILE("MGPoissonSolverDirichlet::define()");
     // need extra ghost cell for 2^n-1 HPMG
     m_stagingArea = amrex::MultiFab(ba, dm, 1, Fields::m_poisson_nguards + amrex::IntVect{1, 1, 0});
     AMREX_ALWAYS_ASSERT_WITH_MESSAGE(ba.size() == 1, "Parallel MG not supported");
diff --git a/src/fields/fft_poisson_solver/fft/AnyDST.H b/src/fields/fft_poisson_solver/fft/AnyDST.H
deleted file mode 100644
index 7997e7c3a6..0000000000
--- a/src/fields/fft_poisson_solver/fft/AnyDST.H
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2020-2022
- *
- * This file is part of HiPACE++.
- *
- * Authors: AlexanderSinn, MaxThevenet
- * License: BSD-3-Clause-LBNL
- */
-#ifndef ANYDST_H_
-#define ANYDST_H_
-
-#include "AnyFFT.H"
-
-#include <AMReX_GpuComplex.H>
-#include <AMReX_FArrayBox.H>
-
-#include <memory>
-
-/**
- * \brief Wrapper around multiple FFT libraries.
- *
- * The header file defines the API and the base types
- * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is
- * done in different cpp files. This wrapper only depends on the underlying FFT library
- * AND on AMReX (There is no dependence on WarpX).
- */
-namespace AnyDST
-{
-
-    /** Direction in which the FFT is performed. */
-    enum struct direction {forward, backward};
-
-    /** \brief This struct contains the vendor FFT plan and additional metadata
-     */
-    struct DSTplan
-    {
-
-        /** pointer to array in position space */
-        amrex::FArrayBox* m_position_array;
-        /** pointer to array in Fourier space */
-        amrex::FArrayBox* m_fourier_array;
-
-        /** Expanded data in position space, only for Cuda */
-        std::unique_ptr<amrex::FArrayBox> m_expanded_position_array;
-        /** Expanded data in Fourier, only for Cuda */
-        std::unique_ptr<amrex::BaseFab<amrex::GpuComplex<amrex::Real>>> m_expanded_fourier_array;
-
-        /** Vendor FFT plan */
-        AnyFFT::VendorFFTPlan m_plan;
-        /** Vendor FFT plan for the transpose transform. Used for use_small_dst and DSTW */
-        AnyFFT::VendorFFTPlan m_plan_b;
-
-        /** Use large R2C or small C2R dst */
-        bool use_small_dst;
-
-#if defined(AMREX_USE_HIP)
-        /** execution info for rocFFT */
-        rocfft_execution_info m_execinfo;
-        /** work buffer for rocFFT */
-        void* m_buffer;
-#endif
-    };
-
-    /** Collection of FFT plans, one FFTplan per box */
-    using DSTplans = amrex::LayoutData<DSTplan>;
-
-    /** \brief create FFT plan for the backend FFT library.
-     * \param[in] real_size Size of the real array, along each dimension.
-     * \param[out] position_array Real array from/to where R2R DST is performed
-     * \param[out] fourier_array Real array to/from where R2R DST is performed
-     */
-    DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array,
-                        amrex::FArrayBox* fourier_array);
-
-    /** \brief Destroy library FFT plan.
-     * \param[out] dst_plan plan to destroy
-     */
-    void DestroyPlan (DSTplan& dst_plan);
-
-    /** \brief Perform FFT with backend library.
-     * \param[out] dst_plan plan for which the FFT is performed
-     * \param[in] d direction of the FFT
-     */
-    void Execute (DSTplan& dst_plan, AnyDST::direction d);
-}
-
-#endif // ANYDST_H_
diff --git a/src/fields/fft_poisson_solver/fft/AnyFFT.H b/src/fields/fft_poisson_solver/fft/AnyFFT.H
index cbbd077954..eb07cc1cb4 100644
--- a/src/fields/fft_poisson_solver/fft/AnyFFT.H
+++ b/src/fields/fft_poisson_solver/fft/AnyFFT.H
@@ -1,8 +1,8 @@
-/* Copyright 2020-2021
+/* Copyright 2020-2024
  *
  * This file is part of HiPACE++.
  *
- * Authors: Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs
+ * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Remi Lehe, Severin Diederichs
  * WeiqunZhang
  * License: BSD-3-Clause-LBNL
  */
@@ -15,106 +15,56 @@
 #ifndef ANYFFT_H_
 #define ANYFFT_H_
 
-#include <AMReX_Config.H>
+#include <cstddef>
 
-#ifdef AMREX_USE_CUDA
-#  include <cufft.h>
-#elif defined(AMREX_USE_HIP)
-#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
-#    include <rocfft/rocfft.h>
-#  else
-#    include <rocfft.h>
-#  endif
-#else
-#  include <fftw3.h>
-#endif
+struct VendorPlan;
 
-#include <AMReX_LayoutData.H>
+enum struct FFTType {
+    C2C_2D_fwd,
+    C2C_2D_bkw,
+    C2R_2D,
+    R2C_2D,
+    R2R_2D,
+    C2R_1D_batched
+};
 
-/**
- * \brief Wrapper around multiple FFT libraries.
- *
- * The header file defines the API and the base types
- * (Complex and VendorFFTPlan), and the implementation for different FFT libraries is
- * done in different cpp files. This wrapper only depends on the underlying FFT library
- * AND on AMReX (There is no dependence on WarpX).
- */
-namespace AnyFFT
-{
-    // First, define library-dependent types (complex, FFT plan)
-
-    /** Complex type for FFT, depends on FFT library */
-#ifdef AMREX_USE_CUDA
-#  ifdef AMREX_USE_FLOAT
-    using Complex = cuComplex;
-#  else
-    using Complex = cuDoubleComplex;
-#  endif
-#elif defined(AMREX_USE_HIP)
-#  ifdef AMREX_USE_FLOAT
-    using Complex = float2;
-#  else
-    using Complex = double2;
-#  endif
-#else
-#  ifdef AMREX_USE_FLOAT
-    using Complex = fftwf_complex;
-#  else
-    using Complex = fftw_complex;
-#  endif
-#endif
+struct AnyFFT {
 
-    /** Library-dependent FFT plans type, which holds one fft plan per box
-     * (plans are only initialized for the boxes that are owned by the local MPI rank).
+    /** \brief Initialize an FFT plan for the requested transform type using a Vendor FFT library.
+     * For 1D batched transforms, ny represents the number of batches to calculate at once.
+     * This function returns the number of bytes of the work area needed for the FFT. The work area
+     * has to be allocated by the function that uses the FFT and passed into SetBuffers.
+     *
+     * \param[in] type Type of FFT to perform
+     * \param[in] nx Size of the contiguous dimension of the FFT
+     * \param[in] ny Size of the second dimension of the FFT
      */
-#ifdef AMREX_USE_CUDA
-    using VendorFFTPlan = cufftHandle;
-#elif defined(AMREX_USE_HIP)
-    using VendorFFTPlan = rocfft_plan;
-#else
-#  ifdef AMREX_USE_FLOAT
-    using VendorFFTPlan = fftwf_plan;
-#  else
-    using VendorFFTPlan = fftw_plan;
-#  endif
-#endif
-
-    // Second, define library-independent API
-
-    /** Direction in which the FFT is performed. */
-    enum struct direction {R2C, C2R};
+    std::size_t Initialize (FFTType type, int nx, int ny);
 
-    /** \brief This struct contains the vendor FFT plan and additional metadata
+    /** \brief Set the pointers to the input, output and work area of the FFT.
+     * This function has to be called after Initialize and before Execute.
+     *
+     * \param[in] in Pointer to the input of the FFT
+     * \param[in] out Pointer to the output of the FFT
+     * \param[in] work_area Pointer to the work area for the FFT
      */
-    struct FFTplan
-    {
-        amrex::Real* m_real_array; /**< pointer to real array */
-        Complex* m_complex_array; /**< pointer to complex array */
-        VendorFFTPlan m_plan; /**< Vendor FFT plan */
-        direction m_dir;  /**< direction (C2R or R2C) */
-    };
+    void SetBuffers (void* in, void* out, void* work_area);
 
-    /** Collection of FFT plans, one FFTplan per box */
-    using FFTplans = amrex::LayoutData<FFTplan>;
+    /** \brief Perform the initialized FFT */
+    void Execute ();
 
-    /** \brief create FFT plan for the backend FFT library.
-     * \param[in] real_size Size of the real array, along each dimension.
-     * \param[out] real_array Real array from/to where R2C/C2R FFT is performed
-     * \param[out] complex_array Complex array to/from where R2C/C2R FFT is performed
-     * \param[in] dir direction, either R2C or C2R
-     */
-    FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array,
-                        Complex * const complex_array, const direction dir);
+    /** \brief Destructor to destroy the FFT plan */
+    ~AnyFFT ();
 
-    /** \brief Destroy library FFT plan.
-     * \param[out] fft_plan plan to destroy
-     */
-    void DestroyPlan (FFTplan& fft_plan);
+    /** \brief Setup function that has to be called before any FFT plan is initialized. */
+    static void setup ();
 
-    /** \brief Perform FFT with backend library.
-     * \param[out] fft_plan plan for which the FFT is performed
-     */
-    void Execute (FFTplan& fft_plan);
-}
+    /** \brief Cleanup function that has to be called at the end of the program. */
+    static void cleanup ();
+
+private:
+    /** Vendor specific data for the FFT */
+    VendorPlan* m_plan = nullptr;
+};
 
 #endif // ANYFFT_H_
diff --git a/src/fields/fft_poisson_solver/fft/CMakeLists.txt b/src/fields/fft_poisson_solver/fft/CMakeLists.txt
index e89ecd84f3..e8ef1c1fd7 100644
--- a/src/fields/fft_poisson_solver/fft/CMakeLists.txt
+++ b/src/fields/fft_poisson_solver/fft/CMakeLists.txt
@@ -9,20 +9,15 @@ if (HiPACE_COMPUTE STREQUAL CUDA)
   target_sources(HiPACE
     PRIVATE
         WrapCuFFT.cpp
-        WrapCuDST.cpp
-        CuFFTUtils.cpp
   )
 elseif(HiPACE_COMPUTE STREQUAL HIP)
   target_sources(HiPACE
     PRIVATE
         WrapRocFFT.cpp
-        WrapRocDST.cpp
-        RocFFTUtils.cpp
   )
 else()
   target_sources(HiPACE
     PRIVATE
         WrapFFTW.cpp
-        WrapDSTW.cpp
   )
 endif()
diff --git a/src/fields/fft_poisson_solver/fft/CuFFTUtils.H b/src/fields/fft_poisson_solver/fft/CuFFTUtils.H
deleted file mode 100644
index 01daf5c2b9..0000000000
--- a/src/fields/fft_poisson_solver/fft/CuFFTUtils.H
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2020-2021
- *
- * This file is part of HiPACE++.
- *
- * Authors: Axel Huebl, MaxThevenet
- * License: BSD-3-Clause-LBNL
- */
-#ifndef CUFFTUTILS_H_
-#define CUFFTUTILS_H_
-
-#include <cufft.h>
-
-#include <string>
-
-
-namespace CuFFTUtils
-{
-    /** \brief This method converts a cufftResult
-     * into the corresponding string
-     *
-     * @param[in] err a cufftResult
-     * @return an std::string
-     */
-    std::string cufftErrorToString (const cufftResult& err);
-}
-
-#endif // CUFFTUTILS_H_
diff --git a/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp b/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp
deleted file mode 100644
index 090dcf2b9c..0000000000
--- a/src/fields/fft_poisson_solver/fft/CuFFTUtils.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2020
- *
- * This file is part of HiPACE++.
- *
- * Authors: MaxThevenet
- * License: BSD-3-Clause-LBNL
- */
-#include "CuFFTUtils.H"
-
-#include <map>
-
-namespace CuFFTUtils
-{
-    std::string cufftErrorToString (const cufftResult& err)
-    {
-        const auto res2string = std::map<cufftResult, std::string>{
-            {CUFFT_SUCCESS, "CUFFT_SUCCESS"},
-            {CUFFT_INVALID_PLAN,"CUFFT_INVALID_PLAN"},
-            {CUFFT_ALLOC_FAILED,"CUFFT_ALLOC_FAILED"},
-            {CUFFT_INVALID_TYPE,"CUFFT_INVALID_TYPE"},
-            {CUFFT_INVALID_VALUE,"CUFFT_INVALID_VALUE"},
-            {CUFFT_INTERNAL_ERROR,"CUFFT_INTERNAL_ERROR"},
-            {CUFFT_EXEC_FAILED,"CUFFT_EXEC_FAILED"},
-            {CUFFT_SETUP_FAILED,"CUFFT_SETUP_FAILED"},
-            {CUFFT_INVALID_SIZE,"CUFFT_INVALID_SIZE"},
-            {CUFFT_UNALIGNED_DATA,"CUFFT_UNALIGNED_DATA"}};
-
-        const auto it = res2string.find(err);
-        if(it != res2string.end()){
-            return it->second;
-        }
-        else{
-            return std::to_string(err) +
-                " (unknown error code)";
-        }
-    }
-}
diff --git a/src/fields/fft_poisson_solver/fft/RocFFTUtils.H b/src/fields/fft_poisson_solver/fft/RocFFTUtils.H
deleted file mode 100644
index 0180b0a97f..0000000000
--- a/src/fields/fft_poisson_solver/fft/RocFFTUtils.H
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2021
- *
- * This file is part of HiPACE++.
- *
- * Authors: Axel Huebl
- * License: BSD-3-Clause-LBNL
- */
-#ifndef ROCFFTUTILS_H_
-#define ROCFFTUTILS_H_
-
-#if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
-#  include <rocfft/rocfft.h>
-#else
-#  include <rocfft.h>
-#endif
-
-#include <string>
-
-
-namespace RocFFTUtils
-{
-    void assert_rocfft_status (std::string const& name, rocfft_status status);
-
-    /** \brief This method converts a cufftResult
-     * into the corresponding string
-     *
-     * @param[in] err a cufftResult
-     * @return an std::string
-     */
-    std::string rocfftErrorToString (const rocfft_status err);
-}
-
-#endif // ROCFFTUTILS_H_
diff --git a/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp b/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp
deleted file mode 100644
index 5c6f3d68ed..0000000000
--- a/src/fields/fft_poisson_solver/fft/RocFFTUtils.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2021
- *
- * This file is part of HiPACE++.
- *
- * Authors: Axel Huebl
- * License: BSD-3-Clause-LBNL
- */
-#include "RocFFTUtils.H"
-#include <AMReX.H>
-
-#include <map>
-#include <string>
-
-namespace RocFFTUtils
-{
-    void assert_rocfft_status (std::string const& name, rocfft_status status)
-    {
-        if (status != rocfft_status_success) {
-            amrex::Abort(name + " failed! Error: " + rocfftErrorToString(status));
-        }
-    }
-
-    std::string rocfftErrorToString (const rocfft_status err)
-    {
-        if              (err == rocfft_status_success) {
-            return std::string("rocfft_status_success");
-        } else if       (err == rocfft_status_failure) {
-            return std::string("rocfft_status_failure");
-        } else if       (err == rocfft_status_invalid_arg_value) {
-            return std::string("rocfft_status_invalid_arg_value");
-        } else if       (err == rocfft_status_invalid_dimensions) {
-            return std::string("rocfft_status_invalid_dimensions");
-        } else if       (err == rocfft_status_invalid_array_type) {
-            return std::string("rocfft_status_invalid_array_type");
-        } else if       (err == rocfft_status_invalid_strides) {
-            return std::string("rocfft_status_invalid_strides");
-        } else if       (err == rocfft_status_invalid_distance) {
-            return std::string("rocfft_status_invalid_distance");
-        } else if       (err == rocfft_status_invalid_offset) {
-            return std::string("rocfft_status_invalid_offset");
-        } else {
-            return std::to_string(err) + " (unknown error code)";
-        }
-    }
-}
diff --git a/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp b/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp
deleted file mode 100644
index 571e402010..0000000000
--- a/src/fields/fft_poisson_solver/fft/WrapCuDST.cpp
+++ /dev/null
@@ -1,429 +0,0 @@
-/* Copyright 2020-2022
- *
- * This file is part of HiPACE++.
- *
- * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs
- * License: BSD-3-Clause-LBNL
- */
-#include "AnyDST.H"
-#include "CuFFTUtils.H"
-#include "utils/GPUUtil.H"
-#include "utils/HipaceProfilerWrapper.H"
-#include "utils/Parser.H"
-
-#include <AMReX_Config.H>
-
-namespace AnyDST
-{
-#ifdef AMREX_USE_FLOAT
-    cufftType VendorR2C = CUFFT_R2C;
-    cufftType VendorC2R = CUFFT_C2R;
-#else
-    cufftType VendorR2C = CUFFT_D2Z;
-    cufftType VendorC2R = CUFFT_Z2D;
-#endif
-
-    /** \brief Extend src into a symmetrized larger array dst
-     *
-     * \param[in,out] dst destination array, odd symmetry around 0 and the middle points in x and y
-     * \param[in] src source array
-     */
-    void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src)
-    {
-        constexpr int scomp = 0;
-        constexpr int dcomp = 0;
-
-        const amrex::Box bx = src.box();
-        const int nx = bx.length(0);
-        const int ny = bx.length(1);
-        const amrex::IntVect lo = bx.smallEnd();
-        Array2<amrex::Real const> const src_array = src.const_array(scomp);
-        Array2<amrex::Real> const dst_array = dst.array(dcomp);
-
-        amrex::ParallelFor(
-            bx,
-            [=] AMREX_GPU_DEVICE(int i, int j, int)
-            {
-                /* upper left quadrant */
-                dst_array(i+1,j+1) = src_array(i, j);
-                /* lower left quadrant */
-                dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]);
-                /* upper right quadrant */
-                dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j);
-                /* lower right quadrant */
-                dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]);
-            }
-            );
-    };
-
-    /** \brief Extract symmetrical src array into smaller array dst
-     *
-     * \param[in,out] dst destination array
-     * \param[in] src destination array, symmetric in x and y
-     */
-    void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab<amrex::GpuComplex<amrex::Real>>& src)
-    {
-        constexpr int scomp = 0;
-        constexpr int dcomp = 0;
-
-        const amrex::Box bx = dst.box();
-        Array2<amrex::GpuComplex<amrex::Real> const> const src_array = src.const_array(scomp);
-        Array2<amrex::Real> const dst_array = dst.array(dcomp);
-        amrex::ParallelFor(
-            bx,
-            [=] AMREX_GPU_DEVICE(int i, int j, int)
-            {
-                /* upper left quadrant */
-                dst_array(i,j) = -src_array(i+1, j+1).real();
-            }
-            );
-    };
-
-    /** \brief Make Complex array out of Real array to prepare for fft.
-     * out[idx] = -in[2*idx-2] + in[2*idx] + i*in[2*idx-1] for each column with
-     * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1]
-     *
-     * \param[in] in input real array
-     * \param[out] out output complex array
-     * \param[in] n_data number of (contiguous) rows in position matrix
-     * \param[in] n_batch number of (strided) columns in position matrix
-     */
-    void ToComplex (const amrex::Real* const AMREX_RESTRICT in,
-                    amrex::GpuComplex<amrex::Real>* const AMREX_RESTRICT out,
-                    const int n_data, const int n_batch)
-    {
-        const int n_half = (n_data+1)/2;
-        if((n_data%2 == 1)) {
-            amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
-                [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-                {
-                    const int stride_in = n_data*j;
-                    const int i_is_zero = (i==0);
-                    const int i_is_n_half = (i==n_half);
-                    const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
-                                             +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half);
-                    const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in]
-                                             *!i_is_zero*!i_is_n_half;
-                    out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
-                });
-        } else {
-            amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
-                [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-                {
-                    const int stride_in = n_data*j;
-                    const int i_is_zero = (i==0);
-                    const int i_is_n_half = (i==n_half);
-                    const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
-                                             +in[2*i-i_is_n_half+stride_in]*!i_is_n_half;
-                    const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero;
-                    out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
-                });
-        }
-    };
-
-    /** \brief Complex to Real fft for every column of the input matrix.
-     * The output Matrix has its indexes reversed compared to some other libraries
-     *
-     * \param[in] plan cuda fft plan for transformation
-     * \param[in] in input complex array
-     * \param[out] out output real array
-     */
-    void C2Rfft (AnyFFT::VendorFFTPlan& plan, amrex::GpuComplex<amrex::Real>* AMREX_RESTRICT in,
-                 amrex::Real* const AMREX_RESTRICT out)
-    {
-        cudaStream_t stream = amrex::Gpu::Device::cudaStream();
-        cufftSetStream(plan, stream);
-        cufftResult result;
-
-#ifdef AMREX_USE_FLOAT
-        result = cufftExecC2R(plan, reinterpret_cast<AnyFFT::Complex*>(in), out);
-#else
-        result = cufftExecZ2D(plan, reinterpret_cast<AnyFFT::Complex*>(in), out);
-#endif
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " forward transform using cufftExec failed ! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
-        }
-    };
-
-    /** \brief Make Sine-space Real array out of array from fft.
-     * out[idx] = 0.5 *(in[n_data-idx] - in[idx+1] + (in[n_data-idx] + in[idx+1])/
-     * (2*sin((idx+1)*pi/(n_data+1)))) for each column
-     *
-     * \param[in] in input real array
-     * \param[out] out output real array
-     * \param[in] n_data number of (contiguous) rows in position matrix
-     * \param[in] n_batch number of (strided) columns in position matrix
-     */
-    void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out,
-                 const int n_data, const int n_batch)
-    {
-        using namespace amrex::literals;
-
-        const amrex::Real n_1_real = n_data+1._rt;
-        const int n_1 = n_data+1;
-        amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}},
-            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-            {
-                const int i_rev = n_1-i;
-                const int stride_in = n_1*j;
-                const int stride_out = n_data*j;
-                const amrex::Real in_a = in[i+stride_in];
-                const amrex::Real in_b = in[i_rev+stride_in];
-#ifdef AMREX_USE_FLOAT
-                out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpif(i/n_1_real)));
-                out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt
-                                          *sinpif(i_rev/n_1_real)));
-#else
-                out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpi(i/n_1_real)));
-                out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt
-                                          *sinpi(i_rev/n_1_real)));
-#endif
-            });
-    };
-
-    /** \brief Transpose input matrix
-     * out[idy][idx] = in[idx][idy]
-     *
-     * \param[in] in input real array
-     * \param[out] out output real array
-     * \param[in] n_data number of (contiguous) rows in input matrix
-     * \param[in] n_batch number of (strided) columns in input matrix
-     */
-    void Transpose (const amrex::Real* const AMREX_RESTRICT in,
-                    amrex::Real* const AMREX_RESTRICT out,
-                    const int n_data, const int n_batch)
-    {
-        constexpr int tile_dim = 32; //must be power of 2
-        constexpr int block_rows = 8;
-        const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim;
-        const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim;
-        amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows,
-            tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(),
-            [=] AMREX_GPU_DEVICE() noexcept
-            {
-                amrex::Gpu::SharedMemory<amrex::Real> gsm;
-                amrex::Real* const tile = gsm.dataPtr();
-
-                const int thread_x = threadIdx.x&(tile_dim-1);
-                const int thread_y = threadIdx.x/tile_dim;
-                const int block_y = blockIdx.x/num_blocks_x;
-                const int block_x = blockIdx.x - block_y*num_blocks_x;
-                int mat_x = block_x * tile_dim + thread_x;
-                int mat_y = block_y * tile_dim + thread_y;
-
-                for (int i = 0; i < tile_dim; i += block_rows) {
-                    if(mat_x < n_data && (mat_y+i) < n_batch) {
-                        tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x];
-                    }
-                }
-
-                __syncthreads();
-
-                mat_x = block_y * tile_dim + thread_x;
-                mat_y = block_x * tile_dim + thread_y;
-
-                for (int i = 0; i < tile_dim; i += block_rows) {
-                    if(mat_x < n_batch && (mat_y+i) < n_data) {
-                        out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i];
-                    }
-                }
-            });
-    };
-
-    DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array,
-                        amrex::FArrayBox* fourier_array)
-    {
-        HIPACE_PROFILE("AnyDST::CreatePlan()");
-        DSTplan dst_plan;
-
-        amrex::ParmParse pp("hipace");
-        dst_plan.use_small_dst = (std::max(real_size[0], real_size[1]) >= 511);
-        queryWithParser(pp, "use_small_dst", dst_plan.use_small_dst);
-
-        if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) {
-            AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024),
-            "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA "
-            "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes.");
-        }
-
-        if(!dst_plan.use_small_dst) {
-            const int nx = real_size[0];
-            const int ny = real_size[1];
-
-            // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2)
-            // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2)
-            amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}};
-            amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}};
-            // shift box to match rest of fields
-            expanded_position_box += position_array->box().smallEnd();
-            expanded_fourier_box += fourier_array->box().smallEnd();
-            dst_plan.m_expanded_position_array =
-                std::make_unique<amrex::FArrayBox>(
-                    expanded_position_box, 1);
-            dst_plan.m_expanded_fourier_array =
-                std::make_unique<amrex::BaseFab<amrex::GpuComplex<amrex::Real>>>(
-                    expanded_fourier_box, 1);
-
-            // setting the initial values to 0
-            // we don't set the expanded Fourier array, because it will be initialized by the FFT
-            dst_plan.m_expanded_position_array->setVal<amrex::RunOn::Device>(0.,
-                dst_plan.m_expanded_position_array->box(), 0,
-                dst_plan.m_expanded_position_array->nComp());
-
-            const amrex::IntVect& expanded_size = expanded_position_box.length();
-
-            // Initialize fft_plan.m_plan with the vendor fft plan.
-            cufftResult result;
-            result = cufftPlan2d(
-                &(dst_plan.m_plan), expanded_size[1], expanded_size[0], VendorR2C);
-
-            if ( result != CUFFT_SUCCESS ) {
-                amrex::Print() << " cufftplan failed! Error: " <<
-                    CuFFTUtils::cufftErrorToString(result) << "\n";
-            }
-
-            std::size_t buffersize = 0;
-            cufftGetSize(dst_plan.m_plan, &buffersize);
-
-            std::size_t mb = 1024*1024;
-
-            amrex::Print() << "using R2C cuFFT of size " << expanded_size[0] << " * "
-                << expanded_size[1] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n";
-
-            // Store meta-data in dst_plan
-            dst_plan.m_position_array = position_array;
-            dst_plan.m_fourier_array = fourier_array;
-
-            return dst_plan;
-        }
-        else {
-            const int nx = real_size[0]; // contiguous
-            const int ny = real_size[1]; // not contiguous
-
-            // Allocate 1d Array for 2d data or 2d transpose data
-            const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx);
-            const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx);
-            amrex::Box real_box {{0, 0, 0}, {real_1d_size-1, 0, 0}};
-            amrex::Box complex_box {{0, 0, 0}, {complex_1d_size-1, 0, 0}};
-            dst_plan.m_expanded_position_array =
-                std::make_unique<amrex::FArrayBox>(
-                    real_box, 1);
-            dst_plan.m_expanded_fourier_array =
-                std::make_unique<amrex::BaseFab<amrex::GpuComplex<amrex::Real>>>(
-                    complex_box, 1);
-
-            // Initialize fft_plan.m_plan with the vendor fft plan.
-            int s_1 = nx+1;
-            cufftResult result;
-            result = cufftPlanMany(
-                &(dst_plan.m_plan), 1, &s_1, NULL, 1, (nx+1)/2+1, NULL, 1, nx+1, VendorC2R, ny);
-
-            if ( result != CUFFT_SUCCESS ) {
-                amrex::Print() << " cufftplan failed! Error: " <<
-                    CuFFTUtils::cufftErrorToString(result) << "\n";
-            }
-
-            // Initialize transposed fft_plan.m_plan_b with the vendor fft plan.
-            int s_2 = ny+1;
-            cufftResult resultb;
-            resultb = cufftPlanMany(
-                &(dst_plan.m_plan_b), 1, &s_2, NULL, 1, (ny+1)/2+1, NULL, 1, ny+1, VendorC2R, nx);
-
-            if ( resultb != CUFFT_SUCCESS ) {
-                amrex::Print() << " cufftplan failed! Error: " <<
-                    CuFFTUtils::cufftErrorToString(resultb) << "\n";
-            }
-
-            std::size_t buffersize = 0;
-            std::size_t buffersize_b = 0;
-            cufftGetSize(dst_plan.m_plan, &buffersize);
-            cufftGetSize(dst_plan.m_plan_b, &buffersize_b);
-
-            std::size_t mb = 1024*1024;
-
-            amrex::Print() << "using C2R cuFFT of sizes " << s_1 << " and "
-                << s_2 << " with " << (buffersize+buffersize_b+mb-1)/mb << " MiB of work area\n";
-
-            // Store meta-data in dst_plan
-            dst_plan.m_position_array = position_array;
-            dst_plan.m_fourier_array = fourier_array;
-
-            return dst_plan;
-        }
-    }
-
-    void DestroyPlan (DSTplan& dst_plan)
-    {
-        cufftDestroy( dst_plan.m_plan );
-        cufftDestroy( dst_plan.m_plan_b );
-    }
-
-    void Execute (DSTplan& dst_plan, direction d){
-        HIPACE_PROFILE("AnyDST::Execute()");
-
-        if(!dst_plan.use_small_dst) {
-            // Swap position and fourier space based on execute direction
-            amrex::FArrayBox* position_array =
-                (d == direction::forward) ? dst_plan.m_position_array : dst_plan.m_fourier_array;
-            amrex::FArrayBox* fourier_array =
-                (d == direction::forward) ? dst_plan.m_fourier_array : dst_plan.m_position_array;
-
-            // Expand in position space m_position_array -> m_expanded_position_array
-            ExpandR2R(*dst_plan.m_expanded_position_array, *position_array);
-
-            cudaStream_t stream = amrex::Gpu::Device::cudaStream();
-            cufftSetStream ( dst_plan.m_plan, stream);
-            cufftResult result;
-
-            // R2C FFT m_expanded_position_array -> m_expanded_fourier_array
-#ifdef AMREX_USE_FLOAT
-            result = cufftExecR2C(
-                dst_plan.m_plan, dst_plan.m_expanded_position_array->dataPtr(),
-                reinterpret_cast<AnyFFT::Complex*>(dst_plan.m_expanded_fourier_array->dataPtr()));
-#else
-            result = cufftExecD2Z(
-                dst_plan.m_plan, dst_plan.m_expanded_position_array->dataPtr(),
-                reinterpret_cast<AnyFFT::Complex*>(dst_plan.m_expanded_fourier_array->dataPtr()));
-#endif
-            // Shrink in Fourier space m_expanded_fourier_array -> m_fourier_array
-            ShrinkC2R(*fourier_array, *dst_plan.m_expanded_fourier_array);
-
-            if ( result != CUFFT_SUCCESS ) {
-                amrex::Print() << " forward transform using cufftExec failed ! Error: " <<
-                    CuFFTUtils::cufftErrorToString(result) << "\n";
-            }
-        }
-        else {
-            const int nx = dst_plan.m_position_array->box().length(0); // initially contiguous
-            const int ny = dst_plan.m_position_array->box().length(1); // contiguous after transpose
-
-            amrex::Real* const tmp_pos_arr = dst_plan.m_position_array->dataPtr();
-            amrex::Real* const tmp_fourier_arr = dst_plan.m_fourier_array->dataPtr();
-            amrex::GpuComplex<amrex::Real>* comp_arr = dst_plan.m_expanded_fourier_array->dataPtr();
-            amrex::Real* const real_arr = dst_plan.m_expanded_position_array->dataPtr();
-
-            // Swap position and fourier space based on execute direction
-            amrex::Real* const pos_arr =
-                (d == direction::forward) ? tmp_pos_arr : tmp_fourier_arr;
-            amrex::Real* const fourier_arr =
-                (d == direction::forward) ? tmp_fourier_arr : tmp_pos_arr;
-
-            ToComplex(pos_arr, comp_arr, nx, ny);
-
-            C2Rfft(dst_plan.m_plan, comp_arr, real_arr);
-
-            ToSine(real_arr, pos_arr, nx, ny);
-
-            Transpose(pos_arr, fourier_arr, nx, ny);
-
-            ToComplex(fourier_arr, comp_arr, ny, nx);
-
-            C2Rfft(dst_plan.m_plan_b, comp_arr, real_arr);
-
-            ToSine(real_arr, pos_arr, ny, nx);
-
-            Transpose(pos_arr, fourier_arr, ny, nx);
-        }
-    }
-}
diff --git a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp
index 739ab2cdcb..84471c665c 100644
--- a/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp
+++ b/src/fields/fft_poisson_solver/fft/WrapCuFFT.cpp
@@ -1,8 +1,8 @@
-/* Copyright 2020-2022
+/* Copyright 2020-2024
  *
  * This file is part of HiPACE++.
  *
- * Authors: MaxThevenet, Remi Lehe, Severin Diederichs
+ * Authors: AlexanderSinn, MaxThevenet, Remi Lehe, Severin Diederichs
  * License: BSD-3-Clause-LBNL
  */
 /* Copyright 2019-2020
@@ -12,84 +12,239 @@
  * License: BSD-3-Clause-LBNL
  */
 #include "AnyFFT.H"
-#include "CuFFTUtils.H"
-#include "utils/HipaceProfilerWrapper.H"
 
-namespace AnyFFT
-{
+#include <AMReX.H>
+#include <AMReX_GpuDevice.H>
+
+#include <cufft.h>
+
+#include <map>
+#include <string>
 
 #ifdef AMREX_USE_FLOAT
-    cufftType VendorR2C = CUFFT_R2C;
-    cufftType VendorC2R = CUFFT_C2R;
+static constexpr bool use_float = true;
 #else
-    cufftType VendorR2C = CUFFT_D2Z;
-    cufftType VendorC2R = CUFFT_Z2D;
+static constexpr bool use_float = false;
 #endif
 
-    FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array,
-                        Complex * const complex_array, const direction dir)
-    {
-        HIPACE_PROFILE("AnyFFT::CreatePlan()");
-        FFTplan fft_plan;
+struct VendorPlan {
+    cufftHandle m_cufftplan;
+    FFTType m_type;
+    void* m_in;
+    void* m_out;
+};
 
-        if (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 1) {
-            AMREX_ALWAYS_ASSERT_WITH_MESSAGE((std::max(real_size[0], real_size[1]) <= 1024),
-            "Due to a bug in cuFFT, CUDA 11.1 supports only nx, ny <= 1024. Please use CUDA "
-            "version >= 11.2 (recommended) or <= 11.0 for larger grid sizes.");
-        }
+std::string cufftErrorToString (const cufftResult& err) {
+    const auto res2string = std::map<cufftResult, std::string>{
+        {CUFFT_SUCCESS, "CUFFT_SUCCESS"},
+        {CUFFT_INVALID_PLAN,"CUFFT_INVALID_PLAN"},
+        {CUFFT_ALLOC_FAILED,"CUFFT_ALLOC_FAILED"},
+        {CUFFT_INVALID_TYPE,"CUFFT_INVALID_TYPE"},
+        {CUFFT_INVALID_VALUE,"CUFFT_INVALID_VALUE"},
+        {CUFFT_INTERNAL_ERROR,"CUFFT_INTERNAL_ERROR"},
+        {CUFFT_EXEC_FAILED,"CUFFT_EXEC_FAILED"},
+        {CUFFT_SETUP_FAILED,"CUFFT_SETUP_FAILED"},
+        {CUFFT_INVALID_SIZE,"CUFFT_INVALID_SIZE"},
+        {CUFFT_UNALIGNED_DATA,"CUFFT_UNALIGNED_DATA"}
+    };
 
-        // Initialize fft_plan.m_plan with the vendor fft plan.
-        cufftResult result;
-        if (dir == direction::R2C){
-            result = cufftPlan2d(
-                &(fft_plan.m_plan), real_size[1], real_size[0], VendorR2C);
-        } else {
-            result = cufftPlan2d(
-                &(fft_plan.m_plan), real_size[1], real_size[0], VendorC2R);
-        }
+    const auto it = res2string.find(err);
+    if(it != res2string.end()){
+        return it->second;
+    } else {
+        return std::to_string(err) + " (unknown error code)";
+    }
+}
 
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " cufftplan failed! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
-        }
+void assert_cufft_status (std::string const& name, const cufftResult& status) {
+    if (status != CUFFT_SUCCESS) {
+        amrex::Abort(name + " failed! Error: " + cufftErrorToString(status));
+    }
+}
 
-        // Store meta-data in fft_plan
-        fft_plan.m_real_array = real_array;
-        fft_plan.m_complex_array = complex_array;
-        fft_plan.m_dir = dir;
+std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) {
+    // https://docs.nvidia.com/cuda/cufft/index.html#cufft-api-reference
+    m_plan = new VendorPlan;
 
-        return fft_plan;
-    }
+    m_plan->m_type = type;
+    cufftType transform_type;
+    int rank = 0;
+    // n is in C order
+    long long int n[2] = {0, 0};
+    long long int batch = 0;
 
-    void DestroyPlan (FFTplan& fft_plan)
-    {
-        cufftDestroy( fft_plan.m_plan );
+    switch (type) {
+        case FFTType::C2C_2D_fwd:
+            transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z;
+            rank = 2;
+            n[0] = ny;
+            n[1] = nx;
+            batch = 1;
+            break;
+        case FFTType::C2C_2D_bkw:
+            transform_type = use_float ? CUFFT_C2C : CUFFT_Z2Z;
+            rank = 2;
+            n[0] = ny;
+            n[1] = nx;
+            batch = 1;
+            break;
+        case FFTType::C2R_2D:
+            transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D;
+            rank = 2;
+            n[0] = ny;
+            n[1] = nx;
+            batch = 1;
+            break;
+        case FFTType::R2C_2D:
+            transform_type = use_float ? CUFFT_R2C : CUFFT_D2Z;
+            rank = 2;
+            n[0] = ny;
+            n[1] = nx;
+            batch = 1;
+            break;
+        case FFTType::R2R_2D:
+            amrex::Abort("R2R FFT not supported by cufft");
+            return 0;
+        case FFTType::C2R_1D_batched:
+            transform_type = use_float ? CUFFT_C2R : CUFFT_Z2D;
+            rank = 1;
+            n[0] = nx;
+            batch = ny;
+            break;
     }
 
-    void Execute (FFTplan& fft_plan){
-        HIPACE_PROFILE("AnyFFT::Execute()");
-        // make sure that this is done on the same GPU stream as the above copy
-        cudaStream_t stream = amrex::Gpu::Device::cudaStream();
-        cufftSetStream ( fft_plan.m_plan, stream);
-        cufftResult result;
-        if (fft_plan.m_dir == direction::R2C){
-#ifdef AMREX_USE_FLOAT
-            result = cufftExecR2C(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array);
-#else
-            result = cufftExecD2Z(fft_plan.m_plan, fft_plan.m_real_array, fft_plan.m_complex_array);
-#endif
-        } else if (fft_plan.m_dir == direction::C2R){
-#ifdef AMREX_USE_FLOAT
-            result = cufftExecC2R(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array);
-#else
-            result = cufftExecZ2D(fft_plan.m_plan, fft_plan.m_complex_array, fft_plan.m_real_array);
-#endif
-        } else {
-            amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R");
+    cufftResult result;
+
+    result = cufftCreate(&(m_plan->m_cufftplan));
+    assert_cufft_status("cufftCreate", result);
+
+    result = cufftSetAutoAllocation(m_plan->m_cufftplan, 0);
+    assert_cufft_status("cufftSetAutoAllocation", result);
+
+    std::size_t workSize = 0;
+
+    result = cufftMakePlanMany64(
+        m_plan->m_cufftplan,
+        rank,
+        n,
+        nullptr, 0, 0,
+        nullptr, 0, 0,
+        transform_type,
+        batch,
+        &workSize);
+    assert_cufft_status("cufftMakePlanMany64", result);
+
+    result = cufftSetStream(m_plan->m_cufftplan, amrex::Gpu::Device::cudaStream());
+    assert_cufft_status("cufftSetStream", result);
+
+    return workSize;
+}
+
+void AnyFFT::SetBuffers (void* in, void* out, void* work_area) {
+    m_plan->m_in = in;
+    m_plan->m_out = out;
+
+    cufftResult result;
+
+    result = cufftSetWorkArea(m_plan->m_cufftplan, work_area);
+    assert_cufft_status("cufftSetWorkArea", result);
+}
+
+void AnyFFT::Execute () {
+    cufftResult result;
+
+    // There is also cufftXtExec that could replace all of these specific Exec calls,
+    // however in testing it doesn't work
+    if constexpr (use_float) {
+        switch (m_plan->m_type) {
+            case FFTType::C2C_2D_fwd:
+                result = cufftExecC2C(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftComplex*>(m_plan->m_out),
+                    CUFFT_FORWARD);
+                assert_cufft_status("cufftExecC2C", result);
+                break;
+            case FFTType::C2C_2D_bkw:
+                result = cufftExecC2C(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftComplex*>(m_plan->m_out),
+                    CUFFT_INVERSE);
+                assert_cufft_status("cufftExecC2C", result);
+                break;
+            case FFTType::C2R_2D:
+                result = cufftExecC2R(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftReal*>(m_plan->m_out));
+                assert_cufft_status("cufftExecC2R", result);
+                break;
+            case FFTType::R2C_2D:
+                result = cufftExecR2C(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftReal*>(m_plan->m_in),
+                    reinterpret_cast<cufftComplex*>(m_plan->m_out));
+                assert_cufft_status("cufftExecR2C", result);
+                break;
+            case FFTType::R2R_2D:
+                amrex::Abort("R2R FFT not supported by cufft");
+                break;
+            case FFTType::C2R_1D_batched:
+                result = cufftExecC2R(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftReal*>(m_plan->m_out));
+                assert_cufft_status("cufftExecC2R", result);
+                break;
         }
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " forward transform using cufftExec failed ! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
+    } else {
+        switch (m_plan->m_type) {
+            case FFTType::C2C_2D_fwd:
+                result = cufftExecZ2Z(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_out),
+                    CUFFT_FORWARD);
+                assert_cufft_status("cufftExecZ2Z", result);
+                break;
+            case FFTType::C2C_2D_bkw:
+                result = cufftExecZ2Z(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_out),
+                    CUFFT_INVERSE);
+                assert_cufft_status("cufftExecZ2Z", result);
+                break;
+            case FFTType::C2R_2D:
+                result = cufftExecZ2D(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftDoubleReal*>(m_plan->m_out));
+                assert_cufft_status("cufftExecZ2D", result);
+                break;
+            case FFTType::R2C_2D:
+                result = cufftExecD2Z(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftDoubleReal*>(m_plan->m_in),
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_out));
+                assert_cufft_status("cufftExecD2Z", result);
+                break;
+            case FFTType::R2R_2D:
+                amrex::Abort("R2R FFT not supported by cufft");
+                break;
+            case FFTType::C2R_1D_batched:
+                result = cufftExecZ2D(m_plan->m_cufftplan,
+                    reinterpret_cast<cufftDoubleComplex*>(m_plan->m_in),
+                    reinterpret_cast<cufftDoubleReal*>(m_plan->m_out));
+                assert_cufft_status("cufftExecZ2D", result);
+                break;
         }
     }
 }
+
+AnyFFT::~AnyFFT () {
+    if (m_plan) {
+        cufftResult result;
+
+        result = cufftDestroy(m_plan->m_cufftplan);
+        assert_cufft_status("cufftDestroy", result);
+
+        delete m_plan;
+    }
+}
+
+void AnyFFT::setup () {}
+
+void AnyFFT::cleanup () {}
diff --git a/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp b/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp
deleted file mode 100644
index ca14a5e358..0000000000
--- a/src/fields/fft_poisson_solver/fft/WrapDSTW.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2020-2022
- *
- * This file is part of HiPACE++.
- *
- * Authors: AlexanderSinn, MaxThevenet, Severin Diederichs
- * License: BSD-3-Clause-LBNL
- */
-#include "AnyDST.H"
-#include "utils/HipaceProfilerWrapper.H"
-
-#ifdef AMREX_USE_OMP
-#   include <omp.h>
-#endif
-
-namespace AnyDST
-{
-#ifdef AMREX_USE_FLOAT
-    const auto VendorCreatePlanR2R2D = fftwf_plan_r2r_2d;
-#else
-    const auto VendorCreatePlanR2R2D = fftw_plan_r2r_2d;
-#endif
-
-    DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array,
-                        amrex::FArrayBox* fourier_array)
-    {
-        DSTplan dst_plan;
-        const int nx = real_size[0];
-        const int ny = real_size[1];
-
-#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP)
-        if (nx > 32 && ny > 32) {
-#   ifdef AMREX_USE_FLOAT
-            fftwf_init_threads();
-            fftwf_plan_with_nthreads(omp_get_max_threads());
-#   else
-            fftw_init_threads();
-            fftw_plan_with_nthreads(omp_get_max_threads());
-#   endif
-        }
-#endif
-
-        // Initialize fft_plan.m_plan with the vendor fft plan.
-        // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order
-        dst_plan.m_plan = VendorCreatePlanR2R2D(
-            ny, nx, position_array->dataPtr(), fourier_array->dataPtr(),
-            FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE);
-
-        // Initialize fft_plan.m_plan_b with the vendor fft plan.
-        // Swap arrays: now for backward direction.
-        dst_plan.m_plan_b = VendorCreatePlanR2R2D(
-            ny, nx, fourier_array->dataPtr(), position_array->dataPtr(),
-            FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE);
-
-        amrex::Print() << "using R2R FFTW of size " << nx << " * " << ny << "\n";
-
-        // Store meta-data in fft_plan
-        dst_plan.m_position_array = position_array;
-        dst_plan.m_fourier_array = fourier_array;
-
-        return dst_plan;
-    }
-
-    void DestroyPlan (DSTplan& dst_plan)
-    {
-#  ifdef AMREX_USE_FLOAT
-        fftwf_destroy_plan( dst_plan.m_plan );
-        fftwf_destroy_plan( dst_plan.m_plan_b );
-#  else
-        fftw_destroy_plan( dst_plan.m_plan );
-        fftw_destroy_plan( dst_plan.m_plan_b );
-#  endif
-    }
-
-    void Execute (DSTplan& dst_plan, direction d){
-        HIPACE_PROFILE("AnyDST::Execute()");
-        // Swap position and fourier space based on execute direction
-        AnyFFT::VendorFFTPlan& plan = (d==direction::forward) ? dst_plan.m_plan : dst_plan.m_plan_b;
-#  ifdef AMREX_USE_FLOAT
-        fftwf_execute( plan );
-#  else
-        fftw_execute( plan );
-#  endif
-    }
-}
diff --git a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp
index 8a6565dbed..e3dc010111 100644
--- a/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp
+++ b/src/fields/fft_poisson_solver/fft/WrapFFTW.cpp
@@ -1,8 +1,8 @@
-/* Copyright 2020
+/* Copyright 2020-2024
  *
  * This file is part of HiPACE++.
  *
- * Authors: MaxThevenet, Remi Lehe
+ * Authors: AlexanderSinn, MaxThevenet, Remi Lehe
  * License: BSD-3-Clause-LBNL
  */
 /* Copyright 2019-2020
@@ -12,61 +12,168 @@
  * License: BSD-3-Clause-LBNL
  */
 #include "AnyFFT.H"
-#include "utils/HipaceProfilerWrapper.H"
 
-namespace AnyFFT
-{
+#include <AMReX_Config.H>
+
+#ifdef AMREX_USE_OMP
+#include <omp.h>
+#endif
+
+#include <fftw3.h>
+
 #ifdef AMREX_USE_FLOAT
-    const auto VendorCreatePlanR2C3D = fftwf_plan_dft_r2c_3d;
-    const auto VendorCreatePlanC2R3D = fftwf_plan_dft_c2r_3d;
-    const auto VendorCreatePlanR2C2D = fftwf_plan_dft_r2c_2d;
-    const auto VendorCreatePlanC2R2D = fftwf_plan_dft_c2r_2d;
+static constexpr bool use_float = true;
 #else
-    const auto VendorCreatePlanR2C3D = fftw_plan_dft_r2c_3d;
-    const auto VendorCreatePlanC2R3D = fftw_plan_dft_c2r_3d;
-    const auto VendorCreatePlanR2C2D = fftw_plan_dft_r2c_2d;
-    const auto VendorCreatePlanC2R2D = fftw_plan_dft_c2r_2d;
+static constexpr bool use_float = false;
 #endif
 
-    FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array,
-                        Complex * const complex_array, const direction dir)
-    {
-        HIPACE_PROFILE("AnyFFT::CreatePlan()");
-        FFTplan fft_plan;
+struct VendorPlan {
+    fftwf_plan m_fftwf_plan;
+    fftw_plan m_fftw_plan;
+    FFTType m_type;
+    int m_nx;
+    int m_ny;
+};
+
+std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) {
+    // https://www.fftw.org/fftw3_doc/FFTW-Reference.html
+    m_plan = new VendorPlan;
 
-        // Initialize fft_plan.m_plan with the vendor fft plan.
-        // Swap dimensions: AMReX FAB are Fortran-order but FFTW is C-order
-        if (dir == direction::R2C){
-            fft_plan.m_plan = VendorCreatePlanR2C2D(
-                    real_size[1], real_size[0], real_array, complex_array, FFTW_ESTIMATE);
-        } else if (dir == direction::C2R){
-            fft_plan.m_plan = VendorCreatePlanC2R2D(
-                    real_size[1], real_size[0], complex_array, real_array, FFTW_ESTIMATE);
+    m_plan->m_type = type;
+    m_plan->m_nx = nx;
+    m_plan->m_ny = ny;
+    // fftw doesn't allow for the manual allocation of work area, additionally the input and output
+    // arrays have to be provided when planing, so we do all the work in the SetBuffers function.
+    return 0;
+}
+
+void AnyFFT::SetBuffers (void* in, void* out, [[maybe_unused]] void* work_area) {
+    if constexpr (use_float) {
+        switch (m_plan->m_type) {
+            case FFTType::C2C_2D_fwd:
+                m_plan->m_fftwf_plan = fftwf_plan_dft_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<fftwf_complex*>(in), reinterpret_cast<fftwf_complex*>(out),
+                    FFTW_FORWARD, FFTW_MEASURE);
+                break;
+            case FFTType::C2C_2D_bkw:
+                m_plan->m_fftwf_plan = fftwf_plan_dft_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<fftwf_complex*>(in), reinterpret_cast<fftwf_complex*>(out),
+                    FFTW_BACKWARD, FFTW_MEASURE);
+                break;
+            case FFTType::C2R_2D:
+                m_plan->m_fftwf_plan = fftwf_plan_dft_c2r_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<fftwf_complex*>(in), reinterpret_cast<float*>(out),
+                    FFTW_MEASURE);
+                break;
+            case FFTType::R2C_2D:
+                m_plan->m_fftwf_plan = fftwf_plan_dft_r2c_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<float*>(in), reinterpret_cast<fftwf_complex*>(out),
+                    FFTW_MEASURE);
+                break;
+            case FFTType::R2R_2D:
+                m_plan->m_fftwf_plan = fftwf_plan_r2r_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<float*>(in), reinterpret_cast<float*>(out),
+                    FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE);
+                break;
+            case FFTType::C2R_1D_batched:
+                {
+                    int n[1] = {m_plan->m_nx};
+                    m_plan->m_fftwf_plan = fftwf_plan_many_dft_c2r(
+                        1, n, m_plan->m_ny,
+                        reinterpret_cast<fftwf_complex*>(in), nullptr, 1, m_plan->m_nx/2+1,
+                        reinterpret_cast<float*>(out), nullptr, 1, m_plan->m_nx,
+                        FFTW_MEASURE);
+                }
+                break;
         }
+    } else {
+        switch (m_plan->m_type) {
+            case FFTType::C2C_2D_fwd:
+                m_plan->m_fftw_plan = fftw_plan_dft_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<fftw_complex*>(in), reinterpret_cast<fftw_complex*>(out),
+                    FFTW_FORWARD, FFTW_MEASURE);
+                break;
+            case FFTType::C2C_2D_bkw:
+                m_plan->m_fftw_plan = fftw_plan_dft_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<fftw_complex*>(in), reinterpret_cast<fftw_complex*>(out),
+                    FFTW_BACKWARD, FFTW_MEASURE);
+                break;
+            case FFTType::C2R_2D:
+                m_plan->m_fftw_plan = fftw_plan_dft_c2r_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<fftw_complex*>(in), reinterpret_cast<double*>(out),
+                    FFTW_MEASURE);
+                break;
+            case FFTType::R2C_2D:
+                m_plan->m_fftw_plan = fftw_plan_dft_r2c_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<double*>(in), reinterpret_cast<fftw_complex*>(out),
+                    FFTW_MEASURE);
+                break;
+            case FFTType::R2R_2D:
+                m_plan->m_fftw_plan = fftw_plan_r2r_2d(
+                    m_plan->m_ny, m_plan->m_nx,
+                    reinterpret_cast<double*>(in), reinterpret_cast<double*>(out),
+                    FFTW_RODFT00, FFTW_RODFT00, FFTW_MEASURE);
+                break;
+            case FFTType::C2R_1D_batched:
+                {
+                    int n[1] = {m_plan->m_nx};
+                    m_plan->m_fftw_plan = fftw_plan_many_dft_c2r(
+                        1, n, m_plan->m_ny,
+                        reinterpret_cast<fftw_complex*>(in), nullptr, 1, m_plan->m_nx/2+1,
+                        reinterpret_cast<double*>(out), nullptr, 1, m_plan->m_nx,
+                        FFTW_MEASURE);
+                }
+                break;
+        }
+    }
+}
 
-        // Store meta-data in fft_plan
-        fft_plan.m_real_array = real_array;
-        fft_plan.m_complex_array = complex_array;
-        fft_plan.m_dir = dir;
+void AnyFFT::Execute () {
+    if constexpr (use_float) {
+        fftwf_execute(m_plan->m_fftwf_plan);
+    } else {
+        fftw_execute(m_plan->m_fftw_plan);
+    }
+}
 
-        return fft_plan;
+AnyFFT::~AnyFFT () {
+    if (m_plan) {
+        if constexpr (use_float) {
+            fftwf_destroy_plan(m_plan->m_fftwf_plan);
+        } else {
+            fftw_destroy_plan(m_plan->m_fftw_plan);
+        }
+        delete m_plan;
     }
+}
 
-    void DestroyPlan (FFTplan& fft_plan)
-    {
-#  ifdef AMREX_USE_FLOAT
-        fftwf_destroy_plan( fft_plan.m_plan );
-#  else
-        fftw_destroy_plan( fft_plan.m_plan );
-#  endif
+void AnyFFT::setup () {
+#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP)
+    if constexpr (use_float) {
+        fftwf_init_threads();
+        fftwf_plan_with_nthreads(omp_get_max_threads());
+    } else {
+        fftw_init_threads();
+        fftw_plan_with_nthreads(omp_get_max_threads());
     }
+#endif
+}
 
-    void Execute (FFTplan& fft_plan){
-        HIPACE_PROFILE("Execute_FFTplan()");
-#  ifdef AMREX_USE_FLOAT
-        fftwf_execute( fft_plan.m_plan );
-#  else
-        fftw_execute( fft_plan.m_plan );
-#  endif
+void AnyFFT::cleanup () {
+#if defined(AMREX_USE_OMP) && defined(HIPACE_FFTW_OMP)
+    if constexpr (use_float) {
+        fftwf_cleanup_threads();
+    } else {
+        fftw_cleanup_threads();
     }
+#endif
 }
diff --git a/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp b/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp
deleted file mode 100644
index ae96334494..0000000000
--- a/src/fields/fft_poisson_solver/fft/WrapRocDST.cpp
+++ /dev/null
@@ -1,421 +0,0 @@
-/* Copyright 2021-2022
- *
- * This file is part of HiPACE++.
- *
- * Authors: AlexanderSinn, Axel Huebl, MaxThevenet, Severin Diederichs
- *
- * License: BSD-3-Clause-LBNL
- */
-#include "AnyDST.H"
-#include "RocFFTUtils.H"
-#include "utils/GPUUtil.H"
-#include "utils/HipaceProfilerWrapper.H"
-#include "utils/Parser.H"
-
-#include <AMReX_Config.H>
-
-namespace AnyDST
-{
-    // see WrapCuDST for documentation
-    void ExpandR2R (amrex::FArrayBox& dst, amrex::FArrayBox& src)
-    {
-        constexpr int scomp = 0;
-        constexpr int dcomp = 0;
-
-        const amrex::Box bx = src.box();
-        const int nx = bx.length(0);
-        const int ny = bx.length(1);
-        const amrex::IntVect lo = bx.smallEnd();
-        Array2<amrex::Real const> const src_array = src.const_array(scomp);
-        Array2<amrex::Real> const dst_array = dst.array(dcomp);
-
-        amrex::ParallelFor(
-            bx,
-            [=] AMREX_GPU_DEVICE(int i, int j, int)
-            {
-                /* upper left quadrant */
-                dst_array(i+1,j+1) = src_array(i, j);
-                /* lower left quadrant */
-                dst_array(i+1,j+ny+2) = -src_array(i, ny-1-j+2*lo[1]);
-                /* upper right quadrant */
-                dst_array(i+nx+2,j+1) = -src_array(nx-1-i+2*lo[0], j);
-                /* lower right quadrant */
-                dst_array(i+nx+2,j+ny+2) = src_array(nx-1-i+2*lo[0], ny-1-j+2*lo[1]);
-            }
-            );
-    }
-
-    // see WrapCuDST for documentation
-    void ShrinkC2R (amrex::FArrayBox& dst, amrex::BaseFab<amrex::GpuComplex<amrex::Real>>& src)
-    {
-        constexpr int scomp = 0;
-        constexpr int dcomp = 0;
-
-        const amrex::Box bx = dst.box();
-        Array2<amrex::GpuComplex<amrex::Real> const> const src_array = src.const_array(scomp);
-        Array2<amrex::Real> const dst_array = dst.array(dcomp);
-        amrex::ParallelFor(
-            bx,
-            [=] AMREX_GPU_DEVICE(int i, int j, int)
-            {
-                /* upper left quadrant */
-                dst_array(i,j) = -src_array(i+1, j+1).real();
-            }
-            );
-    }
-
-    // see WrapCuDST for documentation
-    void ToComplex (const amrex::Real* const AMREX_RESTRICT in,
-                    amrex::GpuComplex<amrex::Real>* const AMREX_RESTRICT out,
-                    const int n_data, const int n_batch)
-    {
-        const int n_half = (n_data+1)/2;
-        if((n_data%2 == 1)) {
-            amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
-                [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-                {
-                    const int stride_in = n_data*j;
-                    const int i_is_zero = (i==0);
-                    const int i_is_n_half = (i==n_half);
-                    const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
-                                             +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half);
-                    const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in]
-                                             *!i_is_zero*!i_is_n_half;
-                    out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
-                });
-        } else {
-            amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
-                [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-                {
-                    const int stride_in = n_data*j;
-                    const int i_is_zero = (i==0);
-                    const int i_is_n_half = (i==n_half);
-                    const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
-                                             +in[2*i-i_is_n_half+stride_in]*!i_is_n_half;
-                    const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero;
-                    out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
-                });
-        }
-    }
-
-    // see WrapCuDST for documentation
-    void C2Rfft (AnyFFT::VendorFFTPlan& plan, amrex::GpuComplex<amrex::Real>* AMREX_RESTRICT in,
-                 amrex::Real* const AMREX_RESTRICT out, rocfft_execution_info execinfo)
-    {
-        rocfft_status result;
-
-        void* in_arr[2] = {in, nullptr};
-        void* out_arr[2] = {out, nullptr};
-        result = rocfft_execute(plan, in_arr, out_arr, execinfo);
-
-        RocFFTUtils::assert_rocfft_status("rocfft_execute", result);
-    }
-
-    // see WrapCuDST for documentation
-    void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out,
-                 const int n_data, const int n_batch)
-    {
-        using namespace amrex::literals;
-
-        const amrex::Real n_1_real = n_data+1._rt;
-        const int n_1 = n_data+1;
-        amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}},
-            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-            {
-                const int i_rev = n_1-i;
-                const int stride_in = n_1*j;
-                const int stride_out = n_data*j;
-                const amrex::Real in_a = in[i+stride_in];
-                const amrex::Real in_b = in[i_rev+stride_in];
-#ifdef AMREX_USE_FLOAT
-                out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpif(i/n_1_real)));
-                out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt
-                                          *sinpif(i_rev/n_1_real)));
-#else
-                out[i-1+stride_out] = 0.5_rt*(in_b-in_a+(in_a+in_b)/(2._rt*sinpi(i/n_1_real)));
-                out[i_rev-1+stride_out] = 0.5_rt*(in_a-in_b+(in_a+in_b)/(2._rt
-                                          *sinpi(i_rev/n_1_real)));
-#endif
-            });
-    }
-
-    // see WrapCuDST for documentation
-    void Transpose (const amrex::Real* const AMREX_RESTRICT in,
-                    amrex::Real* const AMREX_RESTRICT out,
-                    const int n_data, const int n_batch)
-    {
-        constexpr int tile_dim = 32; //must be power of 2
-        constexpr int block_rows = 8;
-        const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim;
-        const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim;
-        amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows,
-            tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(),
-            [=] AMREX_GPU_DEVICE() noexcept
-            {
-                amrex::Gpu::SharedMemory<amrex::Real> gsm;
-                amrex::Real* const tile = gsm.dataPtr();
-
-                const int thread_x = threadIdx.x&(tile_dim-1);
-                const int thread_y = threadIdx.x/tile_dim;
-                const int block_y = blockIdx.x/num_blocks_x;
-                const int block_x = blockIdx.x - block_y*num_blocks_x;
-                int mat_x = block_x * tile_dim + thread_x;
-                int mat_y = block_y * tile_dim + thread_y;
-
-                for (int i = 0; i < tile_dim; i += block_rows) {
-                    if(mat_x < n_data && (mat_y+i) < n_batch) {
-                        tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x];
-                    }
-                }
-
-                __syncthreads();
-
-                mat_x = block_y * tile_dim + thread_x;
-                mat_y = block_x * tile_dim + thread_y;
-
-                for (int i = 0; i < tile_dim; i += block_rows) {
-                    if(mat_x < n_batch && (mat_y+i) < n_data) {
-                        out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i];
-                    }
-                }
-            });
-    }
-
-    DSTplan CreatePlan (const amrex::IntVect& real_size, amrex::FArrayBox* position_array,
-                        amrex::FArrayBox* fourier_array)
-    {
-        HIPACE_PROFILE("AnyDST::CreatePlan()");
-        DSTplan dst_plan;
-
-        amrex::ParmParse pp("hipace");
-        dst_plan.use_small_dst = (std::max(real_size[0], real_size[1]) >= 511);
-        queryWithParser(pp, "use_small_dst", dst_plan.use_small_dst);
-
-        if(!dst_plan.use_small_dst) {
-            const int nx = real_size[0];
-            const int ny = real_size[1];
-            int dim = 2;
-
-            // Allocate expanded_position_array Real of size (2*nx+2, 2*ny+2)
-            // Allocate expanded_fourier_array Complex of size (nx+2, 2*ny+2)
-            amrex::Box expanded_position_box {{0, 0, 0}, {2*nx+1, 2*ny+1, 0}};
-            amrex::Box expanded_fourier_box {{0, 0, 0}, {nx+1, 2*ny+1, 0}};
-            // shift box to match rest of fields
-            expanded_position_box += position_array->box().smallEnd();
-            expanded_fourier_box += fourier_array->box().smallEnd();
-            dst_plan.m_expanded_position_array =
-                std::make_unique<amrex::FArrayBox>(
-                    expanded_position_box, 1);
-            dst_plan.m_expanded_fourier_array =
-                std::make_unique<amrex::BaseFab<amrex::GpuComplex<amrex::Real>>>(
-                    expanded_fourier_box, 1);
-
-            // setting the initial values to 0
-            // we don't set the expanded Fourier array, because it will be initialized by the FFT
-            dst_plan.m_expanded_position_array->setVal<amrex::RunOn::Device>(0.,
-                dst_plan.m_expanded_position_array->box(), 0,
-                dst_plan.m_expanded_position_array->nComp());
-
-            // check for type of expanded size, should be const size_t *
-            const amrex::IntVect& expanded_size = expanded_position_box.length();
-            const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(expanded_size[0]),
-                                                        std::size_t(expanded_size[1]),
-                                                        std::size_t(expanded_size[2]))};
-
-    #ifdef AMREX_USE_FLOAT
-            rocfft_precision precision = rocfft_precision_single;
-    #else
-            rocfft_precision precision = rocfft_precision_double;
-    #endif
-
-            // Initialize fft_plan.m_plan with the vendor fft plan.
-            rocfft_status result;
-            result = rocfft_plan_create(&(dst_plan.m_plan),
-                                        rocfft_placement_notinplace,
-                                        rocfft_transform_type_real_forward,
-                                        precision,
-                                        dim,
-                                        lengths,
-                                        1,
-                                        nullptr);
-
-            RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result);
-
-            std::size_t buffersize = 0;
-            result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan, &buffersize);
-            RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result);
-
-            result = rocfft_execution_info_create(&(dst_plan.m_execinfo));
-            RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result);
-
-            dst_plan.m_buffer = amrex::The_Arena()->alloc(buffersize);
-            result = rocfft_execution_info_set_work_buffer(dst_plan.m_execinfo, dst_plan.m_buffer,
-                                                           buffersize);
-            RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result);
-
-            result = rocfft_execution_info_set_stream(dst_plan.m_execinfo, amrex::Gpu::gpuStream());
-            RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result);
-
-            std::size_t mb = 1024*1024;
-
-            amrex::Print() << "using R2C rocFFT of size " << expanded_size[0] << " * "
-                << expanded_size[1] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n";
-
-            // Store meta-data in dst_plan
-            dst_plan.m_position_array = position_array;
-            dst_plan.m_fourier_array = fourier_array;
-
-            return dst_plan;
-        }
-        else {
-            const int nx = real_size[0]; // contiguous
-            const int ny = real_size[1]; // not contiguous
-
-            // Allocate 1d Array for 2d data or 2d transpose data
-            const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx);
-            const int complex_1d_size = std::max(((nx+1)/2+1)*ny, ((ny+1)/2+1)*nx);
-            amrex::Box real_box {{0, 0, 0}, {real_1d_size-1, 0, 0}};
-            amrex::Box complex_box {{0, 0, 0}, {complex_1d_size-1, 0, 0}};
-            dst_plan.m_expanded_position_array =
-                std::make_unique<amrex::FArrayBox>(
-                    real_box, 1);
-            dst_plan.m_expanded_fourier_array =
-                std::make_unique<amrex::BaseFab<amrex::GpuComplex<amrex::Real>>>(
-                    complex_box, 1);
-
-#ifdef AMREX_USE_FLOAT
-            rocfft_precision precision = rocfft_precision_single;
-#else
-            rocfft_precision precision = rocfft_precision_double;
-#endif
-
-            rocfft_status result;
-
-            const std::size_t s_1[3] = {std::size_t(nx+1) ,0u ,0u};
-
-            result = rocfft_plan_create(&(dst_plan.m_plan),
-                                        rocfft_placement_notinplace,
-                                        rocfft_transform_type_real_inverse,
-                                        precision,
-                                        1,
-                                        s_1,
-                                        ny,
-                                        nullptr);
-
-            RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result);
-
-            const std::size_t s_2[3] = {std::size_t(ny+1) ,0u ,0u};
-
-            result = rocfft_plan_create(&(dst_plan.m_plan_b),
-                                        rocfft_placement_notinplace,
-                                        rocfft_transform_type_real_inverse,
-                                        precision,
-                                        1,
-                                        s_2,
-                                        nx,
-                                        nullptr);
-
-            RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result);
-
-            std::size_t work_size = 0;
-            std::size_t work_size_b = 0;
-
-            result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan, &work_size);
-            RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result);
-
-            result = rocfft_plan_get_work_buffer_size(dst_plan.m_plan_b, &work_size_b);
-            RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result);
-
-            result = rocfft_execution_info_create(&(dst_plan.m_execinfo));
-            RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result);
-
-            std::size_t buffersize = std::max(work_size, work_size_b);
-            dst_plan.m_buffer = amrex::The_Arena()->alloc(buffersize);
-
-            result = rocfft_execution_info_set_work_buffer(dst_plan.m_execinfo, dst_plan.m_buffer,
-                                                           buffersize);
-            RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result);
-
-            result = rocfft_execution_info_set_stream(dst_plan.m_execinfo, amrex::Gpu::gpuStream());
-            RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result);
-
-            std::size_t mb = 1024*1024;
-
-            amrex::Print() << "using C2R rocFFT of sizes " << s_1[0] << " and "
-                << s_2[0] << " with " << (buffersize+mb-1)/mb << " MiB of work area\n";
-
-            // Store meta-data in dst_plan
-            dst_plan.m_position_array = position_array;
-            dst_plan.m_fourier_array = fourier_array;
-
-            return dst_plan;
-        }
-    }
-
-    void DestroyPlan (DSTplan& dst_plan)
-    {
-        rocfft_plan_destroy( dst_plan.m_plan );
-        rocfft_plan_destroy( dst_plan.m_plan_b );
-
-        amrex::The_Arena()->free(dst_plan.m_buffer);
-        rocfft_execution_info_destroy(dst_plan.m_execinfo);
-    }
-
-    void Execute (DSTplan& dst_plan, direction d){
-        HIPACE_PROFILE("AnyDST::Execute()");
-
-        if(!dst_plan.use_small_dst) {
-            // Swap position and fourier space based on execute direction
-            amrex::FArrayBox* position_array =
-                (d == direction::forward) ? dst_plan.m_position_array : dst_plan.m_fourier_array;
-            amrex::FArrayBox* fourier_array =
-                (d == direction::forward) ? dst_plan.m_fourier_array : dst_plan.m_position_array;
-
-            // Expand in position space m_position_array -> m_expanded_position_array
-            ExpandR2R(*dst_plan.m_expanded_position_array, *position_array);
-
-            rocfft_status result;
-
-            // R2C FFT m_expanded_position_array -> m_expanded_fourier_array
-            void* in[2] = {dst_plan.m_expanded_position_array->dataPtr(), nullptr};
-            void* out[2] = {dst_plan.m_expanded_fourier_array->dataPtr(), nullptr};
-            result = rocfft_execute(dst_plan.m_plan, in, out, dst_plan.m_execinfo);
-
-            RocFFTUtils::assert_rocfft_status("rocfft_execute", result);
-
-            // Shrink in Fourier space m_expanded_fourier_array -> m_fourier_array
-            ShrinkC2R(*fourier_array, *dst_plan.m_expanded_fourier_array);
-        }
-        else {
-            const int nx = dst_plan.m_position_array->box().length(0); // initially contiguous
-            const int ny = dst_plan.m_position_array->box().length(1); // contiguous after transpose
-
-            amrex::Real* const tmp_pos_arr = dst_plan.m_position_array->dataPtr();
-            amrex::Real* const tmp_fourier_arr = dst_plan.m_fourier_array->dataPtr();
-            amrex::GpuComplex<amrex::Real>* comp_arr = dst_plan.m_expanded_fourier_array->dataPtr();
-            amrex::Real* const real_arr = dst_plan.m_expanded_position_array->dataPtr();
-
-            // Swap position and fourier space based on execute direction
-            amrex::Real* const pos_arr =
-                (d == direction::forward) ? tmp_pos_arr : tmp_fourier_arr;
-            amrex::Real* const fourier_arr =
-                (d == direction::forward) ? tmp_fourier_arr : tmp_pos_arr;
-
-            ToComplex(pos_arr, comp_arr, nx, ny);
-
-            C2Rfft(dst_plan.m_plan, comp_arr, real_arr, dst_plan.m_execinfo);
-
-            ToSine(real_arr, pos_arr, nx, ny);
-
-            Transpose(pos_arr, fourier_arr, nx, ny);
-
-            ToComplex(fourier_arr, comp_arr, ny, nx);
-
-            C2Rfft(dst_plan.m_plan_b, comp_arr, real_arr, dst_plan.m_execinfo);
-
-            ToSine(real_arr, pos_arr, ny, nx);
-
-            Transpose(pos_arr, fourier_arr, ny, nx);
-        }
-    }
-}
diff --git a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp
index 0ba7a2c7db..18cd7ca0d9 100644
--- a/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp
+++ b/src/fields/fft_poisson_solver/fft/WrapRocFFT.cpp
@@ -1,8 +1,8 @@
-/* Copyright 2021
+/* Copyright 2021-2024
  *
  * This file is part of HiPACE++.
  *
- * Authors: Axel Huebl
+ * Authors: AlexanderSinn, Axel Huebl
  * License: BSD-3-Clause-LBNL
  */
 /* Copyright 2019-2020
@@ -11,92 +11,185 @@
  *
  * License: BSD-3-Clause-LBNL
  */
-
 #include "AnyFFT.H"
-#include "RocFFTUtils.H"
-#include "utils/HipaceProfilerWrapper.H"
-
-namespace AnyFFT
-{
-    FFTplan CreatePlan (const amrex::IntVect& real_size, amrex::Real * const real_array,
-                        Complex * const complex_array, const direction dir)
-    {
-        HIPACE_PROFILE("AnyFFT::CreatePlan()");
-        const int dim = 2;
-        FFTplan fft_plan;
-
-        const std::size_t lengths[] = {AMREX_D_DECL(std::size_t(real_size[0]),
-                                                    std::size_t(real_size[1]),
-                                                    std::size_t(real_size[2]))};
-        // Initialize fft_plan.m_plan with the vendor fft plan.
-        rocfft_status result = rocfft_plan_create(&(fft_plan.m_plan),
-                                                  rocfft_placement_notinplace,
-                                                  (dir == direction::R2C)
-                                                      ? rocfft_transform_type_real_forward
-                                                      : rocfft_transform_type_real_inverse,
+
+#include <AMReX.H>
+#include <AMReX_GpuDevice.H>
+
+#if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
+#include <rocfft/rocfft.h>
+#else
+#include <rocfft.h>
+#endif
+
+#include <string>
+
 #ifdef AMREX_USE_FLOAT
-                                                  rocfft_precision_single,
+static constexpr bool use_float = true;
 #else
-                                                  rocfft_precision_double,
+static constexpr bool use_float = false;
 #endif
-                                                  dim, lengths,
-                                                  1, // number of transforms,
-                                                  nullptr);
-        RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result);
-
-        // Store meta-data in fft_plan
-        fft_plan.m_real_array = real_array;
-        fft_plan.m_complex_array = complex_array;
-        fft_plan.m_dir = dir;
-        //fft_plan.m_dim = dim;
-
-        return fft_plan;
+
+struct VendorPlan {
+    rocfft_plan m_rocfftplan;
+    rocfft_execution_info m_execinfo;
+    std::size_t m_work_size = 0;
+    void* m_in;
+    void* m_out;
+};
+
+std::string rocfftErrorToString (const rocfft_status& err) {
+    if              (err == rocfft_status_success) {
+        return std::string("rocfft_status_success");
+    } else if       (err == rocfft_status_failure) {
+        return std::string("rocfft_status_failure");
+    } else if       (err == rocfft_status_invalid_arg_value) {
+        return std::string("rocfft_status_invalid_arg_value");
+    } else if       (err == rocfft_status_invalid_dimensions) {
+        return std::string("rocfft_status_invalid_dimensions");
+    } else if       (err == rocfft_status_invalid_array_type) {
+        return std::string("rocfft_status_invalid_array_type");
+    } else if       (err == rocfft_status_invalid_strides) {
+        return std::string("rocfft_status_invalid_strides");
+    } else if       (err == rocfft_status_invalid_distance) {
+        return std::string("rocfft_status_invalid_distance");
+    } else if       (err == rocfft_status_invalid_offset) {
+        return std::string("rocfft_status_invalid_offset");
+    } else if       (err == rocfft_status_invalid_work_buffer) {
+        return std::string("rocfft_status_invalid_work_buffer");
+    }else {
+        return std::to_string(err) + " (unknown error code)";
+    }
+}
+
+void assert_rocfft_status (std::string const& name, const rocfft_status& status) {
+    if (status != rocfft_status_success) {
+        amrex::Abort(name + " failed! Error: " + rocfftErrorToString(status));
+    }
+}
+
+std::size_t AnyFFT::Initialize (FFTType type, int nx, int ny) {
+    // https://rocm.docs.amd.com/projects/rocFFT/en/latest/reference/allapi.html#
+    m_plan = new VendorPlan;
+
+    rocfft_transform_type transform_type;
+    std::size_t dimensions = 0;
+    // lengths is in FORTRAN order
+    std::size_t lengths[2] = {0, 0};
+    std::size_t number_of_transforms = 0;
+
+    switch (type) {
+        case FFTType::C2C_2D_fwd:
+            transform_type = rocfft_transform_type_complex_forward;
+            dimensions = 2;
+            lengths[0] = nx;
+            lengths[1] = ny;
+            number_of_transforms = 1;
+            break;
+        case FFTType::C2C_2D_bkw:
+            transform_type = rocfft_transform_type_complex_inverse;
+            dimensions = 2;
+            lengths[0] = nx;
+            lengths[1] = ny;
+            number_of_transforms = 1;
+            break;
+        case FFTType::C2R_2D:
+            transform_type = rocfft_transform_type_real_inverse;
+            dimensions = 2;
+            lengths[0] = nx;
+            lengths[1] = ny;
+            number_of_transforms = 1;
+            break;
+        case FFTType::R2C_2D:
+            transform_type = rocfft_transform_type_real_forward;
+            dimensions = 2;
+            lengths[0] = nx;
+            lengths[1] = ny;
+            number_of_transforms = 1;
+            break;
+        case FFTType::R2R_2D:
+            amrex::Abort("R2R FFT not supported by rocfft");
+            return 0;
+        case FFTType::C2R_1D_batched:
+            transform_type = rocfft_transform_type_real_inverse;
+            dimensions = 1;
+            lengths[0] = nx;
+            number_of_transforms = ny;
+            break;
     }
 
-    void DestroyPlan (FFTplan& fft_plan)
-    {
-        rocfft_plan_destroy( fft_plan.m_plan );
+    rocfft_status status;
+
+    status = rocfft_plan_create(
+        &(m_plan->m_rocfftplan),
+        rocfft_placement_notinplace,
+        transform_type,
+        use_float ? rocfft_precision_single : rocfft_precision_double,
+        dimensions,
+        lengths,
+        number_of_transforms,
+        nullptr);
+    assert_rocfft_status("rocfft_plan_create", status);
+
+    status = rocfft_plan_get_work_buffer_size(m_plan->m_rocfftplan, &(m_plan->m_work_size));
+    assert_rocfft_status("rocfft_plan_get_work_buffer_size", status);
+
+    return m_plan->m_work_size;
+}
+
+void AnyFFT::SetBuffers (void* in, void* out, void* work_area) {
+    m_plan->m_in = in;
+    m_plan->m_out = out;
+
+    rocfft_status status;
+
+    status = rocfft_execution_info_create(&(m_plan->m_execinfo));
+    assert_rocfft_status("rocfft_execution_info_create", status);
+
+    if (m_plan->m_work_size > 0) {
+        status = rocfft_execution_info_set_work_buffer(
+            m_plan->m_execinfo, work_area, m_plan->m_work_size);
+        assert_rocfft_status("rocfft_execution_info_set_work_buffer", status);
     }
 
-    void Execute (FFTplan& fft_plan)
-    {
-        HIPACE_PROFILE("AnyFFT::Execute()");
-        rocfft_execution_info execinfo = NULL;
-        rocfft_status result = rocfft_execution_info_create(&execinfo);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result);
-
-        std::size_t buffersize = 0;
-        result = rocfft_plan_get_work_buffer_size(fft_plan.m_plan, &buffersize);
-        RocFFTUtils::assert_rocfft_status("rocfft_plan_get_work_buffer_size", result);
-
-        void* buffer = amrex::The_Arena()->alloc(buffersize);
-        result = rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_work_buffer", result);
-
-        result = rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream());
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result);
-
-        if (fft_plan.m_dir == direction::R2C) {
-            result = rocfft_execute(fft_plan.m_plan,
-                                    (void**)&(fft_plan.m_real_array), // in
-                                    (void**)&(fft_plan.m_complex_array), // out
-                                    execinfo);
-        } else if (fft_plan.m_dir == direction::C2R) {
-            result = rocfft_execute(fft_plan.m_plan,
-                                    (void**)&(fft_plan.m_complex_array), // in
-                                    (void**)&(fft_plan.m_real_array), // out
-                                    execinfo);
-        } else {
-            amrex::Abort("direction must be AnyFFT::direction::R2C or AnyFFT::direction::C2R");
-        }
-
-        RocFFTUtils::assert_rocfft_status("rocfft_execute", result);
-
-        amrex::Gpu::streamSynchronize();
-
-        amrex::The_Arena()->free(buffer);
-
-        result = rocfft_execution_info_destroy(execinfo);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result);
+    status = rocfft_execution_info_set_stream(m_plan->m_execinfo, amrex::Gpu::gpuStream());
+    assert_rocfft_status("rocfft_execution_info_set_stream", status);
+}
+
+void AnyFFT::Execute () {
+    rocfft_status status;
+
+    void* in_arr[2] = {m_plan->m_in, nullptr};
+    void* out_arr[2] = {m_plan->m_out, nullptr};
+
+    status = rocfft_execute(m_plan->m_rocfftplan, in_arr, out_arr, m_plan->m_execinfo);
+    assert_rocfft_status("rocfft_execute", status);
+}
+
+AnyFFT::~AnyFFT () {
+    if (m_plan) {
+        rocfft_status status;
+
+        status = rocfft_execution_info_destroy(m_plan->m_execinfo);
+        assert_rocfft_status("rocfft_execution_info_destroy", status);
+
+        status = rocfft_plan_destroy(m_plan->m_rocfftplan);
+        assert_rocfft_status("rocfft_plan_destroy", status);
+
+        delete m_plan;
     }
 }
+
+void AnyFFT::setup () {
+    rocfft_status status;
+
+    status = rocfft_setup();
+    assert_rocfft_status("rocfft_setup", status);
+}
+
+void AnyFFT::cleanup () {
+    rocfft_status status;
+
+    status = rocfft_cleanup();
+    assert_rocfft_status("rocfft_cleanup", status);
+}
diff --git a/src/laser/MultiLaser.H b/src/laser/MultiLaser.H
index e0619d331a..e8d516ec1b 100644
--- a/src/laser/MultiLaser.H
+++ b/src/laser/MultiLaser.H
@@ -13,59 +13,13 @@
 #include "Laser.H"
 #include "fields/Fields.H"
 #include "mg_solver/HpMultiGrid.H"
+#include "fields/fft_poisson_solver/fft/AnyFFT.H"
 
 #include <AMReX_MultiFab.H>
 #include <AMReX_Vector.H>
 #include <AMReX_AmrCore.H>
 #include <AMReX_GpuComplex.H>
 
-#ifdef AMREX_USE_CUDA
-#  include <cufft.h>
-#elif defined(AMREX_USE_HIP)
-#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
-#    include <rocfft/rocfft.h>
-#  else
-#    include <rocfft.h>
-#  endif
-#else
-#  include <fftw3.h>
-#endif
-
-namespace LaserFFT {
-#ifdef AMREX_USE_CUDA
-    using VendorFFT = cufftHandle;
-    const auto VendorCreate = cufftPlan2d;
-    const auto VendorDestroy = cufftDestroy;
-#  ifdef AMREX_USE_FLOAT
-    const auto VendorExecute = cufftExecC2C;
-    const auto cufft_type = CUFFT_C2C;
-    using cufftComplex = cuComplex;
-#  else
-    const auto VendorExecute = cufftExecZ2Z;
-    const auto cufft_type = CUFFT_Z2Z;
-    using cufftComplex = cuDoubleComplex;
-#  endif
-#elif defined(AMREX_USE_HIP)
-    using VendorFFT = rocfft_plan;
-    const auto VendorDestroy = rocfft_plan_destroy;
-// TODO
-#else
-#  ifdef AMREX_USE_FLOAT
-    using VendorFFT = fftwf_plan;
-    using FFTWComplex = fftwf_complex;
-    const auto VendorCreate = fftwf_plan_dft_2d;
-    const auto VendorExecute = fftwf_execute;
-    const auto VendorDestroy = fftwf_destroy_plan;
-#  else
-    using VendorFFT = fftw_plan;
-    using FFTWComplex = fftw_complex;
-    const auto VendorCreate = fftw_plan_dft_2d;
-    const auto VendorExecute = fftw_execute;
-    const auto VendorDestroy = fftw_destroy_plan;
-#  endif
-#endif
-}
-
 /** \brief describes which slice with respect to the currently calculated is used */
 namespace WhichLaserSlice {
     // n stands for the time step, j for the longitudinal slice.
@@ -108,15 +62,6 @@ public:
         ReadParameters();
     }
 
-    ~MultiLaser ()
-    {
-        if (!m_use_laser) return;
-        if (m_solver_type == "fft") {
-            LaserFFT::VendorDestroy( m_plan_fwd );
-            LaserFFT::VendorDestroy( m_plan_bkw );
-        }
-    }
-
     void ReadParameters ();
 
     /** get function for the 2D slices */
@@ -268,26 +213,18 @@ private:
     /** hpmg solver for the envelope solver */
     std::unique_ptr<hpmg::MultiGrid> m_mg;
 
-    // Elements for the FFT-based laser envelope solver
-    // This could belong to AnyFFT etc., with 2 caveats:
-    // - This solver operates on a FArrayBox instead of a MultiFab, which is more adequate
-    // - The array in position space must be Complex rather than real, which takes up quite some
-    //   rewriting, see https://github.com/MaxThevenet/hipace/tree/laser_solve,
-    // not sure what the best way to proceed is.
     /** FFTW plan for forward C2C transform to solve Complex Poisson equation */
-    LaserFFT::VendorFFT m_plan_fwd;
+    AnyFFT m_forward_fft;
     /** FFTW plan for backward C2C transform to solve Complex Poisson equation */
-    LaserFFT::VendorFFT m_plan_bkw;
+    AnyFFT m_backward_fft;
+    /** work area for both FFT plans */
+    amrex::Gpu::DeviceVector<char> m_fft_work_area;
     /** Complex FAB to store the solution (e.g. laser envelope on current slice) */
     SpectralFieldLoc m_sol;
     /** Complex FAB to store the RHS in position space */
     SpectralFieldLoc m_rhs;
     /** Complex FAB to store the RHS in Fourier space */
     SpectralFieldLoc m_rhs_fourier;
-#ifdef AMREX_USE_CUDA
-    cufftResult m_result_fwd;
-    cufftResult m_result_bkw;
-#endif
 
     // Data for in-situ diagnostics:
     /** Number of real laser properties for in-situ per-slice reduced diagnostics. */
diff --git a/src/laser/MultiLaser.cpp b/src/laser/MultiLaser.cpp
index 2d2b008673..ba77cbcdc5 100644
--- a/src/laser/MultiLaser.cpp
+++ b/src/laser/MultiLaser.cpp
@@ -13,11 +13,7 @@
 #include "utils/HipaceProfilerWrapper.H"
 #include "utils/DeprecatedInput.H"
 #include "utils/InsituUtil.H"
-#ifdef AMREX_USE_CUDA
-#  include "fields/fft_poisson_solver/fft/CuFFTUtils.H"
-#elif defined(AMREX_USE_HIP)
-#  include "fields/fft_poisson_solver/fft/RocFFTUtils.H"
-#endif
+#include "fields/fft_poisson_solver/fft/AnyFFT.H"
 #include "particles/particles_utils/ShapeFactors.H"
 #ifdef HIPACE_USE_OPENPMD
 #   include <openPMD/auxiliary/Filesystem.hpp>
@@ -25,18 +21,6 @@
 
 #include <AMReX_GpuComplex.H>
 
-#ifdef AMREX_USE_CUDA
-#  include <cufft.h>
-#elif defined(AMREX_USE_HIP)
-#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
-#    include <rocfft/rocfft.h>
-#  else
-#    include <rocfft.h>
-#  endif
-#else
-#  include <fftw3.h>
-#endif
-
 void
 MultiLaser::ReadParameters ()
 {
@@ -111,62 +95,14 @@ MultiLaser::InitData (const amrex::BoxArray& slice_ba,
         // Create FFT plans
         amrex::IntVect fft_size = m_slice_box.length();
 
-#ifdef AMREX_USE_CUDA
-        cufftResult result;
-        // Forward FFT plan
-        result = LaserFFT::VendorCreate(
-            &(m_plan_fwd), fft_size[1], fft_size[0], LaserFFT::cufft_type);
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " cufftplan failed! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
-        }
-        // Backward FFT plan
-        result = LaserFFT::VendorCreate(
-            &(m_plan_bkw), fft_size[1], fft_size[0], LaserFFT::cufft_type);
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " cufftplan failed! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
-        }
-#elif defined(AMREX_USE_HIP)
-        const std::size_t lengths[] = {
-            static_cast<std::size_t>(fft_size[0]), static_cast<std::size_t>(fft_size[1])
-        };
-
-        rocfft_status result;
-        // Forward FFT plan
-        result = rocfft_plan_create(&(m_plan_fwd), rocfft_placement_notinplace,
-            rocfft_transform_type_complex_forward,
-#ifdef AMREX_USE_FLOAT
-            rocfft_precision_single,
-#else
-            rocfft_precision_double,
-#endif
-            2, lengths, 1, nullptr);
-        RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result);
-        // Backward FFT plan
-        result = rocfft_plan_create(&(m_plan_bkw), rocfft_placement_notinplace,
-            rocfft_transform_type_complex_inverse,
-#ifdef AMREX_USE_FLOAT
-            rocfft_precision_single,
-#else
-            rocfft_precision_double,
-#endif
-            2, lengths, 1, nullptr);
-        RocFFTUtils::assert_rocfft_status("rocfft_plan_create", result);
-#else
-        // Forward FFT plan
-        m_plan_fwd = LaserFFT::VendorCreate(
-            fft_size[1], fft_size[0],
-            reinterpret_cast<LaserFFT::FFTWComplex*>(m_rhs.dataPtr()),
-            reinterpret_cast<LaserFFT::FFTWComplex*>(m_rhs_fourier.dataPtr()),
-            FFTW_FORWARD, FFTW_ESTIMATE);
-        // Backward FFT plan
-        m_plan_bkw = LaserFFT::VendorCreate(
-            fft_size[1], fft_size[0],
-            reinterpret_cast<LaserFFT::FFTWComplex*>(m_rhs_fourier.dataPtr()),
-            reinterpret_cast<LaserFFT::FFTWComplex*>(m_sol.dataPtr()),
-            FFTW_BACKWARD, FFTW_ESTIMATE);
-#endif
+        std::size_t fwd_area = m_forward_fft.Initialize(FFTType::C2C_2D_fwd, fft_size[0], fft_size[1]);
+        std::size_t bkw_area = m_backward_fft.Initialize(FFTType::C2C_2D_bkw, fft_size[0], fft_size[1]);
+
+        // Allocate work area for both FFTs
+        m_fft_work_area.resize(std::max(fwd_area, bkw_area));
+
+        m_forward_fft.SetBuffers(m_rhs.dataPtr(), m_rhs_fourier.dataPtr(), m_fft_work_area.dataPtr());
+        m_backward_fft.SetBuffers(m_rhs_fourier.dataPtr(), m_sol.dataPtr(), m_fft_work_area.dataPtr());
     }
 
     if (m_laser_from_file) {
@@ -922,36 +858,7 @@ MultiLaser::AdvanceSliceFFT (const Fields& fields, const amrex::Real dt, int ste
             });
 
         // Transform rhs to Fourier space
-#ifdef AMREX_USE_CUDA
-        cudaStream_t stream = amrex::Gpu::Device::cudaStream();
-        cufftSetStream(m_plan_fwd, stream);
-        cufftResult result;
-        result = LaserFFT::VendorExecute(
-            m_plan_fwd,
-            reinterpret_cast<LaserFFT::cufftComplex*>( m_rhs.dataPtr() ),
-            reinterpret_cast<LaserFFT::cufftComplex*>( m_rhs_fourier.dataPtr() ),
-            CUFFT_FORWARD);
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " forward transform using cufftExec failed ! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
-        }
-#elif defined(AMREX_USE_HIP)
-        rocfft_execution_info execinfo_fwd = nullptr;
-        rocfft_status result = rocfft_execution_info_create(&execinfo_fwd);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result);
-        result = rocfft_execution_info_set_stream(execinfo_fwd, amrex::Gpu::gpuStream());
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result);
-
-        void* in_buffer_fwd[2] = {(void*)m_rhs.dataPtr(), nullptr};
-        void* out_bufferfwd[2] = {(void*)m_rhs_fourier.dataPtr(), nullptr};
-
-        result = rocfft_execute(m_plan_fwd, in_buffer_fwd, out_bufferfwd, execinfo_fwd);
-        RocFFTUtils::assert_rocfft_status("rocfft_execute", result);
-        result = rocfft_execution_info_destroy(execinfo_fwd);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result);
-#else
-        LaserFFT::VendorExecute( m_plan_fwd );
-#endif
+        m_forward_fft.Execute();
 
         // Multiply by appropriate factors in Fourier space
         amrex::Real dkx = 2.*MathConst::pi/m_laser_geom_3D.ProbLength(0);
@@ -973,35 +880,7 @@ MultiLaser::AdvanceSliceFFT (const Fields& fields, const amrex::Real dt, int ste
             });
 
         // Transform rhs to Fourier space to get solution in sol
-#ifdef AMREX_USE_CUDA
-        cudaStream_t stream_bkw = amrex::Gpu::Device::cudaStream();
-        cufftSetStream ( m_plan_bkw, stream_bkw);
-        result = LaserFFT::VendorExecute(
-            m_plan_bkw,
-            reinterpret_cast<LaserFFT::cufftComplex*>( m_rhs_fourier.dataPtr() ),
-            reinterpret_cast<LaserFFT::cufftComplex*>( m_sol.dataPtr() ),
-            CUFFT_INVERSE);
-        if ( result != CUFFT_SUCCESS ) {
-            amrex::Print() << " forward transform using cufftExec failed ! Error: " <<
-                CuFFTUtils::cufftErrorToString(result) << "\n";
-        }
-#elif defined(AMREX_USE_HIP)
-        rocfft_execution_info execinfo_bkw = nullptr;
-        result = rocfft_execution_info_create(&execinfo_bkw);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_create", result);
-        result = rocfft_execution_info_set_stream(execinfo_bkw, amrex::Gpu::gpuStream());
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_set_stream", result);
-
-        void* in_buffer_bkw[2] = {(void*)m_rhs_fourier.dataPtr(), nullptr};
-        void* out_buffer_bkw[2] = {(void*)m_sol.dataPtr(), nullptr};
-
-        result = rocfft_execute(m_plan_bkw, in_buffer_bkw, out_buffer_bkw, execinfo_bkw);
-        RocFFTUtils::assert_rocfft_status("rocfft_execute", result);
-        result = rocfft_execution_info_destroy(execinfo_bkw);
-        RocFFTUtils::assert_rocfft_status("rocfft_execution_info_destroy", result);
-#else
-        LaserFFT::VendorExecute( m_plan_bkw );
-#endif
+        m_backward_fft.Execute();
 
         // Normalize and store solution in np1j00[0]. Guard cells are filled with 0s.
         amrex::Box grown_bx = bx;