diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 15d3a82419e..f5a28596d16 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES
     factorization/par_ilut_select_kernels.cpp
     factorization/par_ilut_spgeam_kernels.cpp
     factorization/par_ilut_sweep_kernels.cpp
+    matrix/batch_csr_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/dense_kernels.cpp
     matrix/diagonal_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index bb3aac67b55..7583cc72292 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -22,14 +23,6 @@
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
 
-#if defined(GKO_COMPILING_CUDA)
-#include "cuda/base/batch_struct.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/batch_struct.hip.hpp"
-#else
-#error "batch struct def missing"
-#endif
-
 
 namespace gko {
 namespace kernels {
diff --git a/cuda/base/batch_struct.hpp b/common/cuda_hip/base/batch_struct.hpp
similarity index 71%
rename from cuda/base/batch_struct.hpp
rename to common/cuda_hip/base/batch_struct.hpp
index 9f07b6b4532..bc10752975f 100644
--- a/cuda/base/batch_struct.hpp
+++ b/common/cuda_hip/base/batch_struct.hpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_
-#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#ifndef GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
@@ -11,12 +11,13 @@
 
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 
 
 /** @file batch_struct.hpp
@@ -33,10 +34,10 @@ namespace cuda {
  * Generates an immutable uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<const cuda_type<ValueType>>
+inline batch::multi_vector::uniform_batch<const device_type<ValueType>>
 get_batch_struct(const batch::MultiVector<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -46,19 +47,19 @@ get_batch_struct(const batch::MultiVector<ValueType>* const op)
  * Generates a uniform batch struct from a batch of multi-vectors.
  */
 template <typename ValueType>
-inline batch::multi_vector::uniform_batch<cuda_type<ValueType>>
+inline batch::multi_vector::uniform_batch<device_type<ValueType>>
 get_batch_struct(batch::MultiVector<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
 }
 
 
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#endif  // GKO_COMMON_CUDA_HIP_BASE_BATCH_STRUCT_HPP_
diff --git a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp
similarity index 64%
rename from common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc
rename to common/cuda_hip/matrix/batch_csr_kernels.cpp
index 18c9dbcb29a..35dc2c17e03 100644
--- a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_csr_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 
@@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.hpp
similarity index 66%
rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_csr_kernels.hpp
index e041dadaa3e..64611559715 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -21,23 +51,11 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      csr::uniform_batch<
-                                                          const ValueType,
-                                                          IndexType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -71,33 +89,13 @@ __device__ __forceinline__ void advanced_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        csr::uniform_batch<
-                                                            const ValueType,
-                                                            IndexType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -196,3 +194,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp
similarity index 66%
rename from common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
rename to common/cuda_hip/matrix/batch_dense_kernels.cpp
index 8fdb001fd1f..44dad55aa70 100644
--- a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -40,9 +69,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -58,8 +87,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
@@ -75,7 +106,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto mat_ub = get_batch_struct(mat);
     const auto in_out_ub = get_batch_struct(in_out);
-    scale_add_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+    batch_single_kernels::scale_add_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
         alpha_ub, mat_ub, in_out_ub);
 }
 
@@ -92,10 +124,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp
similarity index 72%
rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_dense_kernels.hpp
index f8abf9131a1..e4cd24bbd78 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
@@ -33,22 +63,10 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      dense::uniform_batch<
-                                                          const ValueType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::dense::uniform_batch<const ValueType> mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -94,32 +112,12 @@ __device__ __forceinline__ void advanced_apply(
 }
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        dense::uniform_batch<
-                                                            const ValueType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::dense::uniform_batch<const ValueType> mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -243,3 +241,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp
similarity index 64%
rename from common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
rename to common/cuda_hip/matrix/batch_ell_kernels.cpp
index 7e69b119c85..c56325ab824 100644
--- a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    simple_apply_kernel<<<num_blocks, default_block_size, 0,
-                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+    batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+                                                0, exec->get_stream()>>>(
+        mat_ub, b_ub, x_ub);
 }
 
 
@@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     if (b->get_common_size()[1] > 1) {
         GKO_NOT_IMPLEMENTED;
     }
-    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-                                                  beta_ub, x_ub);
+    batch_single_kernels::advanced_apply_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_scale_vals = col_scale->get_const_data();
     const auto row_scale_vals = row_scale->get_const_data();
     const auto mat_ub = get_batch_struct(input);
-    scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+    batch_single_kernels::
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+            mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto beta_ub = get_batch_struct(beta);
     const auto mat_ub = get_batch_struct(mat);
-    add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(alpha_ub, beta_ub,
-                                                       mat_ub);
+    batch_single_kernels::add_scaled_identity_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADD_SCALED_IDENTITY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp
similarity index 67%
rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_ell_kernels.hpp
index 0a6d1927c96..52826957ddb 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
@@ -28,23 +58,11 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      ell::uniform_batch<
-                                                          const ValueType,
-                                                          IndexType>
-                                                          mat,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              const ValueType>
-                                                              b,
-                                                  const gko::batch::
-                                                      multi_vector::
-                                                          uniform_batch<
-                                                              ValueType>
-                                                              x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+    const gko::batch::matrix::ell::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -84,34 +102,15 @@ __device__ __forceinline__ void advanced_apply(
     }
 }
 
+
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                alpha,
-                                                    const gko::batch::matrix::
-                                                        ell::uniform_batch<
-                                                            const ValueType,
-                                                            IndexType>
-                                                            mat,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                b,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                beta,
-                                                    const gko::batch::
-                                                        multi_vector::
-                                                            uniform_batch<
-                                                                ValueType>
-                                                                x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::ell::uniform_batch<const ValueType, IndexType>
+        mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
     for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
          batch_id += gridDim.x) {
@@ -205,3 +204,9 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_struct.hpp b/common/cuda_hip/matrix/batch_struct.hpp
similarity index 74%
rename from cuda/matrix/batch_struct.hpp
rename to common/cuda_hip/matrix/batch_struct.hpp
index 5845fb2235e..e88eca245bb 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/common/cuda_hip/matrix/batch_struct.hpp
@@ -2,27 +2,31 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
-#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
+#ifndef GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_
+#define GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_
 
 
+#include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/types.hpp"
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 
 
 /** @file batch_struct.hpp
  *
  * Helper functions to generate a batch struct from a batch LinOp,
- * while also shallow-casting to the required CUDA scalar type.
+ * while also shallow-casting to the required device (CUDA or HIP) scalar
+ * type.
  *
  * A specialization is needed for every format of every kind of linear algebra
  * object. These are intended to be called on the host.
@@ -33,11 +37,11 @@ namespace cuda {
  * Generates an immutable uniform batch struct from a batch of csr matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<const cuda_type<ValueType>,
+inline batch::matrix::csr::uniform_batch<const device_type<ValueType>,
                                          const IndexType>
 get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()),
+    return {as_device_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_const_row_ptrs(),
             op->get_num_batch_items(),
@@ -51,10 +55,10 @@ get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
  * Generates a uniform batch struct from a batch of csr matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<cuda_type<ValueType>, IndexType>
+inline batch::matrix::csr::uniform_batch<device_type<ValueType>, IndexType>
 get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_values()),
+    return {as_device_type(op->get_values()),
             op->get_col_idxs(),
             op->get_row_ptrs(),
             op->get_num_batch_items(),
@@ -68,10 +72,10 @@ get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
  * Generates an immutable uniform batch struct from a batch of dense matrices.
  */
 template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<const cuda_type<ValueType>>
+inline batch::matrix::dense::uniform_batch<const device_type<ValueType>>
 get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_const_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -82,10 +86,10 @@ get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
  * Generates a uniform batch struct from a batch of dense matrices.
  */
 template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<cuda_type<ValueType>>
+inline batch::matrix::dense::uniform_batch<device_type<ValueType>>
 get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 {
-    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+    return {as_device_type(op->get_values()), op->get_num_batch_items(),
             static_cast<int32>(op->get_common_size()[1]),
             static_cast<int32>(op->get_common_size()[0]),
             static_cast<int32>(op->get_common_size()[1])};
@@ -96,11 +100,11 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<const cuda_type<ValueType>,
+inline batch::matrix::ell::uniform_batch<const device_type<ValueType>,
                                          const IndexType>
 get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_const_values()),
+    return {as_device_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
             static_cast<IndexType>(op->get_common_size()[0]),
@@ -114,10 +118,10 @@ get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<cuda_type<ValueType>, IndexType>
+inline batch::matrix::ell::uniform_batch<device_type<ValueType>, IndexType>
 get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 {
-    return {as_cuda_type(op->get_values()),
+    return {as_device_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
             static_cast<IndexType>(op->get_common_size()[0]),
@@ -127,9 +131,9 @@ get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 }
 
 
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
 
-#endif  // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
+#endif  // GKO_COMMON_CUDA_HIP_MATRIX_BATCH_STRUCT_HPP_
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
index c2a53b2e518..d4ce149d394 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -27,8 +27,9 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
-                   static_cast<ValueType>(1.0), r_shared_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
@@ -295,7 +296,8 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // v = A * p_hat
-            simple_apply(mat_entry, p_hat_sh, v_sh);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                simple_apply(mat_entry, p_hat_sh, v_sh);
             __syncthreads();
 
             // alpha = rho_new / < r_hat , v>
@@ -327,7 +329,8 @@ __global__ void apply_kernel(
             __syncthreads();
 
             // t = A * s_hat
-            simple_apply(mat_entry, s_hat_sh, t_sh);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                simple_apply(mat_entry, s_hat_sh, t_sh);
             __syncthreads();
 
             // omega = <t,s> / <t,t>
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
index c95a6b1cf05..4f4b382f552 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -22,8 +22,9 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     // r = b - A*x
-    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
-                   static_cast<ValueType>(1.0), r_shared_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry);
     __syncthreads();
 
     // z = precond * r
@@ -189,7 +190,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
             }
 
             // Ap = A * p
-            simple_apply(mat_entry, p_sh, Ap_sh);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                simple_apply(mat_entry, p_sh, Ap_sh);
             __syncthreads();
 
             // alpha = rho_old / (p' * Ap)
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 73d37eb2ac2..455d3d67a6d 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -16,7 +16,6 @@
 
 #if defined(GKO_COMPILING_CUDA)
 
-#define GKO_DEVICE_NAMESPACE cuda
 #define GKO_KERNEL __device__
 #include "common/cuda_hip/base/types.hpp"
 
@@ -43,7 +42,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #elif defined(GKO_COMPILING_HIP)
 
-#define GKO_DEVICE_NAMESPACE hip
 #define GKO_KERNEL __device__
 #include "common/cuda_hip/base/types.hpp"
 
@@ -70,7 +68,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #elif defined(GKO_COMPILING_DPCPP)
 
-#define GKO_DEVICE_NAMESPACE dpcpp
 #define GKO_KERNEL
 
 
@@ -105,7 +102,6 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #elif defined(GKO_COMPILING_OMP)
 
-#define GKO_DEVICE_NAMESPACE omp
 #define GKO_KERNEL
 
 
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
index 8a142a5224a..599c708b334 100644
--- a/core/solver/batch_dispatch.hpp
+++ b/core/solver/batch_dispatch.hpp
@@ -24,10 +24,10 @@
 #if defined GKO_COMPILING_CUDA
 
 
-#include "cuda/base/batch_struct.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/log/batch_logger.cuh"
-#include "cuda/matrix/batch_struct.hpp"
 #include "cuda/preconditioner/batch_preconditioners.cuh"
 #include "cuda/stop/batch_criteria.cuh"
 
@@ -52,10 +52,10 @@ using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
 #elif defined GKO_COMPILING_HIP
 
 
-#include "hip/base/batch_struct.hip.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/log/batch_logger.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 #include "hip/preconditioner/batch_preconditioners.hip.hpp"
 #include "hip/stop/batch_criteria.hip.hpp"
 
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 3631a65f48d..000cb7b215f 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -16,9 +16,6 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    matrix/batch_csr_kernels.cu
-    matrix/batch_dense_kernels.cu
-    matrix/batch_ell_kernels.cu
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
deleted file mode 100644
index 95b4f85cdfc..00000000000
--- a/cuda/matrix/batch_csr_kernels.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
deleted file mode 100644
index 10148ee242b..00000000000
--- a/cuda/matrix/batch_dense_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
deleted file mode 100644
index 25281cf6f81..00000000000
--- a/cuda/matrix/batch_ell_kernels.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/batch_jacobi_kernels.cu b/cuda/preconditioner/batch_jacobi_kernels.cu
index 716c158ffff..edf052cb649 100644
--- a/cuda/preconditioner/batch_jacobi_kernels.cu
+++ b/cuda/preconditioner/batch_jacobi_kernels.cu
@@ -8,19 +8,19 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/batch_struct.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/matrix/batch_struct.hpp"
 // generated header
 #include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 4d3deb742fe..35d567fd911 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
@@ -19,11 +20,13 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
 
 
 namespace gko {
@@ -31,11 +34,6 @@ namespace kernels {
 namespace cuda {
 
 
-// NOTE: this default block size is not used for the main solver kernel.
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
 /**
  * @brief The batch Bicgstab solver namespace.
  *
@@ -44,9 +42,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 21c3e3d43c4..f26f2d37313 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -18,11 +19,13 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/matrix/batch_struct.hpp"
 
 
 namespace gko {
@@ -30,11 +33,6 @@ namespace kernels {
 namespace cuda {
 
 
-// NOTE: this default block size is not used for the main solver kernel.
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
 /**
  * @brief The batch Cg solver namespace.
  *
@@ -43,9 +41,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/dpcpp/matrix/batch_csr_kernels.dp.cpp b/dpcpp/matrix/batch_csr_kernels.dp.cpp
index 9feb824a3aa..1759a959299 100644
--- a/dpcpp/matrix/batch_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_csr_kernels.dp.cpp
@@ -21,23 +21,16 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
 namespace batch_csr {
 
 
-#include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
-                                        item_ct1);
+                    batch_single_kernels::simple_apply(mat_b, b_b.values,
+                                                       x_b.values, item_ct1);
                 });
     });
 }
@@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto beta_b =
                         batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
-                                          beta_b.values[0], x_b.values,
-                                          item_ct1);
+                    batch_single_kernels::advanced_apply(
+                        alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                        x_b.values, item_ct1);
                 });
     });
 }
@@ -172,9 +165,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                         row_scale_vals + num_rows * group_id;
                     const auto mat_item =
                         batch::matrix::extract_batch_item(mat_ub, group_id);
-                    scale_kernel(mat_item.num_rows, col_scale_b, row_scale_b,
-                                 mat_item.col_idxs, mat_item.row_ptrs,
-                                 mat_item.values, item_ct1);
+                    batch_single_kernels::scale(mat_item.num_rows, col_scale_b,
+                                                row_scale_b, mat_item.col_idxs,
+                                                mat_item.row_ptrs,
+                                                mat_item.values, item_ct1);
                 });
     });
 }
@@ -215,7 +209,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
                         gko::batch::extract_batch_item(beta_ub, group_id);
                     const auto mat_b = gko::batch::matrix::extract_batch_item(
                         mat_ub, group_id);
-                    add_scaled_identity_kernel(
+                    batch_single_kernels::add_scaled_identity(
                         alpha_b.values[0], beta_b.values[0], mat_b, item_ct1);
                 });
     });
diff --git a/dpcpp/matrix/batch_csr_kernels.hpp.inc b/dpcpp/matrix/batch_csr_kernels.hpp
similarity index 67%
rename from dpcpp/matrix/batch_csr_kernels.hpp.inc
rename to dpcpp/matrix/batch_csr_kernels.hpp
index 4379e02d0b7..f51124f81a4 100644
--- a/dpcpp/matrix/batch_csr_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_csr_kernels.hpp
@@ -2,8 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void simple_apply_kernel(
+__dpct_inline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
 {
@@ -23,7 +47,7 @@ __dpct_inline__ void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void advanced_apply_kernel(
+__dpct_inline__ void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, const ValueType beta, ValueType* x,
@@ -45,13 +69,11 @@ __dpct_inline__ void advanced_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void scale_kernel(const int num_rows,
-                                  const ValueType* const col_scale,
-                                  const ValueType* const row_scale,
-                                  const IndexType* const col_idxs,
-                                  const IndexType* const row_ptrs,
-                                  ValueType* const values,
-                                  sycl::nd_item<3>& item_ct1)
+__dpct_inline__ void scale(const int num_rows, const ValueType* const col_scale,
+                           const ValueType* const row_scale,
+                           const IndexType* const col_idxs,
+                           const IndexType* const row_ptrs,
+                           ValueType* const values, sycl::nd_item<3>& item_ct1)
 {
     for (int row = item_ct1.get_local_linear_id(); row < num_rows;
          row += item_ct1.get_local_range().size()) {
@@ -64,7 +86,7 @@ __dpct_inline__ void scale_kernel(const int num_rows,
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void add_scaled_identity_kernel(
+__dpct_inline__ void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::csr::batch_item<ValueType, IndexType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -80,3 +102,9 @@ __dpct_inline__ void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
index a9f6afce0f5..43974589abb 100644
--- a/dpcpp/matrix/batch_dense_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -25,23 +25,16 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
 namespace batch_dense {
 
 
-#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -77,8 +70,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
-                                        item_ct1);
+                    batch_single_kernels::simple_apply(mat_b, b_b.values,
+                                                       x_b.values, item_ct1);
                 });
     });
 }
@@ -129,9 +122,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto beta_b =
                         batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
-                                          beta_b.values[0], x_b.values,
-                                          item_ct1);
+                    batch_single_kernels::advanced_apply(
+                        alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                        x_b.values, item_ct1);
                 });
     });
 }
@@ -174,7 +167,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                         row_scale_vals + num_rows * group_id;
                     auto input_mat =
                         batch::matrix::extract_batch_item(mat_ub, group_id);
-                    scale_kernel(col_scale_b, row_scale_b, input_mat, item_ct1);
+                    batch_single_kernels::scale(col_scale_b, row_scale_b,
+                                                input_mat, item_ct1);
                 });
     });
 }
@@ -204,18 +198,20 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
             sycl_nd_range(grid, block),
-            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                config::warp_size)]] {
-                auto group = item_ct1.get_group();
-                auto group_id = group.get_group_linear_id();
-                const auto alpha_b =
-                    gko::batch::extract_batch_item(alpha_ub, group_id);
-                const auto mat_b =
-                    gko::batch::matrix::extract_batch_item(mat_ub, group_id);
-                const auto in_out_b =
-                    gko::batch::matrix::extract_batch_item(in_out_ub, group_id);
-                scale_add_kernel(alpha_b.values[0], mat_b, in_out_b, item_ct1);
-            });
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto alpha_b =
+                        gko::batch::extract_batch_item(alpha_ub, group_id);
+                    const auto mat_b = gko::batch::matrix::extract_batch_item(
+                        mat_ub, group_id);
+                    const auto in_out_b =
+                        gko::batch::matrix::extract_batch_item(in_out_ub,
+                                                               group_id);
+                    batch_single_kernels::scale_add(alpha_b.values[0], mat_b,
+                                                    in_out_b, item_ct1);
+                });
     });
 }
 
@@ -254,7 +250,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
                         gko::batch::extract_batch_item(beta_ub, group_id);
                     const auto mat_b = gko::batch::matrix::extract_batch_item(
                         mat_ub, group_id);
-                    add_scaled_identity_kernel(
+                    batch_single_kernels::add_scaled_identity(
                         alpha_b.values[0], beta_b.values[0], mat_b, item_ct1);
                 });
     });
diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp
similarity index 84%
rename from dpcpp/matrix/batch_dense_kernels.hpp.inc
rename to dpcpp/matrix/batch_dense_kernels.hpp
index 98282fe253d..acf1e65939d 100644
--- a/dpcpp/matrix/batch_dense_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_dense_kernels.hpp
@@ -2,8 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
-__dpct_inline__ void simple_apply_kernel(
+__dpct_inline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
     const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
 {
@@ -34,7 +58,7 @@ __dpct_inline__ void simple_apply_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void advanced_apply_kernel(
+__dpct_inline__ void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
     const ValueType* b, const ValueType beta, ValueType* x,
@@ -67,7 +91,7 @@ __dpct_inline__ void advanced_apply_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void scale_kernel(
+__dpct_inline__ void scale(
     const ValueType* const col_scale, const ValueType* const row_scale,
     gko::batch::matrix::dense::batch_item<ValueType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -91,7 +115,7 @@ __dpct_inline__ void scale_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void scale_add_kernel(
+__dpct_inline__ void scale_add(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
     const gko::batch::matrix::dense::batch_item<ValueType>& in_out,
@@ -117,7 +141,7 @@ __dpct_inline__ void scale_add_kernel(
 
 
 template <typename ValueType>
-__dpct_inline__ void add_scaled_identity_kernel(
+__dpct_inline__ void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::dense::batch_item<ValueType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -140,3 +164,9 @@ __dpct_inline__ void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index 2cb40dc35eb..d9b819b101e 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -21,23 +21,16 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
 namespace batch_ell {
 
 
-#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -74,8 +67,8 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::matrix::extract_batch_item(mat_ub, group_id);
                     const auto b_b = batch::extract_batch_item(b_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
-                                        item_ct1);
+                    batch_single_kernels::simple_apply(mat_b, b_b.values,
+                                                       x_b.values, item_ct1);
                 });
     });
 }
@@ -127,9 +120,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto beta_b =
                         batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
-                                          beta_b.values[0], x_b.values,
-                                          item_ct1);
+                    batch_single_kernels::advanced_apply(
+                        alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                        x_b.values, item_ct1);
                 });
     });
 }
@@ -171,7 +164,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                         row_scale_vals + num_rows * group_id;
                     auto mat_item =
                         batch::matrix::extract_batch_item(mat_ub, group_id);
-                    scale_kernel(col_scale_b, row_scale_b, mat_item, item_ct1);
+                    batch_single_kernels::scale(col_scale_b, row_scale_b,
+                                                mat_item, item_ct1);
                 });
     });
 }
@@ -212,7 +206,7 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
                         gko::batch::extract_batch_item(beta_ub, group_id);
                     const auto mat_b = gko::batch::matrix::extract_batch_item(
                         mat_ub, group_id);
-                    add_scaled_identity_kernel(
+                    batch_single_kernels::add_scaled_identity(
                         alpha_b.values[0], beta_b.values[0], mat_b, item_ct1);
                 });
     });
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp
similarity index 78%
rename from dpcpp/matrix/batch_ell_kernels.hpp.inc
rename to dpcpp/matrix/batch_ell_kernels.hpp
index 1a809664dca..48ab9318bdf 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp
@@ -2,8 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void simple_apply_kernel(
+__dpct_inline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
 {
@@ -24,7 +48,7 @@ __dpct_inline__ void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void advanced_apply_kernel(
+__dpct_inline__ void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const ValueType* b, const ValueType beta, ValueType* x,
@@ -47,7 +71,7 @@ __dpct_inline__ void advanced_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void scale_kernel(
+__dpct_inline__ void scale(
     const ValueType* const col_scale, const ValueType* const row_scale,
     gko::batch::matrix::ell::batch_item<ValueType, IndexType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -69,7 +93,7 @@ __dpct_inline__ void scale_kernel(
 
 
 template <typename ValueType, typename IndexType>
-__dpct_inline__ void add_scaled_identity_kernel(
+__dpct_inline__ void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::ell::batch_item<ValueType, IndexType>& mat,
     sycl::nd_item<3>& item_ct1)
@@ -89,3 +113,9 @@ __dpct_inline__ void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index 7dc8f3ec23b..291ee1d8a8b 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -23,23 +23,18 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
 
 
-#include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
 #include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
index f5a88e9d59d..de1956c8c6c 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
@@ -33,9 +33,9 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), mat_global_entry,
-                          x_shared_entry, static_cast<ValueType>(1.0),
-                          r_shared_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     if (sg_id == 0) {
@@ -330,7 +330,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // v = A * p_hat
-        simple_apply_kernel(mat_global_entry, p_hat_sh, v_sh, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            mat_global_entry, p_hat_sh, v_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_new / < r_hat , v>
@@ -361,7 +362,8 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // t = A * s_hat
-        simple_apply_kernel(mat_global_entry, s_hat_sh, t_sh, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            mat_global_entry, s_hat_sh, t_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // omega = <t,s> / <t,t>
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index f25d8266803..05b3f7b803c 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -23,23 +23,18 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/reduction.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_csr_kernels.hpp"
+#include "dpcpp/matrix/batch_dense_kernels.hpp"
+#include "dpcpp/matrix/batch_ell_kernels.hpp"
 #include "dpcpp/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
 
 
-#include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
-#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
 #include "dpcpp/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc
index 7a91bcb2bbf..b233b7df680 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp.inc
+++ b/dpcpp/solver/batch_cg_kernels.hpp.inc
@@ -27,9 +27,9 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), mat_global_entry,
-                          x_shared_entry, static_cast<ValueType>(1.0),
-                          r_shared_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), mat_global_entry, x_shared_entry,
+        static_cast<ValueType>(1.0), r_shared_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
 
@@ -207,7 +207,8 @@ __dpct_inline__ void apply_kernel(
             break;
         }
         // Ap = A * p
-        simple_apply_kernel(mat_global_entry, p_sh, Ap_sh, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            mat_global_entry, p_sh, Ap_sh, item_ct1);
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         // alpha = rho_old / (p' * Ap)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 84bba295120..7d914d57a81 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -14,9 +14,6 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    matrix/batch_csr_kernels.hip.cpp
-    matrix/batch_dense_kernels.hip.cpp
-    matrix/batch_ell_kernels.hip.cpp
     ${CSR_INSTANTIATE}
     ${FBCSR_INSTANTIATE}
     preconditioner/batch_jacobi_kernels.hip.cpp
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
deleted file mode 100644
index 3e4cba6a747..00000000000
--- a/hip/base/batch_struct.hip.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
-#define GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "core/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-/** @file batch_struct.hpp
- *
- * Helper functions to generate a batch struct from a batch LinOp,
- * while also shallow-casting to the required Hip scalar type.
- *
- * A specialization is needed for every format of every kind of linear algebra
- * object. These are intended to be called on the host.
- */
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of multi-vectors.
- */
-template <typename ValueType>
-inline batch::multi_vector::uniform_batch<const hip_type<ValueType>>
-get_batch_struct(const batch::MultiVector<ValueType>* const op)
-{
-    return {as_hip_type(op->get_const_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-/**
- * Generates a uniform batch struct from a batch of multi-vectors.
- */
-template <typename ValueType>
-inline batch::multi_vector::uniform_batch<hip_type<ValueType>> get_batch_struct(
-    batch::MultiVector<ValueType>* const op)
-{
-    return {as_hip_type(op->get_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
deleted file mode 100644
index b77b9416505..00000000000
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
deleted file mode 100644
index 67dfd78e264..00000000000
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
deleted file mode 100644
index 68b59c042f1..00000000000
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-#include <thrust/functional.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
deleted file mode 100644
index bb9f7912cd6..00000000000
--- a/hip/matrix/batch_struct.hip.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
-#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
-
-
-#include <ginkgo/core/matrix/batch_dense.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-#include "common/cuda_hip/base/types.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-/** @file batch_struct.hpp
- *
- * Helper functions to generate a batch struct from a batch LinOp,
- * while also shallow-casting to the required HIP scalar type.
- *
- * A specialization is needed for every format of every kind of linear algebra
- * object. These are intended to be called on the host.
- */
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of csr matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<const hip_type<ValueType>,
-                                         const IndexType>
-get_batch_struct(const batch::matrix::Csr<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_const_values()),
-            op->get_const_col_idxs(),
-            op->get_const_row_ptrs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_elements_per_item())};
-}
-
-
-/**
- * Generates a uniform batch struct from a batch of csr matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::csr::uniform_batch<hip_type<ValueType>, IndexType>
-get_batch_struct(batch::matrix::Csr<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_values()),
-            op->get_col_idxs(),
-            op->get_row_ptrs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_elements_per_item())};
-}
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of dense matrices.
- */
-template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<const hip_type<ValueType>>
-get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
-{
-    return {as_hip_type(op->get_const_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-
-/**
- * Generates a uniform batch struct from a batch of dense matrices.
- */
-template <typename ValueType>
-inline batch::matrix::dense::uniform_batch<hip_type<ValueType>>
-get_batch_struct(batch::matrix::Dense<ValueType>* const op)
-{
-    return {as_hip_type(op->get_values()), op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1])};
-}
-
-
-/**
- * Generates an immutable uniform batch struct from a batch of ell matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<const hip_type<ValueType>,
-                                         const IndexType>
-get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_const_values()),
-            op->get_const_col_idxs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
-}
-
-
-/**
- * Generates a uniform batch struct from a batch of ell matrices.
- */
-template <typename ValueType, typename IndexType>
-inline batch::matrix::ell::uniform_batch<hip_type<ValueType>, IndexType>
-get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
-{
-    return {as_hip_type(op->get_values()),
-            op->get_col_idxs(),
-            op->get_num_batch_items(),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[0]),
-            static_cast<IndexType>(op->get_common_size()[1]),
-            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
diff --git a/hip/preconditioner/batch_jacobi_kernels.hip.cpp b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
index e86bc86390a..38a81972e66 100644
--- a/hip/preconditioner/batch_jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/batch_jacobi_kernels.hip.cpp
@@ -8,21 +8,21 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "common/cuda_hip/components/warp_blas.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 // generated header
 #include "common/cuda_hip/preconditioner/jacobi_common.hpp"
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 1c1be8b21f7..a5de10953bc 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -20,11 +21,13 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 
 
 namespace gko {
@@ -32,9 +35,6 @@ namespace kernels {
 namespace hip {
 
 
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
 /**
  * @brief The batch Bicgstab solver namespace.
  *
@@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index c860286c17c..23bb939ead8 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -11,6 +11,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+#include "common/cuda_hip/base/batch_struct.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -20,11 +21,13 @@
 #include "common/cuda_hip/components/reduction.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
 #include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp"
+#include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
 
 
 namespace gko {
@@ -32,9 +35,6 @@ namespace kernels {
 namespace hip {
 
 
-constexpr int default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
 /**
  * @brief The batch Cg solver namespace.
  *
@@ -43,9 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 #include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/omp/matrix/batch_csr_kernels.cpp b/omp/matrix/batch_csr_kernels.cpp
index eacb26c12cb..d4ea6cbd642 100644
--- a/omp/matrix/batch_csr_kernels.cpp
+++ b/omp/matrix/batch_csr_kernels.cpp
@@ -9,26 +9,20 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
 namespace batch_csr {
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -99,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
index 836908260a7..cd4a7f05b4a 100644
--- a/omp/matrix/batch_dense_kernels.cpp
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -9,26 +9,20 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The Dense matrix format namespace.
- * @ref Dense
- * @ingroup batch_dense
- */
 namespace batch_dense {
 
 
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -98,7 +93,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto input_mat =
             input_vals + input->get_num_elements_per_item() * batch_id;
-        scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat);
+        batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b,
+                                    row_scale_b, input_mat);
     }
 }
 
@@ -121,7 +117,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
         const auto input_mat_b =
             batch::matrix::extract_batch_item(in_mat_ub, batch_id);
-        scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b);
+        batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b);
     }
 }
 
@@ -143,7 +139,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 4fb5aeea6fa..8b1239565a1 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -9,26 +9,20 @@
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
 namespace batch_ell {
 
 
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -43,7 +37,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -71,8 +65,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -99,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -122,7 +117,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index c245f284106..661cdbcd2ec 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -10,28 +10,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 55d6ee29321..3a6e31256c2 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -10,28 +10,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace omp {
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 0c226830637..85b8f33e38b 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -66,6 +66,7 @@ target_sources(ginkgo_reference
     stop/residual_norm_kernels.cpp)
 
 target_link_libraries(ginkgo_reference PUBLIC ginkgo_device)
+target_compile_definitions(ginkgo_reference PRIVATE GKO_COMPILING_REFERENCE GKO_DEVICE_NAMESPACE=reference)
 ginkgo_compile_features(ginkgo_reference)
 ginkgo_default_includes(ginkgo_reference)
 ginkgo_install_library(ginkgo_reference)
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index f5e1c653054..d7fbf3ce214 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -10,10 +10,6 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
-
-#define GKO_DEVICE_NAMESPACE reference
-
-
 #include "core/base/batch_struct.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_csr_kernels.cpp b/reference/matrix/batch_csr_kernels.cpp
index 7c6d9a6c000..d3304ab9795 100644
--- a/reference/matrix/batch_csr_kernels.cpp
+++ b/reference/matrix/batch_csr_kernels.cpp
@@ -12,23 +12,16 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
 namespace batch_csr {
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -42,7 +35,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -69,8 +62,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -96,7 +90,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -118,7 +112,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/reference/matrix/batch_csr_kernels.hpp.inc b/reference/matrix/batch_csr_kernels.hpp
similarity index 81%
rename from reference/matrix/batch_csr_kernels.hpp.inc
rename to reference/matrix/batch_csr_kernels.hpp
index 52e511785a0..e04b2bdf345 100644
--- a/reference/matrix/batch_csr_kernels.hpp.inc
+++ b/reference/matrix/batch_csr_kernels.hpp
@@ -2,8 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-inline void simple_apply_kernel(
+inline void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -25,7 +42,7 @@ inline void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-inline void advanced_apply_kernel(
+inline void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
@@ -63,7 +80,7 @@ inline void scale(
 
 
 template <typename ValueType, typename IndexType>
-inline void add_scaled_identity_kernel(
+inline void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::csr::batch_item<ValueType, IndexType>& mat)
 {
@@ -76,3 +93,9 @@ inline void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 2116a691fb9..599af30ecfb 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -12,23 +12,16 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-/**
- * @brief The Dense matrix format namespace.
- * @ref Dense
- * @ingroup batch_dense
- */
 namespace batch_dense {
 
 
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Dense<ValueType>* mat,
@@ -42,7 +35,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -69,8 +62,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -95,7 +89,8 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto input_mat =
             input_vals + input->get_num_elements_per_item() * batch_id;
-        scale(num_rows, num_cols, stride, col_scale_b, row_scale_b, input_mat);
+        batch_single_kernels::scale(num_rows, num_cols, stride, col_scale_b,
+                                    row_scale_b, input_mat);
     }
 }
 
@@ -117,7 +112,7 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
         const auto input_mat_b =
             batch::matrix::extract_batch_item(in_mat_ub, batch_id);
-        scale_add_kernel(alpha_b.values[0], mat_b, input_mat_b);
+        batch_single_kernels::scale_add(alpha_b.values[0], mat_b, input_mat_b);
     }
 }
 
@@ -138,7 +133,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp
similarity index 84%
rename from reference/matrix/batch_dense_kernels.hpp.inc
rename to reference/matrix/batch_dense_kernels.hpp
index a017010a644..e12827c77de 100644
--- a/reference/matrix/batch_dense_kernels.hpp.inc
+++ b/reference/matrix/batch_dense_kernels.hpp
@@ -2,8 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
-inline void simple_apply_kernel(
+inline void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -27,7 +44,7 @@ inline void simple_apply_kernel(
 
 
 template <typename ValueType>
-inline void advanced_apply_kernel(
+inline void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
@@ -75,7 +92,7 @@ inline void scale(const int num_rows, const int num_cols,
 
 
 template <typename ValueType>
-inline void scale_add_kernel(
+inline void scale_add(
     const ValueType alpha,
     const gko::batch::matrix::dense::batch_item<const ValueType>& b,
     const gko::batch::matrix::dense::batch_item<ValueType>& in_out)
@@ -91,7 +108,7 @@ inline void scale_add_kernel(
 
 
 template <typename ValueType>
-inline void add_scaled_identity_kernel(
+inline void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::dense::batch_item<ValueType>& mat)
 {
@@ -105,3 +122,9 @@ inline void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index 0d47f9ea601..1a4855f389f 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -12,23 +12,16 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 #include "reference/matrix/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
 namespace batch_ell {
 
 
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
-
-
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
@@ -42,7 +35,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
         const auto b_item = batch::extract_batch_item(b_ub, batch);
         const auto x_item = batch::extract_batch_item(x_ub, batch);
-        simple_apply_kernel(mat_item, b_item, x_item);
+        batch_single_kernels::simple_apply(mat_item, b_item, x_item);
     }
 }
 
@@ -69,8 +62,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
         const auto x_item = batch::extract_batch_item(x_ub, batch);
         const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
         const auto beta_item = batch::extract_batch_item(beta_ub, batch);
-        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
-                              beta_item.values[0], x_item);
+        batch_single_kernels::advanced_apply(alpha_item.values[0], mat_item,
+                                             b_item, beta_item.values[0],
+                                             x_item);
     }
 }
 
@@ -96,7 +90,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
         const auto row_scale_b = row_scale_vals + num_rows * batch_id;
         const auto mat_item =
             batch::matrix::extract_batch_item(mat_ub, batch_id);
-        scale(col_scale_b, row_scale_b, mat_item);
+        batch_single_kernels::scale(col_scale_b, row_scale_b, mat_item);
     }
 }
 
@@ -118,7 +112,8 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch_id);
         const auto beta_b = batch::extract_batch_item(beta_ub, batch_id);
         const auto mat_b = batch::matrix::extract_batch_item(mat_ub, batch_id);
-        add_scaled_identity_kernel(alpha_b.values[0], beta_b.values[0], mat_b);
+        batch_single_kernels::add_scaled_identity(alpha_b.values[0],
+                                                  beta_b.values[0], mat_b);
     }
 }
 
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp
similarity index 84%
rename from reference/matrix/batch_ell_kernels.hpp.inc
rename to reference/matrix/batch_ell_kernels.hpp
index 7aea0946573..71bd1ce851a 100644
--- a/reference/matrix/batch_ell_kernels.hpp.inc
+++ b/reference/matrix/batch_ell_kernels.hpp
@@ -2,8 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
-inline void simple_apply_kernel(
+inline void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -27,7 +44,7 @@ inline void simple_apply_kernel(
 
 
 template <typename ValueType, typename IndexType>
-inline void advanced_apply_kernel(
+inline void advanced_apply(
     const ValueType alpha,
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
@@ -73,7 +90,7 @@ inline void scale(
 
 
 template <typename ValueType, typename IndexType>
-inline void add_scaled_identity_kernel(
+inline void add_scaled_identity(
     const ValueType alpha, const ValueType beta,
     const gko::batch::matrix::ell::batch_item<ValueType, IndexType>& mat)
 {
@@ -91,3 +108,9 @@ inline void add_scaled_identity_kernel(
         }
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index e68caffa936..33e1e9392d9 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -6,30 +6,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-
-
-/**
- * @brief The batch Bicgstab solver namespace.
- *
- * @ingroup batch_bicgstab
- */
 namespace batch_bicgstab {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_bicgstab_kernels.hpp.inc"
 
 
diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc
index 1f8537ab66d..786e98eb5d1 100644
--- a/reference/solver/batch_bicgstab_kernels.hpp.inc
+++ b/reference/solver/batch_bicgstab_kernels.hpp.inc
@@ -33,9 +33,9 @@ inline void initialize(
         b_entry, r_entry);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
-                          gko::batch::to_const(x_entry),
-                          static_cast<ValueType>(1.0), r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), A_entry, gko::batch::to_const(x_entry),
+        static_cast<ValueType>(1.0), r_entry);
     gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
         compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
                                         res_norms_entry);
@@ -271,8 +271,8 @@ inline void batch_entry_bicgstab_impl(
         prec.apply(gko::batch::to_const(p_entry), p_hat_entry);
 
         // v = A * p_hat
-        simple_apply_kernel(A_entry, gko::batch::to_const(p_hat_entry),
-                            v_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            A_entry, gko::batch::to_const(p_hat_entry), v_entry);
 
         // alpha = rho_new / < r_hat , v>
         compute_alpha(gko::batch::to_const(rho_new_entry),
@@ -303,8 +303,8 @@ inline void batch_entry_bicgstab_impl(
         prec.apply(gko::batch::to_const(s_entry), s_hat_entry);
 
         // t = A * s_hat
-        simple_apply_kernel(A_entry, gko::batch::to_const(s_hat_entry),
-                            t_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            A_entry, gko::batch::to_const(s_hat_entry), t_entry);
         // omega = <t,s> / <t,t>
         compute_omega(gko::batch::to_const(t_entry),
                       gko::batch::to_const(s_entry), temp_entry, omega_entry);
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 785a7a868a2..7c69157d4a7 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -6,30 +6,21 @@
 
 #include "core/solver/batch_dispatch.hpp"
 #include "reference/base/batch_multi_vector_kernels.hpp"
+#include "reference/matrix/batch_csr_kernels.hpp"
+#include "reference/matrix/batch_dense_kernels.hpp"
+#include "reference/matrix/batch_ell_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace reference {
-
-
-/**
- * @brief The batch Cg solver namespace.
- *
- * @ingroup batch_cg
- */
 namespace batch_cg {
-
-
 namespace {
 
 
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/matrix/batch_csr_kernels.hpp.inc"
-#include "reference/matrix/batch_dense_kernels.hpp.inc"
-#include "reference/matrix/batch_ell_kernels.hpp.inc"
 #include "reference/solver/batch_cg_kernels.hpp.inc"
 
 
diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc
index ca88940cd69..991db5c061c 100644
--- a/reference/solver/batch_cg_kernels.hpp.inc
+++ b/reference/solver/batch_cg_kernels.hpp.inc
@@ -34,9 +34,9 @@ inline void initialize(
         b_entry, r_entry);
 
     // r = b - A*x
-    advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
-                          gko::batch::to_const(x_entry),
-                          static_cast<ValueType>(1.0), r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::advanced_apply(
+        static_cast<ValueType>(-1.0), A_entry, gko::batch::to_const(x_entry),
+        static_cast<ValueType>(1.0), r_entry);
 }
 
 
@@ -181,7 +181,8 @@ inline void batch_entry_cg_impl(
                  gko::batch::to_const(z_entry), p_entry);
 
         // Ap = A * p
-        simple_apply_kernel(A_entry, gko::batch::to_const(p_entry), Ap_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::simple_apply(
+            A_entry, gko::batch::to_const(p_entry), Ap_entry);
 
         // temp= rho_old / (p' * Ap)
         // x = x + temp * p