diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index cd5c5b10482..eb2ed35d76d 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -100,16 +100,6 @@ static LB_extern_nodeforcedensity_gpu *extern_node_force_densities = nullptr; /** @brief Force on the boundary nodes */ static float *lb_boundary_force = nullptr; -/** @brief Velocity at the boundary */ -static float *lb_boundary_velocity = nullptr; - -/** @name pointers for bound index array */ -/*@{*/ -static int *boundary_node_list = nullptr; -static int *boundary_index_list = nullptr; -static size_t size_of_boundindex = 0; -/*@}*/ - /** @name pointers for additional cuda check flag */ /*@{*/ static int *gpu_check = nullptr; @@ -1371,9 +1361,27 @@ __device__ __inline__ float three_point_polynomial_larger_than_half(float u) { (5.f + -3 * fabsf(u) - sqrtf(-2.f + 6.f * fabsf(u) - 3.f * u * u)); } +/** + * @brief Get velocity at index. 
+ * + */ +__device__ __inline__ float3 node_velocity(float rho_eq, LB_nodes_gpu n_a, + int index) { + auto const boundary_index = n_a.boundary[index]; + + if (boundary_index) { + auto const &u = n_a.boundary_velocity[index]; + return make_float3(u[0], u[1], u[2]); + } + + auto const rho = rho_eq + calc_mode_x_from_n(n_a, index, 0); + return make_float3(calc_mode_x_from_n(n_a, index, 1) / rho, + calc_mode_x_from_n(n_a, index, 2) / rho, + calc_mode_x_from_n(n_a, index, 3) / rho); +} + __device__ __inline__ float3 velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, - float *lb_boundary_velocity, Utils::Array &node_indices, Utils::Array &delta) { Utils::Array center_node_index{}; @@ -1431,31 +1439,14 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, auto const z = fold_if_necessary(center_node_index[2] - 1 + k, para->dim_z); delta[cnt] = temp_delta[i].x * temp_delta[j].y * temp_delta[k].z; - node_indices[cnt] = xyz_to_index(x, y, z); - auto const boundary_index = n_a.boundary[node_indices[cnt]]; - if (not boundary_index) { - float totmass = 0.0f; - auto const mass_mode = calc_mode_x_from_n(n_a, node_indices[cnt], 0); - - totmass += mass_mode + para->rho; - - auto const j_x = calc_mode_x_from_n(n_a, node_indices[cnt], 1); - auto const j_y = calc_mode_x_from_n(n_a, node_indices[cnt], 2); - auto const j_z = calc_mode_x_from_n(n_a, node_indices[cnt], 3); - interpolated_u.x += (j_x / totmass) * delta[cnt]; - interpolated_u.y += (j_y / totmass) * delta[cnt]; - interpolated_u.z += (j_z / totmass) * delta[cnt]; - } else { - interpolated_u.x += - lb_boundary_velocity[3 * (boundary_index - 1) + 0] * para->tau / - para->agrid * delta[cnt]; - interpolated_u.y += - lb_boundary_velocity[3 * (boundary_index - 1) + 1] * para->tau / - para->agrid * delta[cnt]; - interpolated_u.z += - lb_boundary_velocity[3 * (boundary_index - 1) + 2] * para->tau / - para->agrid * delta[cnt]; - } + auto const index = xyz_to_index(x, y, z); + node_indices[cnt] = index; 
+ + auto const node_u = node_velocity(para->rho, n_a, index); + interpolated_u.x += delta[cnt] * node_u.x; + interpolated_u.y += delta[cnt] * node_u.y; + interpolated_u.z += delta[cnt] * node_u.z; + ++cnt; } } @@ -1469,12 +1460,12 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, * @param[in] particle_position Particle position * @param[out] node_index Node index around (8) particle * @param[out] delta Weighting of particle position - * @param[in] lb_boundary_velocity Velocity at the boundary * @retval Interpolated velocity */ -__device__ __inline__ float3 velocity_interpolation( - LB_nodes_gpu n_a, float *particle_position, float *lb_boundary_velocity, - Utils::Array &node_index, Utils::Array &delta) { +__device__ __inline__ float3 +velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, + Utils::Array &node_index, + Utils::Array &delta) { Utils::Array left_node_index; Utils::Array temp_delta; // see ahlrichs + duenweg page 8227 equ (10) and (11) @@ -1497,9 +1488,9 @@ __device__ __inline__ float3 velocity_interpolation( // modulo for negative numbers is strange at best, shift to make sure we are // positive - int x = (left_node_index[0] + para->dim_x) % para->dim_x; - int y = (left_node_index[1] + para->dim_y) % para->dim_y; - int z = (left_node_index[2] + para->dim_z) % para->dim_z; + int const x = (left_node_index[0] + para->dim_x) % para->dim_x; + int const y = (left_node_index[1] + para->dim_y) % para->dim_y; + int const z = (left_node_index[2] + para->dim_z) % para->dim_z; auto xp1 = x + 1; auto yp1 = y + 1; auto zp1 = z + 1; @@ -1521,30 +1512,10 @@ __device__ __inline__ float3 velocity_interpolation( float3 interpolated_u{0.0f, 0.0f, 0.0f}; #pragma unroll for (int i = 0; i < 8; ++i) { - float totmass = 0.0f; - Utils::Array mode; - - calc_m_from_n(n_a, node_index[i], mode); - auto const mass_mode = calc_mode_x_from_n(n_a, node_index[i], 0); - - totmass += mass_mode + para->rho; - - /* The boolean expression 
(n_a.boundary[node_index[i]] == 0) causes boundary - nodes to couple with velocity 0 to particles. This is necessary, since - boundary nodes undergo the same LB dynamics as fluid nodes do. The flow - within the boundaries does not interact with the physical fluid, since - these populations are overwritten by the bounce back kernel. Particles - close to walls can couple to this unphysical flow, though. - */ - auto const j_x = calc_mode_x_from_n(n_a, node_index[i], 1); - auto const j_y = calc_mode_x_from_n(n_a, node_index[i], 2); - auto const j_z = calc_mode_x_from_n(n_a, node_index[i], 3); - interpolated_u.x += - (j_x / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0); - interpolated_u.y += - (j_y / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0); - interpolated_u.z += - (j_z / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0); + auto const node_u = node_velocity(para->rho, n_a, node_index[i]); + interpolated_u.x += delta[i] * node_u.x; + interpolated_u.y += delta[i] * node_u.y; + interpolated_u.z += delta[i] * node_u.z; } return interpolated_u; } @@ -1562,19 +1533,16 @@ __device__ __inline__ float3 velocity_interpolation( * @param[in] flag_cs Determine if we are at the centre (0, * typical) or at the source (1, swimmer only) * @param[in] friction Friction constant for the particle coupling - * @param[in] lb_boundary_velocity Velocity at the boundary * @tparam no_of_neighbours The number of neighbours to consider for * interpolation */ template -__device__ void -calc_viscous_force(LB_nodes_gpu n_a, - Utils::Array &delta, - CUDA_particle_data *particle_data, float *particle_force, - unsigned int part_index, float *delta_j, - Utils::Array &node_index, - LB_rho_v_gpu *d_v, int flag_cs, uint64_t philox_counter, - float friction, float *lb_boundary_velocity) { +__device__ void calc_viscous_force( + LB_nodes_gpu n_a, Utils::Array &delta, + CUDA_particle_data *particle_data, float *particle_force, + unsigned int part_index, float *delta_j, + 
Utils::Array &node_index, LB_rho_v_gpu *d_v, + int flag_cs, uint64_t philox_counter, float friction) { // Zero out workspace #pragma unroll for (int jj = 0; jj < 3; ++jj) { @@ -1614,8 +1582,8 @@ calc_viscous_force(LB_nodes_gpu n_a, flag_cs * direction * particle_data[part_index].swim.director[2]; #endif - float3 const interpolated_u = velocity_interpolation( - n_a, position, lb_boundary_velocity, node_index, delta); + float3 const interpolated_u = + velocity_interpolation(n_a, position, node_index, delta); #ifdef ENGINE velocity[0] -= particle_data[part_index].swim.v_swim * @@ -2230,15 +2198,15 @@ __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_rho_v_gpu *d_v, * @param[out] node_f Local node force * @param[in] d_v Local device values * @param[in] friction Friction constant for the particle coupling - * @param[in] lb_boundary_velocity Velocity at the boundary * @tparam no_of_neighbours The number of neighbours to consider for * interpolation */ template -__global__ void calc_fluid_particle_ia( - LB_nodes_gpu n_a, CUDA_particle_data *particle_data, float *particle_force, - LB_node_force_density_gpu node_f, LB_rho_v_gpu *d_v, bool couple_virtual, - uint64_t philox_counter, float friction, float *lb_boundary_velocity) { +__global__ void +calc_fluid_particle_ia(LB_nodes_gpu n_a, CUDA_particle_data *particle_data, + float *particle_force, LB_node_force_density_gpu node_f, + LB_rho_v_gpu *d_v, bool couple_virtual, + uint64_t philox_counter, float friction) { unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; @@ -2254,14 +2222,14 @@ __global__ void calc_fluid_particle_ia( * force that acts back onto the fluid. 
*/ calc_viscous_force( n_a, delta, particle_data, particle_force, part_index, delta_j, - node_index, d_v, 0, philox_counter, friction, lb_boundary_velocity); + node_index, d_v, 0, philox_counter, friction); calc_node_force(delta, delta_j, node_index, node_f); #ifdef ENGINE if (particle_data[part_index].swim.swimming) { calc_viscous_force( n_a, delta, particle_data, particle_force, part_index, delta_j, - node_index, d_v, 1, philox_counter, friction, lb_boundary_velocity); + node_index, d_v, 1, philox_counter, friction); calc_node_force(delta, delta_j, node_index, node_f); } #endif @@ -2540,7 +2508,11 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, if (this_node != 0) return; - size_of_boundindex = number_of_boundnodes * sizeof(int); + float *boundary_velocity = nullptr; + int *boundary_node_list = nullptr; + int *boundary_index_list = nullptr; + + auto const size_of_boundindex = number_of_boundnodes * sizeof(int); cuda_safe_mem(cudaMalloc((void **)&boundary_node_list, size_of_boundindex)); cuda_safe_mem(cudaMalloc((void **)&boundary_index_list, size_of_boundindex)); cuda_safe_mem(cudaMemcpy(boundary_index_list, host_boundary_index_list, @@ -2549,10 +2521,10 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, size_of_boundindex, cudaMemcpyHostToDevice)); cuda_safe_mem(cudaMalloc((void **)&lb_boundary_force, 3 * host_n_lb_boundaries * sizeof(float))); - cuda_safe_mem(cudaMalloc((void **)&lb_boundary_velocity, + cuda_safe_mem(cudaMalloc((void **)&boundary_velocity, 3 * host_n_lb_boundaries * sizeof(float))); cuda_safe_mem( - cudaMemcpy(lb_boundary_velocity, host_lb_boundary_velocity, + cudaMemcpy(boundary_velocity, host_lb_boundary_velocity, 3 * LBBoundaries::lbboundaries.size() * sizeof(float), cudaMemcpyHostToDevice)); @@ -2585,10 +2557,14 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1); 
KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound, - boundary_node_list, boundary_index_list, lb_boundary_velocity, + boundary_node_list, boundary_index_list, boundary_velocity, number_of_boundnodes, boundaries); } + cudaFree(boundary_velocity); + cudaFree(boundary_node_list); + cudaFree(boundary_index_list); + cudaDeviceSynchronize(); } #endif @@ -2669,15 +2645,13 @@ void lb_calc_particle_lattice_ia_gpu(bool couple_virtual, double friction) { threads_per_block_particles, *current_nodes, gpu_get_particle_pointer(), gpu_get_particle_force_pointer(), node_f, device_rho_v, couple_virtual, - rng_counter_coupling_gpu->value(), friction, - lb_boundary_velocity); + rng_counter_coupling_gpu->value(), friction); } else { // We use a dummy value for the RNG counter if no temperature is set. KERNELCALL(calc_fluid_particle_ia, dim_grid_particles, threads_per_block_particles, *current_nodes, gpu_get_particle_pointer(), gpu_get_particle_force_pointer(), - node_f, device_rho_v, couple_virtual, 0, friction, - lb_boundary_velocity); + node_f, device_rho_v, couple_virtual, 0, friction); } } } @@ -3052,17 +3026,14 @@ void lb_lbfluid_get_population(const Utils::Vector3i &xyz, template struct interpolation { LB_nodes_gpu current_nodes_gpu; LB_rho_v_gpu *d_v_gpu; - float *lb_boundary_velocity; - interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu, - float *lb_boundary_velocity) - : current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu), - lb_boundary_velocity(lb_boundary_velocity){}; + interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu) + : current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu) {} __device__ float3 operator()(const float3 &position) const { float _position[3] = {position.x, position.y, position.z}; Utils::Array node_indices; Utils::Array delta; - return velocity_interpolation(current_nodes_gpu, _position, - lb_boundary_velocity, node_indices, delta); + return velocity_interpolation(current_nodes_gpu, _position, 
node_indices, + delta); } }; @@ -3078,10 +3049,10 @@ void lb_get_interpolated_velocity_gpu(double const *positions, } thrust::device_vector positions_device = positions_host; thrust::device_vector velocities_device(length); - thrust::transform(positions_device.begin(), positions_device.end(), - velocities_device.begin(), - interpolation( - *current_nodes, device_rho_v, lb_boundary_velocity)); + thrust::transform( + positions_device.begin(), positions_device.end(), + velocities_device.begin(), + interpolation(*current_nodes, device_rho_v)); thrust::host_vector velocities_host = velocities_device; int index = 0; for (auto v : velocities_host) {