diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index cd5c5b10482..eb2ed35d76d 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -100,16 +100,6 @@ static LB_extern_nodeforcedensity_gpu *extern_node_force_densities = nullptr; /** @brief Force on the boundary nodes */ static float *lb_boundary_force = nullptr; -/** @brief Velocity at the boundary */ -static float *lb_boundary_velocity = nullptr; - -/** @name pointers for bound index array */ -/*@{*/ -static int *boundary_node_list = nullptr; -static int *boundary_index_list = nullptr; -static size_t size_of_boundindex = 0; -/*@}*/ - /** @name pointers for additional cuda check flag */ /*@{*/ static int *gpu_check = nullptr; @@ -1371,9 +1361,27 @@ __device__ __inline__ float three_point_polynomial_larger_than_half(float u) { (5.f + -3 * fabsf(u) - sqrtf(-2.f + 6.f * fabsf(u) - 3.f * u * u)); } +/** + * @brief Get velocity at index. 
+ * + */ +__device__ __inline__ float3 node_velocity(float rho_eq, LB_nodes_gpu n_a, + int index) { + auto const boundary_index = n_a.boundary[index]; + + if (boundary_index) { + auto const &u = n_a.boundary_velocity[index]; + return make_float3(u[0], u[1], u[2]); + } + + auto const rho = rho_eq + calc_mode_x_from_n(n_a, index, 0); + return make_float3(calc_mode_x_from_n(n_a, index, 1) / rho, + calc_mode_x_from_n(n_a, index, 2) / rho, + calc_mode_x_from_n(n_a, index, 3) / rho); +} + __device__ __inline__ float3 velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, - float *lb_boundary_velocity, Utils::Array &node_indices, Utils::Array &delta) { Utils::Array center_node_index{}; @@ -1431,31 +1439,14 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, auto const z = fold_if_necessary(center_node_index[2] - 1 + k, para->dim_z); delta[cnt] = temp_delta[i].x * temp_delta[j].y * temp_delta[k].z; - node_indices[cnt] = xyz_to_index(x, y, z); - auto const boundary_index = n_a.boundary[node_indices[cnt]]; - if (not boundary_index) { - float totmass = 0.0f; - auto const mass_mode = calc_mode_x_from_n(n_a, node_indices[cnt], 0); - - totmass += mass_mode + para->rho; - - auto const j_x = calc_mode_x_from_n(n_a, node_indices[cnt], 1); - auto const j_y = calc_mode_x_from_n(n_a, node_indices[cnt], 2); - auto const j_z = calc_mode_x_from_n(n_a, node_indices[cnt], 3); - interpolated_u.x += (j_x / totmass) * delta[cnt]; - interpolated_u.y += (j_y / totmass) * delta[cnt]; - interpolated_u.z += (j_z / totmass) * delta[cnt]; - } else { - interpolated_u.x += - lb_boundary_velocity[3 * (boundary_index - 1) + 0] * para->tau / - para->agrid * delta[cnt]; - interpolated_u.y += - lb_boundary_velocity[3 * (boundary_index - 1) + 1] * para->tau / - para->agrid * delta[cnt]; - interpolated_u.z += - lb_boundary_velocity[3 * (boundary_index - 1) + 2] * para->tau / - para->agrid * delta[cnt]; - } + auto const index = xyz_to_index(x, y, z); + node_indices[cnt] = index; 
+ + auto const node_u = node_velocity(para->rho, n_a, index); + interpolated_u.x += delta[cnt] * node_u.x; + interpolated_u.y += delta[cnt] * node_u.y; + interpolated_u.z += delta[cnt] * node_u.z; + ++cnt; } } @@ -1469,12 +1460,12 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, * @param[in] particle_position Particle position * @param[out] node_index Node index around (8) particle * @param[out] delta Weighting of particle position - * @param[in] lb_boundary_velocity Velocity at the boundary * @retval Interpolated velocity */ -__device__ __inline__ float3 velocity_interpolation( - LB_nodes_gpu n_a, float *particle_position, float *lb_boundary_velocity, - Utils::Array &node_index, Utils::Array &delta) { +__device__ __inline__ float3 +velocity_interpolation(LB_nodes_gpu n_a, float *particle_position, + Utils::Array &node_index, + Utils::Array &delta) { Utils::Array left_node_index; Utils::Array temp_delta; // see ahlrichs + duenweg page 8227 equ (10) and (11) @@ -1497,9 +1488,9 @@ __device__ __inline__ float3 velocity_interpolation( // modulo for negative numbers is strange at best, shift to make sure we are // positive - int x = (left_node_index[0] + para->dim_x) % para->dim_x; - int y = (left_node_index[1] + para->dim_y) % para->dim_y; - int z = (left_node_index[2] + para->dim_z) % para->dim_z; + int const x = (left_node_index[0] + para->dim_x) % para->dim_x; + int const y = (left_node_index[1] + para->dim_y) % para->dim_y; + int const z = (left_node_index[2] + para->dim_z) % para->dim_z; auto xp1 = x + 1; auto yp1 = y + 1; auto zp1 = z + 1; @@ -1521,30 +1512,10 @@ __device__ __inline__ float3 velocity_interpolation( float3 interpolated_u{0.0f, 0.0f, 0.0f}; #pragma unroll for (int i = 0; i < 8; ++i) { - float totmass = 0.0f; - Utils::Array mode; - - calc_m_from_n(n_a, node_index[i], mode); - auto const mass_mode = calc_mode_x_from_n(n_a, node_index[i], 0); - - totmass += mass_mode + para->rho; - - /* The boolean expression 
(n_a.boundary[node_index[i]] == 0) causes boundary - nodes to couple with velocity 0 to particles. This is necessary, since - boundary nodes undergo the same LB dynamics as fluid nodes do. The flow - within the boundaries does not interact with the physical fluid, since - these populations are overwritten by the bounce back kernel. Particles - close to walls can couple to this unphysical flow, though. - */ - auto const j_x = calc_mode_x_from_n(n_a, node_index[i], 1); - auto const j_y = calc_mode_x_from_n(n_a, node_index[i], 2); - auto const j_z = calc_mode_x_from_n(n_a, node_index[i], 3); - interpolated_u.x += - (j_x / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0); - interpolated_u.y += - (j_y / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0); - interpolated_u.z += - (j_z / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0); + auto const node_u = node_velocity(para->rho, n_a, node_index[i]); + interpolated_u.x += delta[i] * node_u.x; + interpolated_u.y += delta[i] * node_u.y; + interpolated_u.z += delta[i] * node_u.z; } return interpolated_u; } @@ -1562,19 +1533,16 @@ __device__ __inline__ float3 velocity_interpolation( * @param[in] flag_cs Determine if we are at the centre (0, * typical) or at the source (1, swimmer only) * @param[in] friction Friction constant for the particle coupling - * @param[in] lb_boundary_velocity Velocity at the boundary * @tparam no_of_neighbours The number of neighbours to consider for * interpolation */ template -__device__ void -calc_viscous_force(LB_nodes_gpu n_a, - Utils::Array &delta, - CUDA_particle_data *particle_data, float *particle_force, - unsigned int part_index, float *delta_j, - Utils::Array &node_index, - LB_rho_v_gpu *d_v, int flag_cs, uint64_t philox_counter, - float friction, float *lb_boundary_velocity) { +__device__ void calc_viscous_force( + LB_nodes_gpu n_a, Utils::Array &delta, + CUDA_particle_data *particle_data, float *particle_force, + unsigned int part_index, float *delta_j, + 
Utils::Array &node_index, LB_rho_v_gpu *d_v, + int flag_cs, uint64_t philox_counter, float friction) { // Zero out workspace #pragma unroll for (int jj = 0; jj < 3; ++jj) { @@ -1614,8 +1582,8 @@ calc_viscous_force(LB_nodes_gpu n_a, flag_cs * direction * particle_data[part_index].swim.director[2]; #endif - float3 const interpolated_u = velocity_interpolation( - n_a, position, lb_boundary_velocity, node_index, delta); + float3 const interpolated_u = + velocity_interpolation(n_a, position, node_index, delta); #ifdef ENGINE velocity[0] -= particle_data[part_index].swim.v_swim * @@ -2230,15 +2198,15 @@ __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_rho_v_gpu *d_v, * @param[out] node_f Local node force * @param[in] d_v Local device values * @param[in] friction Friction constant for the particle coupling - * @param[in] lb_boundary_velocity Velocity at the boundary * @tparam no_of_neighbours The number of neighbours to consider for * interpolation */ template -__global__ void calc_fluid_particle_ia( - LB_nodes_gpu n_a, CUDA_particle_data *particle_data, float *particle_force, - LB_node_force_density_gpu node_f, LB_rho_v_gpu *d_v, bool couple_virtual, - uint64_t philox_counter, float friction, float *lb_boundary_velocity) { +__global__ void +calc_fluid_particle_ia(LB_nodes_gpu n_a, CUDA_particle_data *particle_data, + float *particle_force, LB_node_force_density_gpu node_f, + LB_rho_v_gpu *d_v, bool couple_virtual, + uint64_t philox_counter, float friction) { unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; @@ -2254,14 +2222,14 @@ __global__ void calc_fluid_particle_ia( * force that acts back onto the fluid. 
*/ calc_viscous_force( n_a, delta, particle_data, particle_force, part_index, delta_j, - node_index, d_v, 0, philox_counter, friction, lb_boundary_velocity); + node_index, d_v, 0, philox_counter, friction); calc_node_force(delta, delta_j, node_index, node_f); #ifdef ENGINE if (particle_data[part_index].swim.swimming) { calc_viscous_force( n_a, delta, particle_data, particle_force, part_index, delta_j, - node_index, d_v, 1, philox_counter, friction, lb_boundary_velocity); + node_index, d_v, 1, philox_counter, friction); calc_node_force(delta, delta_j, node_index, node_f); } #endif @@ -2540,7 +2508,11 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, if (this_node != 0) return; - size_of_boundindex = number_of_boundnodes * sizeof(int); + float *boundary_velocity = nullptr; + int *boundary_node_list = nullptr; + int *boundary_index_list = nullptr; + + auto const size_of_boundindex = number_of_boundnodes * sizeof(int); cuda_safe_mem(cudaMalloc((void **)&boundary_node_list, size_of_boundindex)); cuda_safe_mem(cudaMalloc((void **)&boundary_index_list, size_of_boundindex)); cuda_safe_mem(cudaMemcpy(boundary_index_list, host_boundary_index_list, @@ -2549,10 +2521,10 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, size_of_boundindex, cudaMemcpyHostToDevice)); cuda_safe_mem(cudaMalloc((void **)&lb_boundary_force, 3 * host_n_lb_boundaries * sizeof(float))); - cuda_safe_mem(cudaMalloc((void **)&lb_boundary_velocity, + cuda_safe_mem(cudaMalloc((void **)&boundary_velocity, 3 * host_n_lb_boundaries * sizeof(float))); cuda_safe_mem( - cudaMemcpy(lb_boundary_velocity, host_lb_boundary_velocity, + cudaMemcpy(boundary_velocity, host_lb_boundary_velocity, 3 * LBBoundaries::lbboundaries.size() * sizeof(float), cudaMemcpyHostToDevice)); @@ -2585,10 +2557,14 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1); 
KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound, - boundary_node_list, boundary_index_list, lb_boundary_velocity, + boundary_node_list, boundary_index_list, boundary_velocity, number_of_boundnodes, boundaries); } + cudaFree(boundary_velocity); + cudaFree(boundary_node_list); + cudaFree(boundary_index_list); + cudaDeviceSynchronize(); } #endif @@ -2669,15 +2645,13 @@ void lb_calc_particle_lattice_ia_gpu(bool couple_virtual, double friction) { threads_per_block_particles, *current_nodes, gpu_get_particle_pointer(), gpu_get_particle_force_pointer(), node_f, device_rho_v, couple_virtual, - rng_counter_coupling_gpu->value(), friction, - lb_boundary_velocity); + rng_counter_coupling_gpu->value(), friction); } else { // We use a dummy value for the RNG counter if no temperature is set. KERNELCALL(calc_fluid_particle_ia, dim_grid_particles, threads_per_block_particles, *current_nodes, gpu_get_particle_pointer(), gpu_get_particle_force_pointer(), - node_f, device_rho_v, couple_virtual, 0, friction, - lb_boundary_velocity); + node_f, device_rho_v, couple_virtual, 0, friction); } } } @@ -3052,17 +3026,14 @@ void lb_lbfluid_get_population(const Utils::Vector3i &xyz, template struct interpolation { LB_nodes_gpu current_nodes_gpu; LB_rho_v_gpu *d_v_gpu; - float *lb_boundary_velocity; - interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu, - float *lb_boundary_velocity) - : current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu), - lb_boundary_velocity(lb_boundary_velocity){}; + interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu) + : current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu) {} __device__ float3 operator()(const float3 &position) const { float _position[3] = {position.x, position.y, position.z}; Utils::Array node_indices; Utils::Array delta; - return velocity_interpolation(current_nodes_gpu, _position, - lb_boundary_velocity, node_indices, delta); + return velocity_interpolation(current_nodes_gpu, _position, 
node_indices, + delta); } }; @@ -3078,10 +3049,10 @@ void lb_get_interpolated_velocity_gpu(double const *positions, } thrust::device_vector positions_device = positions_host; thrust::device_vector velocities_device(length); - thrust::transform(positions_device.begin(), positions_device.end(), - velocities_device.begin(), - interpolation( - *current_nodes, device_rho_v, lb_boundary_velocity)); + thrust::transform( + positions_device.begin(), positions_device.end(), + velocities_device.begin(), + interpolation(*current_nodes, device_rho_v)); thrust::host_vector velocities_host = velocities_device; int index = 0; for (auto v : velocities_host) {