Lbgpu node vel #2878

Merged: 10 commits, Jul 11, 2019
src/core/grid_based_algorithms/lbgpu_cuda.cu: 179 changes (75 additions, 104 deletions)
@@ -100,16 +100,6 @@ static LB_extern_nodeforcedensity_gpu *extern_node_force_densities = nullptr;
/** @brief Force on the boundary nodes */
static float *lb_boundary_force = nullptr;

/** @brief Velocity at the boundary */
static float *lb_boundary_velocity = nullptr;

/** @name pointers for bound index array */
/*@{*/
static int *boundary_node_list = nullptr;
static int *boundary_index_list = nullptr;
static size_t size_of_boundindex = 0;
/*@}*/

/** @name pointers for additional cuda check flag */
/*@{*/
static int *gpu_check = nullptr;
@@ -1371,9 +1361,27 @@ __device__ __inline__ float three_point_polynomial_larger_than_half(float u) {
(5.f + -3 * fabsf(u) - sqrtf(-2.f + 6.f * fabsf(u) - 3.f * u * u));
}
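
This is the outer branch (1/2 < |u| <= 3/2) of the standard three-point interpolation stencil. For reference, the inner branch has the complementary form; a minimal sketch, with the function name chosen here only for illustration:

// Sketch, not part of this diff: inner branch of the three-point stencil,
// valid for |u| <= 1/2. Together with the branch above, the weights of the
// three nearest lattice planes per direction sum to one.
__device__ __inline__ float three_point_polynomial_smaller_than_half(float u) {
  return 1.f / 3.f * (1.f + sqrtf(1.f - 3.f * u * u));
}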

/**
 * @brief Get the fluid velocity at a given node index.
 *
 * For boundary nodes, the prescribed boundary velocity is returned instead.
 *
 * @param[in] rho_eq Equilibrium fluid density
 * @param[in] n_a    Local node residing in array a
 * @param[in] index  Node index
 */
__device__ __inline__ float3 node_velocity(float rho_eq, LB_nodes_gpu n_a,
int index) {
auto const boundary_index = n_a.boundary[index];

if (boundary_index) {
auto const &u = n_a.boundary_velocity[index];
return make_float3(u[0], u[1], u[2]);
}

auto const rho = rho_eq + calc_mode_x_from_n(n_a, index, 0);
return make_float3(calc_mode_x_from_n(n_a, index, 1) / rho,
calc_mode_x_from_n(n_a, index, 2) / rho,
calc_mode_x_from_n(n_a, index, 3) / rho);
}
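
For boundary nodes, node_velocity returns n_a.boundary_velocity[index] as stored, whereas the removed 27-point path shown further down converted the per-boundary slip velocity from MD to lattice units on the fly with para->tau / para->agrid. A minimal sketch of that conversion, with md_to_lattice_units as a hypothetical helper name:

// Sketch, not part of this diff: convert an MD-units velocity to lattice
// units (u_lattice = u_MD * tau / agrid), as the removed interpolation code
// did inline; the per-node boundary_velocity is assumed to be stored
// pre-converted.
__device__ __inline__ float3 md_to_lattice_units(float3 u_md, float tau,
                                                 float agrid) {
  float const factor = tau / agrid;
  return make_float3(u_md.x * factor, u_md.y * factor, u_md.z * factor);
}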

__device__ __inline__ float3
velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
float *lb_boundary_velocity,
Utils::Array<unsigned int, 27> &node_indices,
Utils::Array<float, 27> &delta) {
Utils::Array<int, 3> center_node_index{};
@@ -1431,31 +1439,14 @@ velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
auto const z =
fold_if_necessary(center_node_index[2] - 1 + k, para->dim_z);
delta[cnt] = temp_delta[i].x * temp_delta[j].y * temp_delta[k].z;
node_indices[cnt] = xyz_to_index(x, y, z);
auto const boundary_index = n_a.boundary[node_indices[cnt]];
if (not boundary_index) {
float totmass = 0.0f;
auto const mass_mode = calc_mode_x_from_n(n_a, node_indices[cnt], 0);

totmass += mass_mode + para->rho;

auto const j_x = calc_mode_x_from_n(n_a, node_indices[cnt], 1);
auto const j_y = calc_mode_x_from_n(n_a, node_indices[cnt], 2);
auto const j_z = calc_mode_x_from_n(n_a, node_indices[cnt], 3);
interpolated_u.x += (j_x / totmass) * delta[cnt];
interpolated_u.y += (j_y / totmass) * delta[cnt];
interpolated_u.z += (j_z / totmass) * delta[cnt];
} else {
interpolated_u.x +=
lb_boundary_velocity[3 * (boundary_index - 1) + 0] * para->tau /
para->agrid * delta[cnt];
interpolated_u.y +=
lb_boundary_velocity[3 * (boundary_index - 1) + 1] * para->tau /
para->agrid * delta[cnt];
interpolated_u.z +=
lb_boundary_velocity[3 * (boundary_index - 1) + 2] * para->tau /
para->agrid * delta[cnt];
}
auto const index = xyz_to_index(x, y, z);
node_indices[cnt] = index;

auto const node_u = node_velocity(para->rho, n_a, index);
interpolated_u.x += delta[cnt] * node_u.x;
interpolated_u.y += delta[cnt] * node_u.y;
interpolated_u.z += delta[cnt] * node_u.z;

++cnt;
}
}
@@ -1469,12 +1460,12 @@
* @param[in] particle_position Particle position
* @param[out] node_index Indices of the 8 nodes surrounding the particle
* @param[out] delta Weighting of particle position
* @param[in] lb_boundary_velocity Velocity at the boundary
* @return Interpolated velocity
*/
__device__ __inline__ float3 velocity_interpolation(
LB_nodes_gpu n_a, float *particle_position, float *lb_boundary_velocity,
Utils::Array<unsigned int, 8> &node_index, Utils::Array<float, 8> &delta) {
__device__ __inline__ float3
velocity_interpolation(LB_nodes_gpu n_a, float *particle_position,
Utils::Array<unsigned int, 8> &node_index,
Utils::Array<float, 8> &delta) {
Utils::Array<int, 3> left_node_index;
Utils::Array<float, 6> temp_delta;
// see Ahlrichs and Duenweg (J. Chem. Phys. 111, 8225 (1999)), page 8227, Eqs. (10) and (11)
@@ -1497,9 +1488,9 @@ __device__ __inline__ float3 velocity_interpolation(

// In C++, the result of % is negative when the dividend is negative (e.g.
// -1 % 10 == -1), so shift by one period to make sure the operand is positive
int x = (left_node_index[0] + para->dim_x) % para->dim_x;
int y = (left_node_index[1] + para->dim_y) % para->dim_y;
int z = (left_node_index[2] + para->dim_z) % para->dim_z;
int const x = (left_node_index[0] + para->dim_x) % para->dim_x;
int const y = (left_node_index[1] + para->dim_y) % para->dim_y;
int const z = (left_node_index[2] + para->dim_z) % para->dim_z;
auto xp1 = x + 1;
auto yp1 = y + 1;
auto zp1 = z + 1;
@@ -1521,30 +1512,10 @@ __device__ __inline__ float3 velocity_interpolation(
float3 interpolated_u{0.0f, 0.0f, 0.0f};
#pragma unroll
for (int i = 0; i < 8; ++i) {
float totmass = 0.0f;
Utils::Array<float, 19> mode;

calc_m_from_n(n_a, node_index[i], mode);
auto const mass_mode = calc_mode_x_from_n(n_a, node_index[i], 0);

totmass += mass_mode + para->rho;

/* The boolean expression (n_a.boundary[node_index[i]] == 0) causes boundary
nodes to couple with velocity 0 to particles. This is necessary, since
boundary nodes undergo the same LB dynamics as fluid nodes do. The flow
within the boundaries does not interact with the physical fluid, since
these populations are overwritten by the bounce back kernel. Particles
close to walls can couple to this unphysical flow, though.
*/
auto const j_x = calc_mode_x_from_n(n_a, node_index[i], 1);
auto const j_y = calc_mode_x_from_n(n_a, node_index[i], 2);
auto const j_z = calc_mode_x_from_n(n_a, node_index[i], 3);
interpolated_u.x +=
(j_x / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0);
interpolated_u.y +=
(j_y / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0);
interpolated_u.z +=
(j_z / totmass) * delta[i] * (n_a.boundary[node_index[i]] == 0);
auto const node_u = node_velocity(para->rho, n_a, node_index[i]);
interpolated_u.x += delta[i] * node_u.x;
interpolated_u.y += delta[i] * node_u.y;
interpolated_u.z += delta[i] * node_u.z;
}
return interpolated_u;
}
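
The eight delta weights used above are the trilinear products from Ahlrichs and Duenweg, Eqs. (10) and (11). A sketch of those weights, assuming fractional offsets x, y, z in [0, 1) from the lower-left node and the node ordering implied by node_index (x varying fastest):

// Sketch, not part of this diff: trilinear weights for the 8 surrounding
// nodes; the kernel above assembles the same products from temp_delta.
__device__ __inline__ void trilinear_weights(float x, float y, float z,
                                             Utils::Array<float, 8> &w) {
  w[0] = (1.f - x) * (1.f - y) * (1.f - z); // node (x  , y  , z  )
  w[1] = x * (1.f - y) * (1.f - z);         // node (x+1, y  , z  )
  w[2] = (1.f - x) * y * (1.f - z);         // node (x  , y+1, z  )
  w[3] = x * y * (1.f - z);                 // node (x+1, y+1, z  )
  w[4] = (1.f - x) * (1.f - y) * z;         // node (x  , y  , z+1)
  w[5] = x * (1.f - y) * z;                 // node (x+1, y  , z+1)
  w[6] = (1.f - x) * y * z;                 // node (x  , y+1, z+1)
  w[7] = x * y * z;                         // node (x+1, y+1, z+1)
}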
@@ -1562,19 +1533,16 @@ __device__ __inline__ float3 velocity_interpolation(
* @param[in] flag_cs Determines whether we are at the centre (0,
* typical) or at the source (1, swimmer only)
* @param[in] friction Friction constant for the particle coupling
* @param[in] lb_boundary_velocity Velocity at the boundary
* @tparam no_of_neighbours The number of neighbours to consider for
* interpolation
*/
template <std::size_t no_of_neighbours>
__device__ void
calc_viscous_force(LB_nodes_gpu n_a,
Utils::Array<float, no_of_neighbours> &delta,
CUDA_particle_data *particle_data, float *particle_force,
unsigned int part_index, float *delta_j,
Utils::Array<unsigned int, no_of_neighbours> &node_index,
LB_rho_v_gpu *d_v, int flag_cs, uint64_t philox_counter,
float friction, float *lb_boundary_velocity) {
__device__ void calc_viscous_force(
LB_nodes_gpu n_a, Utils::Array<float, no_of_neighbours> &delta,
CUDA_particle_data *particle_data, float *particle_force,
unsigned int part_index, float *delta_j,
Utils::Array<unsigned int, no_of_neighbours> &node_index, LB_rho_v_gpu *d_v,
int flag_cs, uint64_t philox_counter, float friction) {
// Zero out workspace
#pragma unroll
for (int jj = 0; jj < 3; ++jj) {
@@ -1614,8 +1582,8 @@ calc_viscous_force(LB_nodes_gpu n_a,
flag_cs * direction * particle_data[part_index].swim.director[2];
#endif

float3 const interpolated_u = velocity_interpolation(
n_a, position, lb_boundary_velocity, node_index, delta);
float3 const interpolated_u =
velocity_interpolation(n_a, position, node_index, delta);

#ifdef ENGINE
velocity[0] -= particle_data[part_index].swim.v_swim *
@@ -2230,15 +2198,15 @@ __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_rho_v_gpu *d_v,
* @param[out] node_f Local node force
* @param[in] d_v Local device values
* @param[in] friction Friction constant for the particle coupling
* @param[in] lb_boundary_velocity Velocity at the boundary
* @tparam no_of_neighbours The number of neighbours to consider for
* interpolation
*/
template <std::size_t no_of_neighbours>
__global__ void calc_fluid_particle_ia(
LB_nodes_gpu n_a, CUDA_particle_data *particle_data, float *particle_force,
LB_node_force_density_gpu node_f, LB_rho_v_gpu *d_v, bool couple_virtual,
uint64_t philox_counter, float friction, float *lb_boundary_velocity) {
__global__ void
calc_fluid_particle_ia(LB_nodes_gpu n_a, CUDA_particle_data *particle_data,
float *particle_force, LB_node_force_density_gpu node_f,
LB_rho_v_gpu *d_v, bool couple_virtual,
uint64_t philox_counter, float friction) {

unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x +
blockDim.x * blockIdx.x + threadIdx.x;
@@ -2254,14 +2222,14 @@ __global__ void calc_fluid_particle_ia(
* force that acts back onto the fluid. */
calc_viscous_force<no_of_neighbours>(
n_a, delta, particle_data, particle_force, part_index, delta_j,
node_index, d_v, 0, philox_counter, friction, lb_boundary_velocity);
node_index, d_v, 0, philox_counter, friction);
calc_node_force<no_of_neighbours>(delta, delta_j, node_index, node_f);

#ifdef ENGINE
if (particle_data[part_index].swim.swimming) {
calc_viscous_force<no_of_neighbours>(
n_a, delta, particle_data, particle_force, part_index, delta_j,
node_index, d_v, 1, philox_counter, friction, lb_boundary_velocity);
node_index, d_v, 1, philox_counter, friction);
calc_node_force<no_of_neighbours>(delta, delta_j, node_index, node_f);
}
#endif
@@ -2540,7 +2508,11 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
if (this_node != 0)
return;

size_of_boundindex = number_of_boundnodes * sizeof(int);
float *boundary_velocity = nullptr;
int *boundary_node_list = nullptr;
int *boundary_index_list = nullptr;

auto const size_of_boundindex = number_of_boundnodes * sizeof(int);
cuda_safe_mem(cudaMalloc((void **)&boundary_node_list, size_of_boundindex));
cuda_safe_mem(cudaMalloc((void **)&boundary_index_list, size_of_boundindex));
cuda_safe_mem(cudaMemcpy(boundary_index_list, host_boundary_index_list,
@@ -2549,10 +2521,10 @@
size_of_boundindex, cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMalloc((void **)&lb_boundary_force,
3 * host_n_lb_boundaries * sizeof(float)));
cuda_safe_mem(cudaMalloc((void **)&lb_boundary_velocity,
cuda_safe_mem(cudaMalloc((void **)&boundary_velocity,
3 * host_n_lb_boundaries * sizeof(float)));
cuda_safe_mem(
cudaMemcpy(lb_boundary_velocity, host_lb_boundary_velocity,
cudaMemcpy(boundary_velocity, host_lb_boundary_velocity,
3 * LBBoundaries::lbboundaries.size() * sizeof(float),
cudaMemcpyHostToDevice));

@@ -2585,10 +2557,14 @@ void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes,
make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1);

KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound,
boundary_node_list, boundary_index_list, lb_boundary_velocity,
boundary_node_list, boundary_index_list, boundary_velocity,
number_of_boundnodes, boundaries);
}

cudaFree(boundary_velocity);
cudaFree(boundary_node_list);
cudaFree(boundary_index_list);

cudaDeviceSynchronize();
}
#endif
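
With the boundary buffers now local to lb_init_boundaries_GPU and freed explicitly, an equivalent RAII formulation with thrust::device_vector (Thrust is already used further down in this file) would make the cudaFree calls unnecessary. A sketch, under the assumption that only the raw pointers are needed at the kernel launch:

// Sketch, not part of this diff: RAII device buffers; storage is released
// automatically when the vectors go out of scope.
thrust::device_vector<float> boundary_velocity(3 * host_n_lb_boundaries);
thrust::device_vector<int> boundary_index_list(number_of_boundnodes);
thrust::copy(host_boundary_index_list,
             host_boundary_index_list + number_of_boundnodes,
             boundary_index_list.begin());
// Kernels still receive raw device pointers:
float *velocity_ptr = thrust::raw_pointer_cast(boundary_velocity.data());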
Expand Down Expand Up @@ -2669,15 +2645,13 @@ void lb_calc_particle_lattice_ia_gpu(bool couple_virtual, double friction) {
threads_per_block_particles, *current_nodes,
gpu_get_particle_pointer(), gpu_get_particle_force_pointer(),
node_f, device_rho_v, couple_virtual,
rng_counter_coupling_gpu->value(), friction,
lb_boundary_velocity);
rng_counter_coupling_gpu->value(), friction);
} else {
// We use a dummy value for the RNG counter if no temperature is set.
KERNELCALL(calc_fluid_particle_ia<no_of_neighbours>, dim_grid_particles,
threads_per_block_particles, *current_nodes,
gpu_get_particle_pointer(), gpu_get_particle_force_pointer(),
node_f, device_rho_v, couple_virtual, 0, friction,
lb_boundary_velocity);
node_f, device_rho_v, couple_virtual, 0, friction);
}
}
}
@@ -3052,17 +3026,14 @@ void lb_lbfluid_get_population(const Utils::Vector3i &xyz,
template <std::size_t no_of_neighbours> struct interpolation {
LB_nodes_gpu current_nodes_gpu;
LB_rho_v_gpu *d_v_gpu;
float *lb_boundary_velocity;
interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu,
float *lb_boundary_velocity)
: current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu),
lb_boundary_velocity(lb_boundary_velocity){};
interpolation(LB_nodes_gpu _current_nodes_gpu, LB_rho_v_gpu *_d_v_gpu)
: current_nodes_gpu(_current_nodes_gpu), d_v_gpu(_d_v_gpu) {}
__device__ float3 operator()(const float3 &position) const {
float _position[3] = {position.x, position.y, position.z};
Utils::Array<unsigned int, no_of_neighbours> node_indices;
Utils::Array<float, no_of_neighbours> delta;
return velocity_interpolation(current_nodes_gpu, _position,
lb_boundary_velocity, node_indices, delta);
return velocity_interpolation(current_nodes_gpu, _position, node_indices,
delta);
}
};

@@ -3078,10 +3049,10 @@ void lb_get_interpolated_velocity_gpu(double const *positions,
}
thrust::device_vector<float3> positions_device = positions_host;
thrust::device_vector<float3> velocities_device(length);
thrust::transform(positions_device.begin(), positions_device.end(),
velocities_device.begin(),
interpolation<no_of_neighbours>(
*current_nodes, device_rho_v, lb_boundary_velocity));
thrust::transform(
positions_device.begin(), positions_device.end(),
velocities_device.begin(),
interpolation<no_of_neighbours>(*current_nodes, device_rho_v));
thrust::host_vector<float3> velocities_host = velocities_device;
int index = 0;
for (auto v : velocities_host) {