diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu index 4076755..05f8272 100644 --- a/cuda_rasterizer/backward.cu +++ b/cuda_rasterizer/backward.cu @@ -170,7 +170,7 @@ renderCUDA( const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; const uint32_t pix_id = W * pix.y + pix.x; - const float2 pixf = { (float)pix.x + 0.5, (float)pix.y + 0.5}; + const float2 pixf = {(float)pix.x, (float)pix.y}; const bool inside = pix.x < W&& pix.y < H; const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; @@ -453,9 +453,9 @@ inline __device__ void computeTransMat( const glm::vec4 & quat, const glm::vec2 & scale, const float* viewmat, - const float4 & intrins, - float tan_fovx, - float tan_fovy, + const float* projmat, + const int W, + const int H, const float* transMat, const float* dL_dtransMat, const float* dL_dnormal3D, @@ -463,69 +463,56 @@ inline __device__ void computeTransMat( glm::vec2 & dL_dscale, glm::vec4 & dL_drot ) { - // camera information - const glm::mat3 W = glm::mat3( - viewmat[0],viewmat[1],viewmat[2], - viewmat[4],viewmat[5],viewmat[6], - viewmat[8],viewmat[9],viewmat[10] - ); // viewmat - - const glm::vec3 cam_pos = glm::vec3(viewmat[12], viewmat[13], viewmat[14]); // camera center - const glm::mat4 P = glm::mat4( - intrins.x, 0.0, 0.0, 0.0, - 0.0, intrins.y, 0.0, 0.0, - intrins.z, intrins.w, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0 + glm::mat4 world2ndc = glm::mat4( + projmat[0], projmat[4], projmat[8], projmat[12], + projmat[1], projmat[5], projmat[9], projmat[13], + projmat[2], projmat[6], projmat[10], projmat[14], + projmat[3], projmat[7], projmat[11], projmat[15] ); - glm::mat3 S = scale_to_mat({scale.x, scale.y, 1.0f}, 1.0f); - glm::mat3 R = quat_to_rotmat(quat); - glm::mat3 RS = R * S; - glm::vec3 p_view = W * p_world + cam_pos; - glm::mat3 M = glm::mat3(W * RS[0], W * RS[1], p_view); + glm::mat3x4 ndc2pix = glm::mat3x4( + glm::vec4(float(W) / 2.0, 0.0, 0.0, float(W-1) / 2.0), + glm::vec4(0.0, float(H) / 2.0, 0.0, float(H-1) / 2.0), + glm::vec4(0.0, 0.0, 0.0, 1.0) + ); + glm::mat3x4 P = world2ndc * ndc2pix; - glm::mat4x3 dL_dT = glm::mat4x3( + glm::mat3 dL_dT = glm::mat3( dL_dtransMat[0], dL_dtransMat[1], dL_dtransMat[2], dL_dtransMat[3], dL_dtransMat[4], dL_dtransMat[5], - dL_dtransMat[6], dL_dtransMat[7], dL_dtransMat[8], - 0.0, 0.0, 0.0 + dL_dtransMat[6], dL_dtransMat[7], dL_dtransMat[8] ); - glm::mat3x4 dL_dM_aug = glm::transpose(P) * glm::transpose(dL_dT); - glm::mat3 dL_dM = glm::mat3( - glm::vec3(dL_dM_aug[0]), - glm::vec3(dL_dM_aug[1]), - glm::vec3(dL_dM_aug[2]) - ); + glm::mat3x4 dL_dsplat = P * glm::transpose(dL_dT); + + const glm::mat3 R = quat_to_rotmat(quat); - glm::mat3 W_t = glm::transpose(W); - glm::mat3 dL_dRS = W_t * dL_dM; - glm::vec3 dL_dRS0 = dL_dRS[0]; - glm::vec3 dL_dRS1 = dL_dRS[1]; - glm::vec3 dL_dpw = dL_dRS[2]; - glm::vec3 dL_dtn = W_t * glm::vec3(dL_dnormal3D[0], dL_dnormal3D[1], dL_dnormal3D[2]); - -#if DUAL_VISIABLE - glm::vec3 tn = W*R[2]; - float cos = glm::dot(-tn, M[2]); - float multiplier = cos > 0 ? 1 : -1; - dL_dtn *= multiplier; + float multiplier = 1; + +#if DUAL_VISIABLE + float3 normal = transformVec4x3({R[2].x, R[2].y, R[2].z}, viewmat); + multiplier = normal.z < 0 ? 1: -1; #endif - glm::mat3 dL_dR = glm::mat3( - dL_dRS0 * glm::vec3(scale.x), - dL_dRS1 * glm::vec3(scale.y), - dL_dtn + float3 dL_dtn = transformVec4x3Transpose({dL_dnormal3D[0],dL_dnormal3D[1],dL_dnormal3D[2]}, viewmat); + glm::mat3 dL_dRS = glm::mat3( + glm::vec3(dL_dsplat[0]), + glm::vec3(dL_dsplat[1]), + multiplier * glm::vec3(dL_dtn.x, dL_dtn.y, dL_dtn.z) ); + // propagate to scale and quat, mean + glm::mat3 dL_dR = glm::mat3( + dL_dRS[0] * glm::vec3(scale.x), + dL_dRS[1] * glm::vec3(scale.y), + dL_dRS[2]); + + dL_dmean3D = glm::vec3(dL_dsplat[2]); dL_drot = quat_to_rotmat_vjp(quat, dL_dR); dL_dscale = glm::vec2( - (float)glm::dot(dL_dRS0, R[0]), - (float)glm::dot(dL_dRS1, R[1]) - ); - - dL_dmean3D = dL_dpw; + (float)glm::dot(dL_dRS[0], R[0]), + (float)glm::dot(dL_dRS[1], R[1])); } @@ -562,35 +549,38 @@ __global__ void preprocessCUDA( if (idx >= P || !(radii[idx] > 0)) return; - const float* transMat = &(transMats[9 * idx]); - const float* dL_dtransMat = &(dL_dtransMats[9 * idx]); - const float* dL_dnormal3D = &(dL_dnormal3Ds[3 * idx]); - - glm::vec3 p_world = glm::vec3(means3D[idx].x, means3D[idx].y, means3D[idx].z); - float4 intrins = {focal_x, focal_y, focal_x * tan_fovx, focal_y * tan_fovy}; - - glm::vec3 dL_dmean3D; - glm::vec2 dL_dscale; - glm::vec4 dL_drot; - computeTransMat( - p_world, - rotations[idx], - scales[idx], - viewmatrix, - intrins, - tan_fovx, - tan_fovy, - transMat, - dL_dtransMat, - dL_dnormal3D, - dL_dmean3D, - dL_dscale, - dL_drot - ); - // update - dL_dmean3Ds[idx] = dL_dmean3D; - dL_dscales[idx] = dL_dscale; - dL_drots[idx] = dL_drot; + if (scales) { + const float* transMat = &(transMats[9 * idx]); + const float* dL_dtransMat = &(dL_dtransMats[9 * idx]); + const float* dL_dnormal3D = &(dL_dnormal3Ds[3 * idx]); + + glm::vec3 p_world = glm::vec3(means3D[idx].x, means3D[idx].y, means3D[idx].z); + + const int W = int(focal_x * tan_fovx * 2); + const int H = int(focal_y * tan_fovy * 2); + glm::vec3 dL_dmean3D; + glm::vec2 dL_dscale; + glm::vec4 dL_drot; + computeTransMat( + p_world, + rotations[idx], + scales[idx], + viewmatrix, + projmatrix, + W, + H, + transMat, + dL_dtransMat, + dL_dnormal3D, + dL_dmean3D, + dL_dscale, + dL_drot + ); + // update + dL_dmean3Ds[idx] = dL_dmean3D; + dL_dscales[idx] = dL_dscale; + dL_drots[idx] = dL_drot; + } if (shs) computeColorFromSH(idx, D, M, (glm::vec3*)means3D, *campos, shs, clamped, (glm::vec3*)dL_dcolors, (glm::vec3*)dL_dmean3Ds, (glm::vec3*)dL_dshs); diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu index 81cebb2..19605f9 100644 --- a/cuda_rasterizer/forward.cu +++ b/cuda_rasterizer/forward.cu @@ -72,47 +72,33 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const // Compute a 2D-to-2D mapping matrix from a tangent plane into a image plane // given a 2D gaussian parameters. -__device__ bool computeTransMat(const glm::vec3 &p_world, const glm::vec4 &quat, const glm::vec2 &scale, const float *viewmat, const float4 &intrins, float tan_fovx, float tan_fovy, float* transMat, float3 &normal) { - // Setup cameras - // Currently only support ideal pinhole camera - // but more advanced intrins can be implemented - const glm::mat3 W = glm::mat3( - viewmat[0],viewmat[1],viewmat[2], - viewmat[4],viewmat[5],viewmat[6], - viewmat[8],viewmat[9],viewmat[10] - ); - const glm::vec3 cam_pos = glm::vec3(viewmat[12], viewmat[13], viewmat[14]); // camera center - const glm::mat4 P = glm::mat4( - intrins.x, 0.0, 0.0, 0.0, - 0.0, intrins.y, 0.0, 0.0, - intrins.z, intrins.w, 1.0, 1.0, - 0.0, 0.0, 0.0, 0.0 +__device__ void computeTransMat(const glm::vec3 &p_world, const glm::vec4 &quat, const glm::vec2 &scale, const float *viewmat, const float*projmat, const int W, const int H, float* transMat, float3 &normal) { + // setup camera + // can be fatored out to reduce computations + glm::mat4 world2ndc = glm::mat4( + projmat[0], projmat[4], projmat[8], projmat[12], + projmat[1], projmat[5], projmat[9], projmat[13], + projmat[2], projmat[6], projmat[10], projmat[14], + projmat[3], projmat[7], projmat[11], projmat[15] ); + glm::mat3x4 ndc2pix = glm::mat3x4( + glm::vec4(float(W) / 2.0, 0.0, 0.0, float(W-1) / 2.0), + glm::vec4(0.0, float(H) / 2.0, 0.0, float(H-1) / 2.0), + glm::vec4(0.0, 0.0, 0.0, 1.0) + ); + + glm::mat3x4 P = world2ndc * ndc2pix; // Make the geometry of 2D Gaussian as a Homogeneous transformation matrix // under the camera view, See Eq. (5) in 2DGS' paper. - glm::vec3 p_view = W * p_world + cam_pos; - glm::mat3 R = quat_to_rotmat(quat) * scale_to_mat({scale.x, scale.y, 1.0f}, 1.0f); - glm::mat3 M = glm::mat3(W * R[0], W * R[1], p_view); - glm::vec3 tn = W*R[2]; - float cos = glm::dot(-tn, p_view); - -#if BACKFACE_CULL - if (cos == 0.0f) return false; -#endif - -#if RENDER_AXUTILITY and DUAL_VISIABLE - // This means a 2D Gaussian is dual visiable. - // Experimentally, turning off the dual visiable works eqully. - float multiplier = cos > 0 ? 1 : -1; - tn *= multiplier; -#endif - // projection into screen space, see Eq. (7) - glm::mat4x3 T = glm::transpose(P * glm::mat3x4( - glm::vec4(M[0], 0.0), - glm::vec4(M[1], 0.0), - glm::vec4(M[2], 1.0) - )); + glm::mat3 RS = quat_to_rotmat(quat) * scale_to_mat({scale.x, scale.y, 1.0f}, 1.0f); + glm::mat3x4 splat2world = glm::mat3x4( + glm::vec4(RS[0], 0.0), + glm::vec4(RS[1], 0.0), + glm::vec4(p_world, 1.0) + ); + // projection into screen space, see Eq. (7) in 2DGS + glm::mat3 T = glm::transpose(splat2world) * P; transMat[0] = T[0].x; transMat[1] = T[0].y; @@ -123,8 +109,15 @@ __device__ bool computeTransMat(const glm::vec3 &p_world, const glm::vec4 &quat, transMat[6] = T[2].x; transMat[7] = T[2].y; transMat[8] = T[2].z; - normal = {tn.x, tn.y, tn.z}; - return true; + + normal = transformVec4x3({RS[2].x, RS[2].y, RS[2].z}, viewmat); + +#if DUAL_VISIABLE + // This means a 2D Gaussian is dual visiable. + // Experimentally, turning off the dual visiable works eqully. + float multiplier = normal.z < 0 ? 1: -1; + normal = {multiplier * normal.x, multiplier * normal.y, multiplier * normal.z}; +#endif } // Computing the bounding box of the 2D Gaussian and its center, @@ -205,34 +198,29 @@ __global__ void preprocessCUDA(int P, int D, int M, if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; - float4 intrins = {focal_x, focal_y, float(W)/2.0, float(H)/2.0}; - glm::vec2 scale = scales[idx]; - glm::vec4 quat = rotations[idx]; - const float* transMat; - bool ok; float3 normal; if (transMat_precomp != nullptr) { transMat = transMat_precomp + idx * 9; + normal = {0.f, 0.f, 0.f}; // not support precomp normal } else { - ok = computeTransMat(p_world, quat, scale, viewmatrix, intrins, tan_fovx, tan_fovy, transMats + idx * 9, normal); - if (!ok) return; + computeTransMat(p_world, rotations[idx], scales[idx], viewmatrix, projmatrix, W, H, transMats + idx * 9, normal); transMat = transMats + idx * 9; } // compute center and extent float2 center; float2 extent; - ok = computeAABB(transMat, center, extent); + bool ok = computeAABB(transMat, center, extent); if (!ok) return; // add the bounding of countour #if TIGHTBBOX // no use in the paper, but it indeed help speeds. // the effective extent is now depended on the opacity of gaussian. - float truncated_R = sqrtf(max(9.f + logf(opacities[idx]), 0.000001)); + float truncated_R = sqrtf(max(9.f + 2.f * logf(opacities[idx]), 0.000001)); #else float truncated_R = 3.f; #endif @@ -287,7 +275,7 @@ renderCUDA( uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; uint32_t pix_id = W * pix.y + pix.x; - float2 pixf = { (float)pix.x + 0.5, (float)pix.y + 0.5}; + float2 pixf = { (float)pix.x, (float)pix.y}; // Check if this thread is associated with a valid pixel or outside. bool inside = pix.x < W&& pix.y < H; diff --git a/rasterize_points.cu b/rasterize_points.cu index 7eff4c7..c1426e8 100644 --- a/rasterize_points.cu +++ b/rasterize_points.cu @@ -62,13 +62,6 @@ RasterizeGaussiansCUDA( AT_ERROR("means3D must have dimensions (num_points, 3)"); } - if (scales.ndimension() != 2 || scales.size(1) != 2) { - AT_ERROR("scales must have dimensions (num_points, 2)"); - } - - if (rotations.ndimension() != 2 || rotations.size(1) != 4) { - AT_ERROR("rotations must have dimensions (num_points, 4)"); - } const int P = means3D.size(0); const int H = image_height; diff --git a/setup.py b/setup.py index 4d949c9..8cbdca9 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name="diff_surfel_rasterization", packages=['diff_surfel_rasterization'], - version='0.0.0', + version='0.0.1', ext_modules=[ CUDAExtension( name="diff_surfel_rasterization._C",