diff --git a/cuda_rasterizer/backward.cu b/cuda_rasterizer/backward.cu
index 4076755..05f8272 100644
--- a/cuda_rasterizer/backward.cu
+++ b/cuda_rasterizer/backward.cu
@@ -170,7 +170,7 @@ renderCUDA(
 	const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
 	const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
 	const uint32_t pix_id = W * pix.y + pix.x;
-	const float2 pixf = { (float)pix.x + 0.5, (float)pix.y + 0.5};
+	const float2 pixf = {(float)pix.x, (float)pix.y};
 
 	const bool inside = pix.x < W&& pix.y < H;
 	const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
@@ -453,9 +453,9 @@ inline __device__ void computeTransMat(
 	const glm::vec4 & quat,
 	const glm::vec2 & scale,
 	const float* viewmat,
-	const float4 & intrins,
-	float tan_fovx, 
-	float tan_fovy,
+	const float* projmat,
+	const int W,
+	const int H,
 	const float* transMat,
 	const float* dL_dtransMat,
 	const float* dL_dnormal3D,
@@ -463,69 +463,56 @@ inline __device__ void computeTransMat(
 	glm::vec2 & dL_dscale,
 	glm::vec4 & dL_drot
 ) {
-	// camera information 
-	const glm::mat3 W = glm::mat3(
-		viewmat[0],viewmat[1],viewmat[2],
-		viewmat[4],viewmat[5],viewmat[6],
-		viewmat[8],viewmat[9],viewmat[10]
-	); // viewmat 
-
-	const glm::vec3 cam_pos = glm::vec3(viewmat[12], viewmat[13], viewmat[14]); // camera center
-	const glm::mat4 P = glm::mat4(
-		intrins.x, 0.0, 0.0, 0.0,
-		0.0, intrins.y, 0.0, 0.0,
-		intrins.z, intrins.w, 1.0, 1.0,
-		0.0, 0.0, 0.0, 0.0
+	glm::mat4 world2ndc = glm::mat4(
+		projmat[0], projmat[4], projmat[8], projmat[12],
+		projmat[1], projmat[5], projmat[9], projmat[13],
+		projmat[2], projmat[6], projmat[10], projmat[14],
+		projmat[3], projmat[7], projmat[11], projmat[15]
 	);
 
-	glm::mat3 S = scale_to_mat({scale.x, scale.y, 1.0f}, 1.0f);
-	glm::mat3 R = quat_to_rotmat(quat);
-	glm::mat3 RS = R * S;
-	glm::vec3 p_view = W * p_world + cam_pos;
-	glm::mat3 M = glm::mat3(W * RS[0], W * RS[1], p_view);
+	glm::mat3x4 ndc2pix = glm::mat3x4(
+		glm::vec4(float(W) / 2.0, 0.0, 0.0, float(W-1) / 2.0),
+		glm::vec4(0.0, float(H) / 2.0, 0.0, float(H-1) / 2.0),
+		glm::vec4(0.0, 0.0, 0.0, 1.0)
+	);
 
+	glm::mat3x4 P = world2ndc * ndc2pix;
 
-	glm::mat4x3 dL_dT = glm::mat4x3(
+	glm::mat3 dL_dT = glm::mat3(
 		dL_dtransMat[0], dL_dtransMat[1], dL_dtransMat[2],
 		dL_dtransMat[3], dL_dtransMat[4], dL_dtransMat[5],
-		dL_dtransMat[6], dL_dtransMat[7], dL_dtransMat[8],
-		0.0, 0.0, 0.0
+		dL_dtransMat[6], dL_dtransMat[7], dL_dtransMat[8]
 	);
 
-	glm::mat3x4 dL_dM_aug = glm::transpose(P) * glm::transpose(dL_dT);
-	glm::mat3 dL_dM = glm::mat3(
-		glm::vec3(dL_dM_aug[0]),
-		glm::vec3(dL_dM_aug[1]),
-		glm::vec3(dL_dM_aug[2])
-	);
+	glm::mat3x4 dL_dsplat = P * glm::transpose(dL_dT);
+
+	const glm::mat3 R = quat_to_rotmat(quat);
 
-	glm::mat3 W_t = glm::transpose(W);
-	glm::mat3 dL_dRS = W_t * dL_dM;
-	glm::vec3 dL_dRS0 = dL_dRS[0];
-	glm::vec3 dL_dRS1 = dL_dRS[1];
-	glm::vec3 dL_dpw = dL_dRS[2];
-	glm::vec3 dL_dtn = W_t * glm::vec3(dL_dnormal3D[0], dL_dnormal3D[1], dL_dnormal3D[2]);
-
-#if DUAL_VISIABLE
-	glm::vec3 tn = W*R[2];
-	float cos = glm::dot(-tn, M[2]);
-	float multiplier = cos > 0 ? 1 : -1;
-	dL_dtn *= multiplier;
+	float multiplier = 1;
+
+#if  DUAL_VISIABLE
+	float3 normal = transformVec4x3({R[2].x, R[2].y, R[2].z}, viewmat);
+	multiplier = normal.z < 0 ? 1: -1;
 #endif
 
-	glm::mat3 dL_dR = glm::mat3(
-		dL_dRS0 * glm::vec3(scale.x),
-		dL_dRS1 * glm::vec3(scale.y),
-		dL_dtn
+	float3 dL_dtn = transformVec4x3Transpose({dL_dnormal3D[0],dL_dnormal3D[1],dL_dnormal3D[2]}, viewmat);
+	glm::mat3 dL_dRS = glm::mat3(
+		glm::vec3(dL_dsplat[0]),
+		glm::vec3(dL_dsplat[1]),
+		multiplier * glm::vec3(dL_dtn.x, dL_dtn.y, dL_dtn.z)
 	);
 
+	// propagate to scale and quat, mean
+	glm::mat3 dL_dR = glm::mat3(
+		dL_dRS[0] * glm::vec3(scale.x),
+		dL_dRS[1] * glm::vec3(scale.y),
+		dL_dRS[2]);
+
+	dL_dmean3D = glm::vec3(dL_dsplat[2]);
 	dL_drot = quat_to_rotmat_vjp(quat, dL_dR);
 	dL_dscale = glm::vec2(
-		(float)glm::dot(dL_dRS0, R[0]),
-		(float)glm::dot(dL_dRS1, R[1])
-	);
-
-	dL_dmean3D = dL_dpw;
+		(float)glm::dot(dL_dRS[0], R[0]),
+		(float)glm::dot(dL_dRS[1], R[1]));
 }
 
 
@@ -562,35 +549,38 @@ __global__ void preprocessCUDA(
 	if (idx >= P || !(radii[idx] > 0))
 		return;
 
-	const float* transMat = &(transMats[9 * idx]);
-	const float* dL_dtransMat = &(dL_dtransMats[9 * idx]);
-	const float* dL_dnormal3D = &(dL_dnormal3Ds[3 * idx]);
-
-	glm::vec3 p_world = glm::vec3(means3D[idx].x, means3D[idx].y, means3D[idx].z);
-	float4 intrins = {focal_x, focal_y, focal_x * tan_fovx, focal_y * tan_fovy};
-
-	glm::vec3 dL_dmean3D;
-	glm::vec2 dL_dscale;
-	glm::vec4 dL_drot;
-	computeTransMat(
-		p_world,
-		rotations[idx],
-		scales[idx],
-		viewmatrix,
-		intrins,
-		tan_fovx,
-		tan_fovy,
-		transMat, 
-		dL_dtransMat,
-		dL_dnormal3D,
-		dL_dmean3D, 
-		dL_dscale,
-		dL_drot
-	);
-	// update 
-	dL_dmean3Ds[idx] = dL_dmean3D;
-	dL_dscales[idx] = dL_dscale;
-	dL_drots[idx] = dL_drot;
+	if (scales) {
+		const float* transMat = &(transMats[9 * idx]);
+		const float* dL_dtransMat = &(dL_dtransMats[9 * idx]);
+		const float* dL_dnormal3D = &(dL_dnormal3Ds[3 * idx]);
+
+		glm::vec3 p_world = glm::vec3(means3D[idx].x, means3D[idx].y, means3D[idx].z);
+
+		const int W = int(focal_x * tan_fovx * 2);
+		const int H = int(focal_y * tan_fovy * 2);
+		glm::vec3 dL_dmean3D;
+		glm::vec2 dL_dscale;
+		glm::vec4 dL_drot;
+		computeTransMat(
+			p_world,
+			rotations[idx],
+			scales[idx],
+			viewmatrix,
+			projmatrix,
+			W,
+			H,
+			transMat, 
+			dL_dtransMat,
+			dL_dnormal3D,
+			dL_dmean3D, 
+			dL_dscale,
+			dL_drot
+		);
+		// update 
+		dL_dmean3Ds[idx] = dL_dmean3D;
+		dL_dscales[idx] = dL_dscale;
+		dL_drots[idx] = dL_drot;
+	}
 
 	if (shs)
 		computeColorFromSH(idx, D, M, (glm::vec3*)means3D, *campos, shs, clamped, (glm::vec3*)dL_dcolors, (glm::vec3*)dL_dmean3Ds, (glm::vec3*)dL_dshs);
diff --git a/cuda_rasterizer/forward.cu b/cuda_rasterizer/forward.cu
index 81cebb2..19605f9 100644
--- a/cuda_rasterizer/forward.cu
+++ b/cuda_rasterizer/forward.cu
@@ -72,47 +72,33 @@ __device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const
 
 // Compute a 2D-to-2D mapping matrix from a tangent plane into a image plane
 // given a 2D gaussian parameters.
-__device__ bool computeTransMat(const glm::vec3 &p_world, const glm::vec4 &quat, const glm::vec2 &scale, const float *viewmat, const float4 &intrins, float tan_fovx, float tan_fovy, float* transMat, float3 &normal) {
-	// Setup cameras
-	// Currently only support ideal pinhole camera
-	// but more advanced intrins can be implemented
-	const glm::mat3 W = glm::mat3(
-		viewmat[0],viewmat[1],viewmat[2],
-		viewmat[4],viewmat[5],viewmat[6],
-		viewmat[8],viewmat[9],viewmat[10]
-	); 
-	const glm::vec3 cam_pos = glm::vec3(viewmat[12], viewmat[13], viewmat[14]); // camera center
-	const glm::mat4 P = glm::mat4(
-		intrins.x, 0.0, 0.0, 0.0,
-		0.0, intrins.y, 0.0, 0.0,
-		intrins.z, intrins.w, 1.0, 1.0,
-		0.0, 0.0, 0.0, 0.0
+__device__ void computeTransMat(const glm::vec3 &p_world, const glm::vec4 &quat, const glm::vec2 &scale, const float *viewmat, const float*projmat, const int W, const int H, float* transMat, float3 &normal) {
+	// setup camera
+	// can be fatored out to reduce computations
+	glm::mat4 world2ndc = glm::mat4(
+		projmat[0], projmat[4], projmat[8], projmat[12],
+		projmat[1], projmat[5], projmat[9], projmat[13],
+		projmat[2], projmat[6], projmat[10], projmat[14],
+		projmat[3], projmat[7], projmat[11], projmat[15]
 	);
 
+	glm::mat3x4 ndc2pix = glm::mat3x4(
+		glm::vec4(float(W) / 2.0, 0.0, 0.0, float(W-1) / 2.0),
+		glm::vec4(0.0, float(H) / 2.0, 0.0, float(H-1) / 2.0),
+		glm::vec4(0.0, 0.0, 0.0, 1.0)
+	);
+
+	glm::mat3x4 P = world2ndc * ndc2pix;
 	// Make the geometry of 2D Gaussian as a Homogeneous transformation matrix 
 	// under the camera view, See Eq. (5) in 2DGS' paper.
-	glm::vec3 p_view = W * p_world + cam_pos;
-	glm::mat3 R = quat_to_rotmat(quat) * scale_to_mat({scale.x, scale.y, 1.0f}, 1.0f);
-	glm::mat3 M = glm::mat3(W * R[0], W * R[1], p_view);
-	glm::vec3 tn = W*R[2];
-	float cos = glm::dot(-tn, p_view);
-
-#if BACKFACE_CULL
-	if (cos == 0.0f) return false;
-#endif
-
-#if RENDER_AXUTILITY and DUAL_VISIABLE
-	// This means a 2D Gaussian is dual visiable.
-	// Experimentally, turning off the dual visiable works eqully.
-	float multiplier = cos > 0 ? 1 : -1;
-	tn *= multiplier;
-#endif
-	// projection into screen space, see Eq. (7)
-	glm::mat4x3 T = glm::transpose(P * glm::mat3x4(
-		glm::vec4(M[0], 0.0),
-		glm::vec4(M[1], 0.0),
-		glm::vec4(M[2], 1.0)
-	));
+	glm::mat3 RS = quat_to_rotmat(quat) * scale_to_mat({scale.x, scale.y, 1.0f}, 1.0f);
+	glm::mat3x4 splat2world = glm::mat3x4(
+		glm::vec4(RS[0], 0.0),
+		glm::vec4(RS[1], 0.0),
+		glm::vec4(p_world, 1.0)
+	);
+	// projection into screen space, see Eq. (7) in 2DGS
+	glm::mat3 T = glm::transpose(splat2world) * P;
 
 	transMat[0] = T[0].x;
 	transMat[1] = T[0].y;
@@ -123,8 +109,15 @@ __device__ bool computeTransMat(const glm::vec3 &p_world, const glm::vec4 &quat,
 	transMat[6] = T[2].x;
 	transMat[7] = T[2].y;
 	transMat[8] = T[2].z;
-	normal = {tn.x, tn.y, tn.z};
-	return true;
+
+	normal = transformVec4x3({RS[2].x, RS[2].y, RS[2].z}, viewmat);
+
+#if DUAL_VISIABLE
+	// This means a 2D Gaussian is dual visiable.
+	// Experimentally, turning off the dual visiable works eqully.
+	float multiplier = normal.z < 0 ? 1: -1;
+	normal = {multiplier * normal.x, multiplier * normal.y, multiplier * normal.z};
+#endif
 }
 
 // Computing the bounding box of the 2D Gaussian and its center,
@@ -205,34 +198,29 @@ __global__ void preprocessCUDA(int P, int D, int M,
 	if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
 		return;
 	
-	float4 intrins = {focal_x, focal_y, float(W)/2.0, float(H)/2.0};
-	glm::vec2 scale = scales[idx];
-	glm::vec4 quat = rotations[idx];
-	
 	const float* transMat;
-	bool ok;
 	float3 normal;
 	if (transMat_precomp != nullptr)
 	{
 		transMat = transMat_precomp + idx * 9;
+		normal = {0.f, 0.f, 0.f}; // not support precomp normal
 	}
 	else
 	{
-		ok = computeTransMat(p_world, quat, scale, viewmatrix, intrins, tan_fovx, tan_fovy, transMats + idx * 9, normal);
-		if (!ok) return;
+		computeTransMat(p_world, rotations[idx], scales[idx], viewmatrix, projmatrix, W, H, transMats + idx * 9, normal);
 		transMat = transMats + idx * 9;
 	}
 	
 	//  compute center and extent
 	float2 center;
 	float2 extent;
-	ok = computeAABB(transMat, center, extent);
+	bool ok = computeAABB(transMat, center, extent);
 	if (!ok) return;
 
 	// add the bounding of countour
 #if TIGHTBBOX // no use in the paper, but it indeed help speeds.
 	// the effective extent is now depended on the opacity of gaussian.
-	float truncated_R = sqrtf(max(9.f + logf(opacities[idx]), 0.000001));
+	float truncated_R = sqrtf(max(9.f + 2.f * logf(opacities[idx]), 0.000001));
 #else
 	float truncated_R = 3.f;
 #endif
@@ -287,7 +275,7 @@ renderCUDA(
 	uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
 	uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
 	uint32_t pix_id = W * pix.y + pix.x;
-	float2 pixf = { (float)pix.x + 0.5, (float)pix.y + 0.5};
+	float2 pixf = { (float)pix.x, (float)pix.y};
 
 	// Check if this thread is associated with a valid pixel or outside.
 	bool inside = pix.x < W&& pix.y < H;
diff --git a/rasterize_points.cu b/rasterize_points.cu
index 7eff4c7..c1426e8 100644
--- a/rasterize_points.cu
+++ b/rasterize_points.cu
@@ -62,13 +62,6 @@ RasterizeGaussiansCUDA(
 	AT_ERROR("means3D must have dimensions (num_points, 3)");
   }
 
-  if (scales.ndimension() != 2 || scales.size(1) != 2) {
-	AT_ERROR("scales must have dimensions (num_points, 2)");
-  }
-
-  if (rotations.ndimension() != 2 || rotations.size(1) != 4) {
-	AT_ERROR("rotations must have dimensions (num_points, 4)");
-  }
   
   const int P = means3D.size(0);
   const int H = image_height;
diff --git a/setup.py b/setup.py
index 4d949c9..8cbdca9 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 setup(
     name="diff_surfel_rasterization",
     packages=['diff_surfel_rasterization'],
-    version='0.0.0',
+    version='0.0.1',
     ext_modules=[
         CUDAExtension(
             name="diff_surfel_rasterization._C",