Merge pull request #99 from NVlabs/reduce-nerf-memory

Reduce NeRF memory usage by ~1gb
NVlabs · Jan 30, 2022 · 16b6409 · 16b6409
2 parents d6c7241 + 9836695
commit 16b6409
Show file tree

Hide file tree

Showing 6 changed files with 206 additions and 138 deletions.
diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn
diff --git a/include/neural-graphics-primitives/marching_cubes.h b/include/neural-graphics-primitives/marching_cubes.h
@@ -21,10 +21,10 @@ NGP_NAMESPACE_BEGIN
 
 Eigen::Vector3i get_marching_cubes_res(uint32_t res_1d, const BoundingBox &aabb);
 
-void marching_cubes_gpu(BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float> &density, tcnn::GPUMemory<Eigen::Vector3f> &vert_out, tcnn::GPUMemory<uint32_t> &indices_out);
+void marching_cubes_gpu(tcnn::GPUMemory<char>& scratch_memory, BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Eigen::Vector3f>& vert_out, tcnn::GPUMemory<uint32_t>& indices_out);
 
 // computes the average of the 1ring of all verts, as homogenous coordinates
-void compute_mesh_1ring(const tcnn::GPUMemory<Eigen::Vector3f> &verts, const tcnn::GPUMemory<uint32_t> &indices, tcnn::GPUMemory<Eigen::Vector4f> &output_pos, tcnn::GPUMemory<Eigen::Vector3f> &output_normals);
+void compute_mesh_1ring(const tcnn::GPUMemory<Eigen::Vector3f>& verts, const tcnn::GPUMemory<uint32_t>& indices, tcnn::GPUMemory<Eigen::Vector4f>& output_pos, tcnn::GPUMemory<Eigen::Vector3f>& output_normals);
 
 void compute_mesh_opt_gradients(float thresh,
 	const tcnn::GPUMemory<Eigen::Vector3f>& verts, const tcnn::GPUMemory<Eigen::Vector3f>& vert_normals,

diff --git a/include/neural-graphics-primitives/nerf.h b/include/neural-graphics-primitives/nerf.h
@@ -34,18 +34,22 @@ struct NerfPayload {
 };
 
 struct RaysNerfSoa {
-	void enlarge(size_t n_elements) {
-		rgba.enlarge(n_elements);
-		payload.enlarge(n_elements);
-	}
 #ifdef __NVCC__
 	void copy_from_other_async(const RaysNerfSoa& other, cudaStream_t stream) {
-		CUDA_CHECK_THROW(cudaMemcpyAsync(rgba.data(), other.rgba.data(), rgba.get_bytes(), cudaMemcpyDeviceToDevice, stream));
-		CUDA_CHECK_THROW(cudaMemcpyAsync(payload.data(), other.payload.data(), payload.get_bytes(), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(rgba, other.rgba, size * sizeof(Eigen::Array4f), cudaMemcpyDeviceToDevice, stream));
+		CUDA_CHECK_THROW(cudaMemcpyAsync(payload, other.payload, size * sizeof(NerfPayload), cudaMemcpyDeviceToDevice, stream));
 	}
 #endif
-	tcnn::GPUMemory<Eigen::Array4f> rgba;
-	tcnn::GPUMemory<NerfPayload> payload;
+
+	void set(Eigen::Array4f* _rgba, NerfPayload* _payload, size_t _size) {
+		rgba = _rgba;
+		payload = _payload;
+		size = _size;
+	}
+
+	Eigen::Array4f* rgba;
+	NerfPayload* payload;
+	size_t size;
 };
 
 

diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
@@ -75,7 +75,8 @@ class Testbed {
 	public:
 		SphereTracer() : m_hit_counter(1), m_alive_counter(1) {}
 
-		void init_rays_from_camera(uint32_t spp,
+		void init_rays_from_camera(
+			uint32_t spp,
 			const Eigen::Vector2i& resolution,
 			const Eigen::Vector2f& focal_length,
 			const Eigen::Matrix<float, 3, 4>& camera_matrix,
@@ -88,7 +89,8 @@ class Testbed {
 			const float* envmap_data,
 			const Eigen::Vector2i& envmap_resolution,
 			Eigen::Array4f* frame_buffer,
-			const TriangleOctree* octree, cudaStream_t stream);
+			const TriangleOctree* octree, cudaStream_t stream
+		);
 
 		void init_rays_from_data(uint32_t n_elements, const RaysSdfSoa& data, cudaStream_t stream);
 		uint32_t trace_bvh(TriangleBvh* bvh, const Triangle* triangles, cudaStream_t stream);
@@ -113,7 +115,9 @@ class Testbed {
 	public:
 		NerfTracer() : m_hit_counter(1), m_alive_counter(1) {}
 
-		void init_rays_from_camera(uint32_t spp,
+		void init_rays_from_camera(
+			tcnn::GPUMemory<char>& scratch_memory,
+			uint32_t spp,
 			uint32_t padded_output_width,
 			const Eigen::Vector2i& resolution,
 			const Eigen::Vector2f& focal_length,
@@ -158,16 +162,16 @@ class Testbed {
 			cudaStream_t stream
 		);
 
-		void enlarge(size_t n_elements, uint32_t padded_output_width);
+		void enlarge(tcnn::GPUMemory<char>& scratch_memory, size_t n_elements, uint32_t padded_output_width);
 		RaysNerfSoa& rays_hit() { return m_rays_hit; }
 		RaysNerfSoa& rays_init() { return m_rays[0]; }
 		uint32_t n_rays_initialized() const { return m_n_rays_initialized; }
 
 	private:
 		RaysNerfSoa m_rays[2];
 		RaysNerfSoa m_rays_hit;
-		tcnn::GPUMemory<precision_t> m_network_output;
-		tcnn::GPUMemory<NerfCoordinate> m_network_input;
+		precision_t* m_network_output;
+		NerfCoordinate* m_network_input;
 		tcnn::GPUMemory<uint32_t> m_hit_counter;
 		tcnn::GPUMemory<uint32_t> m_alive_counter;
 		uint32_t m_n_rays_initialized = 0;
@@ -421,6 +425,8 @@ class Testbed {
 	std::vector<CudaRenderBuffer> m_render_surfaces;
 	std::unique_ptr<CudaRenderBuffer> m_pip_render_surface;
 
+	tcnn::GPUMemory<char> m_scratch_gpu_memory;
+
 	struct Nerf {
 		NerfTracer tracer;
 
@@ -464,21 +470,10 @@ class Testbed {
 			std::vector<RotationAdamOptimizer> cam_rot_offset;
 			AdamOptimizer<Eigen::Vector2f> cam_focal_length_offset = AdamOptimizer<Eigen::Vector2f>(0.f);
 
-			tcnn::GPUMemory<uint32_t> ray_indices;
-			tcnn::GPUMemory<Ray> rays;
-			tcnn::GPUMemory<uint32_t> numsteps; // number of steps each ray took
 			tcnn::GPUMemory<uint32_t> numsteps_counter; // number of steps each ray took
 			tcnn::GPUMemory<uint32_t> numsteps_counter_compacted; // number of steps each ray took
 			tcnn::GPUMemory<uint32_t> ray_counter;
-			tcnn::GPUMemory<NerfCoordinate> coords;
-			tcnn::GPUMemory<NerfCoordinate> coords_compacted;
-			tcnn::GPUMemory<NerfCoordinate> coords_gradient;
-			tcnn::GPUMemory<precision_t> mlp_out; // space for mlp to output into - half, padded output size
-			tcnn::GPUMemory<precision_t> mlp_out_trimmed;
-			tcnn::GPUMemory<precision_t> dloss_dmlp_out; // space for loss gradients - padded_output_width
 			tcnn::GPUMemory<float> loss;
-			tcnn::GPUMemory<float> max_level;
-			tcnn::GPUMemory<float> max_level_compacted;
 
 			uint32_t rays_per_batch = 1<<12;
 			uint32_t n_rays_total = 0;
@@ -515,11 +510,8 @@ class Testbed {
 		} training = {};
 
 		tcnn::GPUMemory<float> density_grid; // NERF_GRIDSIZE()^3 grid of EMA smoothed densities from the network
-		tcnn::GPUMemory<NerfPosition> density_grid_positions;
-		tcnn::GPUMemory<uint32_t> density_grid_indices;
 		tcnn::GPUMemory<uint8_t> density_grid_bitfield;
 		uint8_t* get_density_grid_bitfield_mip(uint32_t mip);
-		tcnn::GPUMemory<float> density_grid_tmp;
 		tcnn::GPUMemory<float> density_grid_mean;
 		uint32_t density_grid_ema_step = 0;
 

diff --git a/src/marching_cubes.cu b/src/marching_cubes.cu
@@ -743,15 +743,15 @@ void compute_mesh_opt_gradients(float thresh,
 	);
 }
 
-void marching_cubes_gpu(BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float> &density, tcnn::GPUMemory<Eigen::Vector3f>& verts_out, tcnn::GPUMemory<uint32_t>& indices_out) {
+void marching_cubes_gpu(GPUMemory<char>& scratch_memory, BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Eigen::Vector3f>& verts_out, tcnn::GPUMemory<uint32_t>& indices_out) {
 	GPUMemory<uint32_t> counters;
 
 	counters.enlarge(4);
 	counters.memset(0);
 
-	GPUMemory<int> vertex_grid;
-	vertex_grid.enlarge(res_3d.x()*res_3d.y()*res_3d.z()*3);
-	vertex_grid.memset(-1);
+	scratch_memory.enlarge(res_3d.x() * res_3d.y() * res_3d.z() * 3 * sizeof(int));
+	scratch_memory.memset(-1);
+	int* vertex_grid = (int*)scratch_memory.data();
 
 	const dim3 threads = { 4, 4, 4 };
 	const dim3 blocks = { div_round_up((uint32_t)res_3d.x(), threads.x), div_round_up((uint32_t)res_3d.y(), threads.y), div_round_up((uint32_t)res_3d.z(), threads.z) };
@@ -767,8 +767,8 @@ void marching_cubes_gpu(BoundingBox aabb, Eigen::Vector3i res_3d, float thresh,
 	verts_out.memset(0);
 	indices_out.resize(cpucounters[1]);
 	// actually generate verts
-	gen_vertices<<<blocks, threads, 0>>>(aabb, res_3d, density.data(), vertex_grid.data(), verts_out.data(), thresh, counters.data()+2);
-	gen_faces<<<blocks, threads, 0>>>(res_3d, density.data(), vertex_grid.data(), indices_out.data(), thresh, counters.data()+2);
+	gen_vertices<<<blocks, threads, 0>>>(aabb, res_3d, density.data(), vertex_grid, verts_out.data(), thresh, counters.data()+2);
+	gen_faces<<<blocks, threads, 0>>>(res_3d, density.data(), vertex_grid, indices_out.data(), thresh, counters.data()+2);
 }
 
 void save_mesh(