Skip to content

Commit

Permalink
Merge pull request #99 from NVlabs/reduce-nerf-memory
Browse files Browse the repository at this point in the history
Reduce NeRF memory usage by ~1gb
  • Loading branch information
Tom94 authored Jan 30, 2022
2 parents d6c7241 + 9836695 commit 16b6409
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 138 deletions.
2 changes: 1 addition & 1 deletion dependencies/tiny-cuda-nn
4 changes: 2 additions & 2 deletions include/neural-graphics-primitives/marching_cubes.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ NGP_NAMESPACE_BEGIN

Eigen::Vector3i get_marching_cubes_res(uint32_t res_1d, const BoundingBox &aabb);

void marching_cubes_gpu(BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float> &density, tcnn::GPUMemory<Eigen::Vector3f> &vert_out, tcnn::GPUMemory<uint32_t> &indices_out);
void marching_cubes_gpu(tcnn::GPUMemory<char>& scratch_memory, BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Eigen::Vector3f>& vert_out, tcnn::GPUMemory<uint32_t>& indices_out);

// computes the average of the 1ring of all verts, as homogenous coordinates
void compute_mesh_1ring(const tcnn::GPUMemory<Eigen::Vector3f> &verts, const tcnn::GPUMemory<uint32_t> &indices, tcnn::GPUMemory<Eigen::Vector4f> &output_pos, tcnn::GPUMemory<Eigen::Vector3f> &output_normals);
void compute_mesh_1ring(const tcnn::GPUMemory<Eigen::Vector3f>& verts, const tcnn::GPUMemory<uint32_t>& indices, tcnn::GPUMemory<Eigen::Vector4f>& output_pos, tcnn::GPUMemory<Eigen::Vector3f>& output_normals);

void compute_mesh_opt_gradients(float thresh,
const tcnn::GPUMemory<Eigen::Vector3f>& verts, const tcnn::GPUMemory<Eigen::Vector3f>& vert_normals,
Expand Down
20 changes: 12 additions & 8 deletions include/neural-graphics-primitives/nerf.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,22 @@ struct NerfPayload {
};

struct RaysNerfSoa {
void enlarge(size_t n_elements) {
rgba.enlarge(n_elements);
payload.enlarge(n_elements);
}
#ifdef __NVCC__
void copy_from_other_async(const RaysNerfSoa& other, cudaStream_t stream) {
CUDA_CHECK_THROW(cudaMemcpyAsync(rgba.data(), other.rgba.data(), rgba.get_bytes(), cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK_THROW(cudaMemcpyAsync(payload.data(), other.payload.data(), payload.get_bytes(), cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK_THROW(cudaMemcpyAsync(rgba, other.rgba, size * sizeof(Eigen::Array4f), cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK_THROW(cudaMemcpyAsync(payload, other.payload, size * sizeof(NerfPayload), cudaMemcpyDeviceToDevice, stream));
}
#endif
tcnn::GPUMemory<Eigen::Array4f> rgba;
tcnn::GPUMemory<NerfPayload> payload;

void set(Eigen::Array4f* _rgba, NerfPayload* _payload, size_t _size) {
rgba = _rgba;
payload = _payload;
size = _size;
}

Eigen::Array4f* rgba;
NerfPayload* payload;
size_t size;
};


Expand Down
32 changes: 12 additions & 20 deletions include/neural-graphics-primitives/testbed.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ class Testbed {
public:
SphereTracer() : m_hit_counter(1), m_alive_counter(1) {}

void init_rays_from_camera(uint32_t spp,
void init_rays_from_camera(
uint32_t spp,
const Eigen::Vector2i& resolution,
const Eigen::Vector2f& focal_length,
const Eigen::Matrix<float, 3, 4>& camera_matrix,
Expand All @@ -88,7 +89,8 @@ class Testbed {
const float* envmap_data,
const Eigen::Vector2i& envmap_resolution,
Eigen::Array4f* frame_buffer,
const TriangleOctree* octree, cudaStream_t stream);
const TriangleOctree* octree, cudaStream_t stream
);

void init_rays_from_data(uint32_t n_elements, const RaysSdfSoa& data, cudaStream_t stream);
uint32_t trace_bvh(TriangleBvh* bvh, const Triangle* triangles, cudaStream_t stream);
Expand All @@ -113,7 +115,9 @@ class Testbed {
public:
NerfTracer() : m_hit_counter(1), m_alive_counter(1) {}

void init_rays_from_camera(uint32_t spp,
void init_rays_from_camera(
tcnn::GPUMemory<char>& scratch_memory,
uint32_t spp,
uint32_t padded_output_width,
const Eigen::Vector2i& resolution,
const Eigen::Vector2f& focal_length,
Expand Down Expand Up @@ -158,16 +162,16 @@ class Testbed {
cudaStream_t stream
);

void enlarge(size_t n_elements, uint32_t padded_output_width);
void enlarge(tcnn::GPUMemory<char>& scratch_memory, size_t n_elements, uint32_t padded_output_width);
RaysNerfSoa& rays_hit() { return m_rays_hit; }
RaysNerfSoa& rays_init() { return m_rays[0]; }
uint32_t n_rays_initialized() const { return m_n_rays_initialized; }

private:
RaysNerfSoa m_rays[2];
RaysNerfSoa m_rays_hit;
tcnn::GPUMemory<precision_t> m_network_output;
tcnn::GPUMemory<NerfCoordinate> m_network_input;
precision_t* m_network_output;
NerfCoordinate* m_network_input;
tcnn::GPUMemory<uint32_t> m_hit_counter;
tcnn::GPUMemory<uint32_t> m_alive_counter;
uint32_t m_n_rays_initialized = 0;
Expand Down Expand Up @@ -421,6 +425,8 @@ class Testbed {
std::vector<CudaRenderBuffer> m_render_surfaces;
std::unique_ptr<CudaRenderBuffer> m_pip_render_surface;

tcnn::GPUMemory<char> m_scratch_gpu_memory;

struct Nerf {
NerfTracer tracer;

Expand Down Expand Up @@ -464,21 +470,10 @@ class Testbed {
std::vector<RotationAdamOptimizer> cam_rot_offset;
AdamOptimizer<Eigen::Vector2f> cam_focal_length_offset = AdamOptimizer<Eigen::Vector2f>(0.f);

tcnn::GPUMemory<uint32_t> ray_indices;
tcnn::GPUMemory<Ray> rays;
tcnn::GPUMemory<uint32_t> numsteps; // number of steps each ray took
tcnn::GPUMemory<uint32_t> numsteps_counter; // number of steps each ray took
tcnn::GPUMemory<uint32_t> numsteps_counter_compacted; // number of steps each ray took
tcnn::GPUMemory<uint32_t> ray_counter;
tcnn::GPUMemory<NerfCoordinate> coords;
tcnn::GPUMemory<NerfCoordinate> coords_compacted;
tcnn::GPUMemory<NerfCoordinate> coords_gradient;
tcnn::GPUMemory<precision_t> mlp_out; // space for mlp to output into - half, padded output size
tcnn::GPUMemory<precision_t> mlp_out_trimmed;
tcnn::GPUMemory<precision_t> dloss_dmlp_out; // space for loss gradients - padded_output_width
tcnn::GPUMemory<float> loss;
tcnn::GPUMemory<float> max_level;
tcnn::GPUMemory<float> max_level_compacted;

uint32_t rays_per_batch = 1<<12;
uint32_t n_rays_total = 0;
Expand Down Expand Up @@ -515,11 +510,8 @@ class Testbed {
} training = {};

tcnn::GPUMemory<float> density_grid; // NERF_GRIDSIZE()^3 grid of EMA smoothed densities from the network
tcnn::GPUMemory<NerfPosition> density_grid_positions;
tcnn::GPUMemory<uint32_t> density_grid_indices;
tcnn::GPUMemory<uint8_t> density_grid_bitfield;
uint8_t* get_density_grid_bitfield_mip(uint32_t mip);
tcnn::GPUMemory<float> density_grid_tmp;
tcnn::GPUMemory<float> density_grid_mean;
uint32_t density_grid_ema_step = 0;

Expand Down
12 changes: 6 additions & 6 deletions src/marching_cubes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -743,15 +743,15 @@ void compute_mesh_opt_gradients(float thresh,
);
}

void marching_cubes_gpu(BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float> &density, tcnn::GPUMemory<Eigen::Vector3f>& verts_out, tcnn::GPUMemory<uint32_t>& indices_out) {
void marching_cubes_gpu(GPUMemory<char>& scratch_memory, BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Eigen::Vector3f>& verts_out, tcnn::GPUMemory<uint32_t>& indices_out) {
GPUMemory<uint32_t> counters;

counters.enlarge(4);
counters.memset(0);

GPUMemory<int> vertex_grid;
vertex_grid.enlarge(res_3d.x()*res_3d.y()*res_3d.z()*3);
vertex_grid.memset(-1);
scratch_memory.enlarge(res_3d.x() * res_3d.y() * res_3d.z() * 3 * sizeof(int));
scratch_memory.memset(-1);
int* vertex_grid = (int*)scratch_memory.data();

const dim3 threads = { 4, 4, 4 };
const dim3 blocks = { div_round_up((uint32_t)res_3d.x(), threads.x), div_round_up((uint32_t)res_3d.y(), threads.y), div_round_up((uint32_t)res_3d.z(), threads.z) };
Expand All @@ -767,8 +767,8 @@ void marching_cubes_gpu(BoundingBox aabb, Eigen::Vector3i res_3d, float thresh,
verts_out.memset(0);
indices_out.resize(cpucounters[1]);
// actually generate verts
gen_vertices<<<blocks, threads, 0>>>(aabb, res_3d, density.data(), vertex_grid.data(), verts_out.data(), thresh, counters.data()+2);
gen_faces<<<blocks, threads, 0>>>(res_3d, density.data(), vertex_grid.data(), indices_out.data(), thresh, counters.data()+2);
gen_vertices<<<blocks, threads, 0>>>(aabb, res_3d, density.data(), vertex_grid, verts_out.data(), thresh, counters.data()+2);
gen_faces<<<blocks, threads, 0>>>(res_3d, density.data(), vertex_grid, indices_out.data(), thresh, counters.data()+2);
}

void save_mesh(
Expand Down
Loading

0 comments on commit 16b6409

Please sign in to comment.