From 299afdfc0ab300d8518bd489feae07eb4e251328 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20M=C3=BCller?=
Date: Thu, 10 Feb 2022 18:59:47 +0100
Subject: [PATCH] Significant memory reduction through TCNN's new arena

---
 dependencies/tiny-cuda-nn                     |   2 +-
 .../marching_cubes.h                          |   2 +-
 .../neural-graphics-primitives/nerf_network.h | 309 ++++++------------
 include/neural-graphics-primitives/testbed.h  |  10 +-
 src/marching_cubes.cu                         |  10 +-
 src/testbed_nerf.cu                           |  32 +-
 6 files changed, 138 insertions(+), 227 deletions(-)

diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn
index 57fcd3310..4faeca656 160000
--- a/dependencies/tiny-cuda-nn
+++ b/dependencies/tiny-cuda-nn
@@ -1 +1 @@
-Subproject commit 57fcd33109e11e1e62ea3cdf731c22f8417fd0aa
+Subproject commit 4faeca656aaa294a5d768f2cbf5096fa39ca91b6
diff --git a/include/neural-graphics-primitives/marching_cubes.h b/include/neural-graphics-primitives/marching_cubes.h
index 4804d9d86..f935fc14f 100644
--- a/include/neural-graphics-primitives/marching_cubes.h
+++ b/include/neural-graphics-primitives/marching_cubes.h
@@ -20,7 +20,7 @@ NGP_NAMESPACE_BEGIN
 
 Eigen::Vector3i get_marching_cubes_res(uint32_t res_1d, const BoundingBox &aabb);
 
-void marching_cubes_gpu(tcnn::GPUMemory<char>& scratch_memory, BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Eigen::Vector3f>& vert_out, tcnn::GPUMemory<uint32_t>& indices_out);
+void marching_cubes_gpu(cudaStream_t stream, BoundingBox aabb, Eigen::Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Eigen::Vector3f>& vert_out, tcnn::GPUMemory<uint32_t>& indices_out);
 
 // computes the average of the 1ring of all verts, as homogeneous coordinates
 void compute_mesh_1ring(const tcnn::GPUMemory<Eigen::Vector3f>& verts, const tcnn::GPUMemory<uint32_t>& indices, tcnn::GPUMemory<Eigen::Vector4f>& output_pos, tcnn::GPUMemory<Eigen::Vector3f>& output_normals);
diff --git a/include/neural-graphics-primitives/nerf_network.h b/include/neural-graphics-primitives/nerf_network.h
index a165ad279..5ab24b6bb 100644
--- a/include/neural-graphics-primitives/nerf_network.h
+++ b/include/neural-graphics-primitives/nerf_network.h
@@ -173,10 +173,6 @@ class NerfNetwork : public tcnn::Network<T> {
 			m_pos_encoding->set_output_layout(tcnn::RM);
 		}
 
-		m_inference_density_network_input.set_layout(m_pos_encoding->output_layout());
-		m_forward_density_network_input.set_layout(m_pos_encoding->output_layout());
-		m_backward_dL_ddensity_network_input.set_layout(m_pos_encoding->output_layout());
-
 		json local_density_network_config = density_network;
 		local_density_network_config["n_input_dims"] = m_pos_encoding->num_encoded_dims();
 		if (!density_network.contains("n_output_dims")) {
@@ -195,14 +191,9 @@ class NerfNetwork : public tcnn::Network<T> {
 	virtual ~NerfNetwork() { }
 
 	void inference(cudaStream_t stream, const tcnn::GPUMatrixDynamic<float>& input, tcnn::GPUMatrixDynamic<float>& output) override {
-		// Make sure our temporary buffers have the correct size for the given batch size
-		uint32_t batch_size = input.n();
-		if (m_inference_density_network_input.n() != batch_size) {
-			allocate_inference_buffers(batch_size);
-		}
-
-		inference_mixed_precision(stream, input, m_inference_network_output);
-		tcnn::linear_kernel(tcnn::cast_from<T>, 0, stream, m_inference_network_output.n_elements(), m_inference_network_output.data(), output.data());
+		tcnn::GPUMatrixDynamic<T> inference_network_output{4, input.n(), stream, output.layout()};
+		inference_mixed_precision(stream, input, inference_network_output);
+		tcnn::linear_kernel(tcnn::cast_from<T>, 0, stream, inference_network_output.n_elements(), inference_network_output.data(), output.data());
 	}
 
 	void inference_mixed_precision(cudaStream_t stream, const tcnn::GPUMatrixDynamic<float>& input, tcnn::GPUMatrixDynamic<T>& output, bool use_inference_matrices = true) override {
@@ -210,11 +201,11 @@ class NerfNetwork : public tcnn::Network<T> {
 			throw std::runtime_error("NerfNetwork::inference_mixed_precision input and output must be in column major format.");
 		}
 
-		// Make sure our temporary buffers have the correct size for the given batch size
 		uint32_t batch_size = input.n();
-		if (m_inference_density_network_input.n() != batch_size) {
-			allocate_inference_buffers(batch_size);
-		}
+		tcnn::GPUMatrixDynamic<T> density_network_input{m_pos_encoding->num_encoded_dims(), batch_size, stream, m_pos_encoding->output_layout()};
+		tcnn::GPUMatrix<T> density_network_output{m_density_network->padded_output_width(), batch_size, stream};
+		tcnn::GPUMatrix<T> rgb_network_input{m_rgb_network_input_width, batch_size, stream};
+		tcnn::GPUMatrix<T> rgb_network_output{m_rgb_network->padded_output_width(), batch_size, stream};
 
 		// Perform directional encoding and density network query in parallel
 		{
@@ -224,7 +215,7 @@ class NerfNetwork : public tcnn::Network<T> {
 				synced_streams.get(0),
 				batch_size,
 				{input.data(), input.m()},
-				{m_inference_density_network_input.data(), m_inference_density_network_input.m()},
+				{density_network_input.data(), density_network_input.m()},
 				nullptr,
 				use_inference_matrices
 			);
@@ -232,41 +223,37 @@ class NerfNetwork : public tcnn::Network<T> {
 				synced_streams.get(1),
 				batch_size,
 				{input.data() + m_dir_offset, input.m()},
-				{m_inference_rgb_network_input.data() + m_inference_density_network_output.m(), m_inference_rgb_network_input.m()},
+				{rgb_network_input.data() + m_density_network->padded_output_width(), rgb_network_input.m()},
 				nullptr,
 				use_inference_matrices
 			);
 
-			m_density_network->inference_mixed_precision(synced_streams.get(0), m_inference_density_network_input, m_inference_density_network_output, use_inference_matrices);
+			m_density_network->inference_mixed_precision(synced_streams.get(0), density_network_input, density_network_output, use_inference_matrices);
 
 			tcnn::linear_kernel(grab_density_network_output<T>, 0, synced_streams.get(0),
-				m_inference_density_network_output.n_elements(), m_inference_density_network_output.m(), m_inference_rgb_network_input.m() /* stride */, m_inference_density_network_output.data(), m_inference_rgb_network_input.data()
+				density_network_output.n_elements(), density_network_output.m(), rgb_network_input.m() /* stride */, density_network_output.data(), rgb_network_input.data()
 			);
 		}
 
-		m_rgb_network->inference_mixed_precision(stream, m_inference_rgb_network_input, m_inference_rgb_network_output, use_inference_matrices);
+		m_rgb_network->inference_mixed_precision(stream, rgb_network_input, rgb_network_output, use_inference_matrices);
 
 		tcnn::linear_kernel(assemble_rgbd<T>, 0, stream,
-			output.n_elements(), m_inference_density_network_output.m(), m_inference_rgb_network_output.m(), m_inference_density_network_output.data(), m_inference_rgb_network_output.data(), output.data()
+			output.n_elements(), density_network_output.m(), rgb_network_output.m(), density_network_output.data(), rgb_network_output.data(), output.data()
 		);
 	}
 
 	void density(cudaStream_t stream, const tcnn::PitchedPtr<NerfPosition>& input, tcnn::GPUMatrixDynamic<T>& output, bool use_inference_matrices = true) {
-		// Make sure our temporary buffers have the correct size for the given batch size
 		uint32_t batch_size = output.n();
-		if (m_inference_density_network_input.n() != batch_size) {
-			allocate_inference_buffers(batch_size);
-		}
-
+		tcnn::GPUMatrixDynamic<T> density_network_input{m_pos_encoding->num_encoded_dims(), batch_size, stream, m_pos_encoding->output_layout()};
 		m_pos_encoding->encode(
 			stream,
 			batch_size,
 			input,
-			{m_inference_density_network_input.data(), m_inference_density_network_input.m()},
+			{density_network_input.data(), density_network_input.m()},
 			nullptr,
 			use_inference_matrices
 		);
-		m_density_network->inference_mixed_precision(stream, m_inference_density_network_input, output, use_inference_matrices);
+		m_density_network->inference_mixed_precision(stream, density_network_input, output, use_inference_matrices);
 	}
 
 	void density(cudaStream_t stream, const tcnn::GPUMatrixDynamic<float>& input, tcnn::GPUMatrixDynamic<T>& output, bool use_inference_matrices = true) {
@@ -292,47 +279,53 @@ class NerfNetwork : public tcnn::Network<T> {
 
 		// Make sure our temporary buffers have the correct size for the given batch size
 		uint32_t batch_size = input.n();
-		if (m_forward_density_network_input.n() != batch_size) {
-			allocate_forward_buffers(batch_size);
-		}
 
-		// Perform directional encoding and density network query in parallel
-		{
-			tcnn::SyncedMultiStream synced_streams{stream, 2};
+		m_forward.density_network_input = tcnn::GPUMatrixDynamic<T>{m_pos_encoding->num_encoded_dims(), batch_size, stream, m_pos_encoding->output_layout()};
+		m_forward.rgb_network_input = tcnn::GPUMatrix<T>{m_rgb_network_input_width, batch_size, stream};
 
-			m_pos_encoding->encode(
-				synced_streams.get(0),
-				batch_size,
-				{input.data(), input.m()},
-				{m_forward_density_network_input.data(), m_forward_density_network_input.m()},
-				prepare_input_gradients ? m_forward_pos_encoding_forward_gradient.data() : nullptr,
-				use_inference_matrices
-			);
-			m_dir_encoding->encode(
-				synced_streams.get(1),
-				batch_size,
-				{input.data() + m_dir_offset, input.m()},
-				{m_forward_rgb_network_input.data() + m_forward_density_network_output.m(), m_forward_rgb_network_input.m()},
-				prepare_input_gradients ? m_forward_dir_encoding_forward_gradient.data() : nullptr,
-				use_inference_matrices
-			);
+		if (prepare_input_gradients) {
+			m_forward.pos_encoding_forward_gradient = tcnn::GPUMatrix<float>{m_pos_encoding->num_forward_gradient_dims(), batch_size, stream};
+			m_forward.dir_encoding_forward_gradient = tcnn::GPUMatrix<float>{m_dir_encoding->num_forward_gradient_dims(), batch_size, stream};
+		}
 
-			m_density_network->forward(synced_streams.get(0), m_forward_density_network_input, &m_forward_density_network_output, use_inference_matrices, prepare_input_gradients);
+		m_pos_encoding->encode(
+			stream,
+			batch_size,
+			{input.data(), input.m()},
+			{m_forward.density_network_input.data(), m_forward.density_network_input.m()},
+			prepare_input_gradients ? m_forward.pos_encoding_forward_gradient.data() : nullptr,
+			use_inference_matrices
+		);
+		m_dir_encoding->encode(
+			stream,
+			batch_size,
+			{input.data() + m_dir_offset, input.m()},
+			{m_forward.rgb_network_input.data() + m_density_network->padded_output_width(), m_forward.rgb_network_input.m()},
+			prepare_input_gradients ? m_forward.dir_encoding_forward_gradient.data() : nullptr,
+			use_inference_matrices
+		);
 
-			tcnn::linear_kernel(grab_density_network_output<T>, 0, synced_streams.get(0),
-				m_forward_density_network_output.n_elements(), m_forward_density_network_output.m(), m_forward_rgb_network_input.m() /* stride */, m_forward_density_network_output.data(), m_forward_rgb_network_input.data()
-			);
-		}
+		m_forward.density_network_output = tcnn::GPUMatrix<T>{m_density_network->padded_output_width(), batch_size, stream};
+		m_density_network->forward(stream, m_forward.density_network_input, &m_forward.density_network_output, use_inference_matrices, prepare_input_gradients);
 
-		m_rgb_network->forward(stream, m_forward_rgb_network_input, &m_forward_rgb_network_output, use_inference_matrices, prepare_input_gradients);
+		tcnn::linear_kernel(grab_density_network_output<T>, 0, stream,
+			m_forward.density_network_output.n_elements(), m_forward.density_network_output.m(), m_forward.rgb_network_input.m() /* stride */, m_forward.density_network_output.data(), m_forward.rgb_network_input.data()
+		);
+
+		m_forward.rgb_network_output = tcnn::GPUMatrix<T>{m_rgb_network->padded_output_width(), batch_size, stream};
+		m_rgb_network->forward(stream, m_forward.rgb_network_input, &m_forward.rgb_network_output, use_inference_matrices, prepare_input_gradients);
 
 		if (output) {
 			tcnn::linear_kernel(assemble_rgbd<T>, 0, stream,
-				output->n_elements(), m_forward_density_network_output.m(), m_forward_rgb_network_output.m(), m_forward_density_network_output.data(), m_forward_rgb_network_output.data(), output->data()
+				output->n_elements(), m_forward.density_network_output.m(), m_forward.rgb_network_output.m(), m_forward.density_network_output.data(), m_forward.rgb_network_output.data(), output->data()
 			);
 		}
 	}
 
+	void forward_clear() override {
+		m_forward.clear();
+	}
+
 	void backward(
 		cudaStream_t stream,
 		const tcnn::GPUMatrixDynamic<float>& input,
@@ -348,62 +341,65 @@ class NerfNetwork : public tcnn::Network<T> {
 
 		// Make sure our temporary buffers have the correct size for the given batch size
 		uint32_t batch_size = input.n();
-		if (m_backward_dL_ddensity_network_output.n() != batch_size) {
-			allocate_backward_buffers(batch_size);
-		}
+		tcnn::GPUMatrix<T> dL_drgb{m_rgb_network->padded_output_width(), batch_size, stream};
+		CUDA_CHECK_THROW(cudaMemsetAsync(dL_drgb.data(), 0, dL_drgb.n_bytes(), stream));
 
 		tcnn::linear_kernel(extract_rgb<T>, 0, stream,
-			batch_size*3, m_backward_dL_drgb.m(), dL_doutput.data(), m_backward_dL_drgb.data()
+			batch_size*3, dL_drgb.m(), dL_doutput.data(), dL_drgb.data()
 		);
 
-		m_rgb_network->backward(stream, m_forward_rgb_network_input, m_forward_rgb_network_output, m_backward_dL_drgb, &m_backward_dL_drgb_network_input, use_inference_matrices, compute_param_gradients);
+		tcnn::GPUMatrix<T> dL_drgb_network_input{m_rgb_network_input_width, batch_size, stream};
+		m_rgb_network->backward(stream, m_forward.rgb_network_input, m_forward.rgb_network_output, dL_drgb, &dL_drgb_network_input, use_inference_matrices, compute_param_gradients);
 
 		// Backprop through dir encoding if it is trainable or if we need input gradients
-		tcnn::SyncedMultiStream synced_streams{stream, 2};
 		if (m_dir_encoding->n_params() > 0 || dL_dinput) {
-			tcnn::linear_kernel(extract_dir_gradient<T>, 0, synced_streams.get(1),
-				m_backward_dL_ddir_encoding_output.n_elements(), m_backward_dL_ddensity_network_output.m(), m_backward_dL_ddir_encoding_output.m(), m_backward_dL_drgb_network_input.m(), m_backward_dL_drgb_network_input.data(), m_backward_dL_ddir_encoding_output.data()
+			tcnn::GPUMatrix<T> dL_ddir_encoding_output{m_dir_encoding->num_encoded_dims(), batch_size, stream};
+
+			tcnn::linear_kernel(extract_dir_gradient<T>, 0, stream,
+				dL_ddir_encoding_output.n_elements(), m_density_network->padded_output_width(), dL_ddir_encoding_output.m(), dL_drgb_network_input.m(), dL_drgb_network_input.data(), dL_ddir_encoding_output.data()
 			);
 
 			m_dir_encoding->backward(
-				synced_streams.get(1),
+				stream,
 				batch_size,
-				{m_backward_dL_ddir_encoding_output.data(), m_backward_dL_ddir_encoding_output.m()},
-				dL_dinput ? m_forward_dir_encoding_forward_gradient.data() : nullptr,
+				{dL_ddir_encoding_output.data(), dL_ddir_encoding_output.m()},
+				dL_dinput ? m_forward.dir_encoding_forward_gradient.data() : nullptr,
 				dL_dinput ? tcnn::PitchedPtr<float>{dL_dinput->data() + m_dir_offset, dL_dinput->m()} : tcnn::PitchedPtr<float>{},
 				{input.data() + m_dir_offset, input.m()}
 			);
 		}
 
-		tcnn::linear_kernel(extract_density_gradient<T>, 0, synced_streams.get(0),
-			m_backward_dL_ddensity_network_output.n_elements(),
-			m_backward_dL_ddensity_network_output.m(),
-			m_backward_dL_drgb_network_input.m(),
+		tcnn::GPUMatrix<T> dL_ddensity_network_output{m_density_network->padded_output_width(), batch_size, stream};
+		tcnn::linear_kernel(extract_density_gradient<T>, 0, stream,
+			dL_ddensity_network_output.n_elements(),
+			dL_ddensity_network_output.m(),
+			dL_drgb_network_input.m(),
 			dL_doutput.data(),
-			m_forward_density_network_output.data(),
-			m_backward_dL_drgb_network_input.data(),
-			m_backward_dL_ddensity_network_output.data()
+			m_forward.density_network_output.data(),
+			dL_drgb_network_input.data(),
+			dL_ddensity_network_output.data()
 		);
 
-		tcnn::GPUMatrixDynamic<T>* dL_ddensity_network_input = nullptr;
+		tcnn::GPUMatrixDynamic<T> dL_ddensity_network_input;
 		if (m_pos_encoding->n_params() > 0 || dL_dinput) {
-			dL_ddensity_network_input = &m_backward_dL_ddensity_network_input;
+			dL_ddensity_network_input = tcnn::GPUMatrixDynamic<T>{m_pos_encoding->num_encoded_dims(), batch_size, stream, m_pos_encoding->output_layout()};
 		}
 
-		m_density_network->backward(synced_streams.get(0), m_forward_density_network_input, m_forward_density_network_output, m_backward_dL_ddensity_network_output, dL_ddensity_network_input, use_inference_matrices, compute_param_gradients);
+		m_density_network->backward(stream, m_forward.density_network_input, m_forward.density_network_output, dL_ddensity_network_output, dL_ddensity_network_input.data() ? &dL_ddensity_network_input : nullptr, use_inference_matrices, compute_param_gradients);
 
 		// Backprop through pos encoding if it is trainable or if we need input gradients
-		if (dL_ddensity_network_input) {
+		if (dL_ddensity_network_input.data()) {
 			m_pos_encoding->backward(
-				synced_streams.get(0),
+				stream,
 				batch_size,
-				{dL_ddensity_network_input->data(), dL_ddensity_network_input->m()},
-				dL_dinput ? m_forward_pos_encoding_forward_gradient.data() : nullptr,
+				{dL_ddensity_network_input.data(), dL_ddensity_network_input.m()},
+				dL_dinput ? m_forward.pos_encoding_forward_gradient.data() : nullptr,
 				dL_dinput ? tcnn::PitchedPtr<float>{dL_dinput->data(), dL_dinput->m()} : tcnn::PitchedPtr<float>{},
 				{input.data(), input.m()}
 			);
 		}
+
+		forward_clear();
 	}
 
 	void set_params(T* params, T* inference_params, T* backward_params, T* gradients) override {
@@ -513,11 +509,11 @@ class NerfNetwork : public tcnn::Network<T> {
 
 	uint32_t width(uint32_t layer) const override {
 		if (layer == 0) {
-			return m_forward_density_network_input.m();
+			return m_forward.density_network_input.m();
 		} else if (layer < m_density_network->num_forward_activations() + 1) {
 			return m_density_network->width(layer - 1);
 		} else if (layer == m_density_network->num_forward_activations() + 1) {
-			return m_forward_rgb_network_input.m();
+			return m_forward.rgb_network_input.m();
 		} else {
 			return m_rgb_network->width(layer - 2 - m_density_network->num_forward_activations());
 		}
@@ -528,12 +524,16 @@ class NerfNetwork : public tcnn::Network<T> {
 	}
 
 	const T* forward_activations(uint32_t layer) const override {
+		if (!m_forward.density_network_input.data()) {
+			throw std::runtime_error{"Must call forward() before accessing activations."};
+		}
+
 		if (layer == 0) {
-			return m_forward_density_network_input.data();
+			return m_forward.density_network_input.data();
 		} else if (layer < m_density_network->num_forward_activations() + 1) {
 			return m_density_network->forward_activations(layer - 1);
 		} else if (layer == m_density_network->num_forward_activations() + 1) {
-			return m_forward_rgb_network_input.data();
+			return m_forward.rgb_network_input.data();
 		} else {
 			return m_rgb_network->forward_activations(layer - 2 - m_density_network->num_forward_activations());
 		}
@@ -547,94 +547,6 @@ class NerfNetwork : public tcnn::Network<T> {
 		return m_dir_encoding;
 	}
 
-private:
-	void allocate_inference_buffers(uint32_t batch_size) {
-		m_inference_density_network_input.set_size(m_pos_encoding->num_encoded_dims(), batch_size);
-		m_inference_density_network_output.set_size(m_density_network->padded_output_width(), batch_size);
-
-		m_inference_rgb_network_input.set_size(m_rgb_network_input_width, batch_size);
-		m_inference_rgb_network_output.set_size(m_rgb_network->padded_output_width(), batch_size);
-
-		m_inference_network_output.set_size(4, batch_size);
-
-		tcnn::GPUMatrixBase::allocate_shared_memory(
-			m_inference_buffer,
-			{
-				&m_inference_density_network_input,
-				&m_inference_density_network_output,
-
-				&m_inference_rgb_network_input,
-				&m_inference_rgb_network_output,
-
-				&m_inference_network_output,
-			}
-		);
-
-		CUDA_CHECK_THROW(cudaMemset(m_inference_rgb_network_input.data(), 0, m_inference_rgb_network_input.n_bytes()));
-	}
-
-	void allocate_forward_buffers(uint32_t batch_size) {
-		m_forward_density_network_input.set_size(m_pos_encoding->num_encoded_dims(), batch_size);
-		m_forward_density_network_output.set_size(m_density_network->padded_output_width(), batch_size);
-
-		m_forward_rgb_network_input.set_size(m_rgb_network_input_width, batch_size);
-		m_forward_rgb_network_output.set_size(m_rgb_network->padded_output_width(), batch_size);
-
-		m_forward_network_output.set_size(4, batch_size);
-
-		m_forward_pos_encoding_forward_gradient.set_size(m_pos_encoding->num_forward_gradient_dims(), batch_size);
-		m_forward_dir_encoding_forward_gradient.set_size(m_dir_encoding->num_forward_gradient_dims(), batch_size);
-
-		tcnn::GPUMatrixBase::allocate_shared_memory(
-			m_forward_buffer,
-			{
-				&m_forward_density_network_input,
-				&m_forward_density_network_output,
-
-				&m_forward_rgb_network_input,
-				&m_forward_rgb_network_output,
-
-				&m_forward_network_output,
-
-				&m_forward_pos_encoding_forward_gradient,
-				&m_forward_dir_encoding_forward_gradient,
-			}
-		);
-
-		CUDA_CHECK_THROW(cudaMemset(m_forward_rgb_network_input.data(), 0, m_forward_rgb_network_input.n_bytes()));
-	}
-
-	void allocate_backward_buffers(uint32_t batch_size) {
-		m_backward_dL_drgb.set_size(m_rgb_network->padded_output_width(), batch_size);
-
-		m_backward_dL_drgb_network_input.set_size(m_rgb_network_input_width, batch_size);
-		m_backward_dL_ddensity_network_output.set_size(m_density_network->padded_output_width(), batch_size);
-		m_backward_dL_ddir_encoding_output.set_size(m_dir_encoding->num_encoded_dims(), batch_size);
-
-		m_backward_dL_ddensity_network_input.set_size(m_pos_encoding->num_encoded_dims(), batch_size);
-
-		m_backward_dL_dpos.set_size(m_n_pos_dims, batch_size);
-		m_backward_dL_ddir.set_size(m_n_dir_dims, batch_size);
-
-		tcnn::GPUMatrixBase::allocate_shared_memory(
-			m_backward_buffer,
-			{
-				&m_backward_dL_drgb,
-
-				&m_backward_dL_drgb_network_input,
-				&m_backward_dL_ddensity_network_output,
-				&m_backward_dL_ddir_encoding_output,
-
-				&m_backward_dL_ddensity_network_input,
-
-				&m_backward_dL_dpos,
-				&m_backward_dL_ddir,
-			}
-		);
-
-		CUDA_CHECK_THROW(cudaMemset(m_backward_dL_drgb.data(), 0, m_backward_dL_drgb.n_bytes()));
-	}
-
 private:
 	std::unique_ptr<tcnn::Network<T>> m_density_network;
 	std::unique_ptr<tcnn::Network<T>> m_rgb_network;
@@ -646,33 +558,24 @@ class NerfNetwork : public tcnn::Network<T> {
 	uint32_t m_n_dir_dims;
 	uint32_t m_dir_offset;
 
-	// Temporary buffers to hold inference data
-	tcnn::GPUMemory<char> m_inference_buffer;
-	tcnn::GPUMatrixDynamic<T> m_inference_density_network_input;
-	tcnn::GPUMatrix<T> m_inference_density_network_output;
-	tcnn::GPUMatrix<T> m_inference_rgb_network_input;
-	tcnn::GPUMatrix<T> m_inference_rgb_network_output;
-	tcnn::GPUMatrix<T> m_inference_network_output;
-
-	// Temporary buffers to hold forward data
-	tcnn::GPUMemory<char> m_forward_buffer;
-	tcnn::GPUMatrixDynamic<T> m_forward_density_network_input;
-	tcnn::GPUMatrix<T> m_forward_density_network_output;
-	tcnn::GPUMatrix<T> m_forward_rgb_network_input;
-	tcnn::GPUMatrix<T> m_forward_rgb_network_output;
-	tcnn::GPUMatrix<T> m_forward_network_output; // Only needed when visualizing
-	tcnn::GPUMatrix<float> m_forward_pos_encoding_forward_gradient; // Only needed when computing input gradients
-	tcnn::GPUMatrix<float> m_forward_dir_encoding_forward_gradient; // Only needed when computing input gradients
-
-	// Temporary buffers to hold backward data
-	tcnn::GPUMemory<char> m_backward_buffer;
-	tcnn::GPUMatrix<T> m_backward_dL_drgb;
-	tcnn::GPUMatrix<T> m_backward_dL_drgb_network_input;
-	tcnn::GPUMatrix<T> m_backward_dL_ddensity_network_output;
-	tcnn::GPUMatrix<T> m_backward_dL_ddir_encoding_output; // Only needed when training the dir encoding or computing input gradients
-	tcnn::GPUMatrixDynamic<T> m_backward_dL_ddensity_network_input;
-	tcnn::GPUMatrix<float> m_backward_dL_dpos; // Only needed when computing input gradients
-	tcnn::GPUMatrix<float> m_backward_dL_ddir; // Only needed when computing input gradients
+	// Storage of forward pass data
+	struct {
+		tcnn::GPUMatrixDynamic<T> density_network_input;
+		tcnn::GPUMatrix<T> density_network_output;
+		tcnn::GPUMatrix<T> rgb_network_input;
+		tcnn::GPUMatrix<T> rgb_network_output;
+		tcnn::GPUMatrix<float> pos_encoding_forward_gradient; // Only needed when computing input gradients
+		tcnn::GPUMatrix<float> dir_encoding_forward_gradient; // Only needed when computing input gradients
+
+		void clear() {
+			density_network_input = tcnn::GPUMatrixDynamic<T>{};
+			density_network_output = tcnn::GPUMatrix<T>{};
+			rgb_network_input = tcnn::GPUMatrix<T>{};
+			rgb_network_output = tcnn::GPUMatrix<T>{};
+			pos_encoding_forward_gradient = tcnn::GPUMatrix<float>{};
+			dir_encoding_forward_gradient = tcnn::GPUMatrix<float>{};
+		}
+	} m_forward;
 };
 
 NGP_NAMESPACE_END
diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
index 7b8af6055..64dc2e1f5 100644
--- a/include/neural-graphics-primitives/testbed.h
+++ b/include/neural-graphics-primitives/testbed.h
@@ -113,7 +113,6 @@ class Testbed {
 		NerfTracer() : m_hit_counter(1), m_alive_counter(1) {}
 
 		void init_rays_from_camera(
-			tcnn::GPUMemory<char>& scratch_memory,
 			uint32_t spp,
 			uint32_t padded_output_width,
 			const Eigen::Vector2i& resolution,
@@ -159,11 +158,15 @@ class Testbed {
 			cudaStream_t stream
 		);
 
-		void enlarge(tcnn::GPUMemory<char>& scratch_memory, size_t n_elements, uint32_t padded_output_width);
+		void enlarge(size_t n_elements, uint32_t padded_output_width, cudaStream_t stream);
 		RaysNerfSoa& rays_hit() { return m_rays_hit; }
 		RaysNerfSoa& rays_init() { return m_rays[0]; }
 		uint32_t n_rays_initialized() const { return m_n_rays_initialized; }
 
+		void clear() {
+			m_scratch_alloc = {};
+		}
+
 	private:
 		RaysNerfSoa m_rays[2];
 		RaysNerfSoa m_rays_hit;
@@ -172,6 +175,7 @@ class Testbed {
 		tcnn::GPUMemory<uint32_t> m_hit_counter;
 		tcnn::GPUMemory<uint32_t> m_alive_counter;
 		uint32_t m_n_rays_initialized = 0;
+		tcnn::GPUMemoryArena::Allocation m_scratch_alloc;
 	};
 
 	class FiniteDifferenceNormalsApproximator {
@@ -423,8 +427,6 @@ class Testbed {
 	std::vector<CudaRenderBuffer> m_render_surfaces;
 	std::unique_ptr<CudaRenderBuffer> m_pip_render_surface;
 
-	tcnn::GPUMemory<char> m_scratch_gpu_memory;
-
 	struct Nerf {
 		NerfTracer tracer;
 
diff --git a/src/marching_cubes.cu b/src/marching_cubes.cu
index 508039cfd..9aa9c63e3 100644
--- a/src/marching_cubes.cu
+++ b/src/marching_cubes.cu
@@ -754,15 +754,17 @@ void compute_mesh_opt_gradients(float thresh,
 	);
 }
 
-void marching_cubes_gpu(GPUMemory<char>& scratch_memory, BoundingBox aabb, Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Vector3f>& verts_out, tcnn::GPUMemory<uint32_t>& indices_out) {
+void marching_cubes_gpu(cudaStream_t stream, BoundingBox aabb, Vector3i res_3d, float thresh, const tcnn::GPUMemory<float>& density, tcnn::GPUMemory<Vector3f>& verts_out, tcnn::GPUMemory<uint32_t>& indices_out) {
 	GPUMemory<int> counters;
 
 	counters.enlarge(4);
 	counters.memset(0);
 
-	scratch_memory.enlarge(res_3d.x() * res_3d.y() * res_3d.z() * 3 * sizeof(int));
-	scratch_memory.memset(-1);
-	int* vertex_grid = (int*)scratch_memory.data();
+	size_t n_bytes = res_3d.x() * res_3d.y() * res_3d.z() * 3 * sizeof(int);
+	auto workspace = allocate_workspace(stream, n_bytes);
+	CUDA_CHECK_THROW(cudaMemsetAsync(workspace.data(), -1, n_bytes, stream));
+
+	int* vertex_grid = (int*)workspace.data();
 
 	const dim3 threads = { 4, 4, 4 };
 	const dim3 blocks = { div_round_up((uint32_t)res_3d.x(), threads.x), div_round_up((uint32_t)res_3d.y(), threads.y), div_round_up((uint32_t)res_3d.z(), threads.z) };
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index 33d216a09..a769c2a3f 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -1743,7 +1743,6 @@ __global__ void safe_divide(const uint32_t num_elements, float* __restrict__ ino
 }
 
 void Testbed::NerfTracer::init_rays_from_camera(
-	GPUMemory<char>& scratch_memory,
 	uint32_t spp,
 	uint32_t padded_output_width,
 	const Vector2i& resolution,
@@ -1769,7 +1768,7 @@ void Testbed::NerfTracer::init_rays_from_camera(
 ) {
 	// Make sure we have enough memory reserved to render at the requested resolution
 	size_t n_pixels = (size_t)resolution.x() * resolution.y();
-	enlarge(scratch_memory, n_pixels, padded_output_width);
+	enlarge(n_pixels, padded_output_width, stream);
 
 	const dim3 threads = { 16, 8, 1 };
 	const dim3 blocks = { div_round_up((uint32_t)resolution.x(), threads.x), div_round_up((uint32_t)resolution.y(), threads.y), 1 };
@@ -1923,10 +1922,10 @@
 	return n_hit;
 }
 
-void Testbed::NerfTracer::enlarge(tcnn::GPUMemory<char>& scratch_memory, size_t n_elements, uint32_t padded_output_width) {
+void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_width, cudaStream_t stream) {
 	n_elements = next_multiple(n_elements, size_t(BATCH_SIZE_MULTIPLE)); // network inference rounds n_elements up to 256, and uses these arrays, so we must do so also.
 
-	auto scratch = scratch_memory.enlarge_and_distribute<
+	auto scratch = allocate_workspace_and_distribute<
 		Array4f, NerfPayload, // m_rays[0]
 		Array4f, NerfPayload, // m_rays[1]
 		Array4f, NerfPayload, // m_rays_hit
@@ -1934,6 +1933,7 @@ void Testbed::NerfTracer::enlarge(tcnn::GPUMemory<char>& scratch_memory, size_t
 		network_precision_t,
 		NerfCoordinate
 	>(
+		stream, &m_scratch_alloc,
 		n_elements, n_elements,
 		n_elements, n_elements,
 		n_elements, n_elements,
@@ -1950,9 +1950,6 @@ void Testbed::NerfTracer::enlarge(tcnn::GPUMemory<char>& scratch_memory, size_t
 
 void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_res, const Vector2f& focal_length, const Matrix<float, 3, 4>& camera_matrix0, const Matrix<float, 3, 4>& camera_matrix1, const Vector2f& screen_center, cudaStream_t stream) {
-	// Reserve the memory for max-res rendering to prevent stuttering
-	m_nerf.tracer.enlarge(m_scratch_gpu_memory, max_res.x() * max_res.y(), m_network->padded_output_width());
-
 	float plane_z = m_slice_plane_z + m_scale;
 	if (m_render_mode == ERenderMode::Slice) {
 		plane_z = -plane_z;
 	}
@@ -1960,8 +1957,11 @@
 
 	ERenderMode render_mode = m_visualized_dimension > -1 ? ERenderMode::EncodingVis : m_render_mode;
 
+	ScopeGuard tmp_memory_guard{[&]() {
+		m_nerf.tracer.clear();
+	}};
+
 	m_nerf.tracer.init_rays_from_camera(
-		m_scratch_gpu_memory,
 		render_buffer.spp(),
 		m_network->padded_output_width(),
 		render_buffer.resolution(),
@@ -2183,12 +2183,13 @@ void Testbed::update_density_grid_nerf(float decay, uint32_t n_uniform_density_g
 
 	const uint32_t padded_output_width = m_nerf_network->padded_density_output_width();
 
-	auto scratch = m_scratch_gpu_memory.enlarge_and_distribute<
+	GPUMemoryArena::Allocation alloc;
+	auto scratch = allocate_workspace_and_distribute<
 		NerfPosition, // positions at which the NN will be queried for density evaluation
 		uint32_t, // indices of corresponding density grid cells
 		float, // the resulting densities `density_grid_tmp` to be merged with the running estimate of the grid
 		network_precision_t // output of the MLP before being converted to densities.
-	>(n_density_grid_samples, n_elements, n_elements, n_density_grid_samples * padded_output_width);
+	>(stream, &alloc, n_density_grid_samples, n_elements, n_elements, n_density_grid_samples * padded_output_width);
 
 	NerfPosition* density_grid_positions = std::get<0>(scratch);
 	uint32_t* density_grid_indices = std::get<1>(scratch);
@@ -2501,7 +2502,8 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, uint32_t n_rays_per_ba
 
 	m_nerf.training.ray_counter.enlarge(1);
 
-	auto scratch = m_scratch_gpu_memory.enlarge_and_distribute<
+	GPUMemoryArena::Allocation alloc;
+	auto scratch = allocate_workspace_and_distribute<
 		uint32_t, // ray_indices
 		Ray, // rays
 		uint32_t, // numsteps
@@ -2514,6 +2516,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, uint32_t n_rays_per_ba
 		NerfCoordinate, // coords_gradient
 		float // max_level_compacted
 	>(
+		stream, &alloc,
 		n_rays_per_batch,
 		n_rays_per_batch,
 		n_rays_per_batch * 2,
@@ -2799,10 +2802,11 @@ GPUMemory<float> Testbed::get_density_on_grid(Vector3i res3d, const BoundingBox&
 
 	const uint32_t padded_output_width = nerf_mode ? m_nerf_network->padded_density_output_width() : m_network->padded_output_width();
 
-	auto scratch = m_scratch_gpu_memory.enlarge_and_distribute<
+	GPUMemoryArena::Allocation alloc;
+	auto scratch = allocate_workspace_and_distribute<
 		NerfPosition,
 		network_precision_t
-	>(n_elements, batch_size * padded_output_width);
+	>(m_inference_stream, &alloc, n_elements, batch_size * padded_output_width);
 
 	NerfPosition* positions = std::get<0>(scratch);
 	network_precision_t* mlp_out = std::get<1>(scratch);
@@ -2871,7 +2875,7 @@ int Testbed::marching_cubes(Vector3i res3d, const BoundingBox& aabb, float thres
 	res3d.z() = next_multiple((unsigned int)res3d.z(), 16u);
 
 	GPUMemory<float> density = get_density_on_grid(res3d, aabb);
-	marching_cubes_gpu(m_scratch_gpu_memory, m_render_aabb, res3d, thresh, density, m_mesh.verts, m_mesh.indices);
+	marching_cubes_gpu(m_inference_stream, m_render_aabb, res3d, thresh, density, m_mesh.verts, m_mesh.indices);
 
 	uint32_t n_verts = (uint32_t)m_mesh.verts.size();
 	m_mesh.verts_gradient.resize(n_verts);
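
Notes on the arena workflow

The patch replaces the persistent tcnn::GPUMemory<char> scratch buffers (grown to the high-water mark and never released) with tiny-cuda-nn's stream-ordered memory arena: each call requests a workspace via allocate_workspace_and_distribute<...>(stream, &alloc, counts...), which carves a single arena block into typed pointers, and the memory returns to the stream's arena as soon as the GPUMemoryArena::Allocation is destroyed or reset — hence NerfTracer::clear() and the ScopeGuard in Testbed::render_nerf. Below is a minimal sketch of that cycle; only allocate_workspace_and_distribute, allocate_workspace, and GPUMemoryArena::Allocation are taken from tiny-cuda-nn as used in this patch, while the function name, variable names, sizes, and the assumed header path are illustrative:

	#include <tiny-cuda-nn/gpu_memory.h> // assumed location of GPUMemoryArena and helpers
	#include <cuda_runtime.h>
	#include <tuple>

	// Hypothetical helper demonstrating the allocate/distribute/release cycle.
	void example_workspace_usage(cudaStream_t stream, size_t n_rays) {
		// `alloc` owns the arena block; the returned tuple holds typed
		// pointers carved out of that one contiguous workspace.
		tcnn::GPUMemoryArena::Allocation alloc;
		auto scratch = tcnn::allocate_workspace_and_distribute<
			float,   // e.g. per-ray depth
			uint32_t // e.g. per-ray hit flag
		>(stream, &alloc, n_rays, n_rays);

		float* depth = std::get<0>(scratch);
		uint32_t* hit = std::get<1>(scratch);

		// ... enqueue kernels on `stream` that read/write depth and hit ...

		// On scope exit, `alloc` hands the block back to the stream's arena,
		// so the next request can reuse it without a fresh cudaMalloc.
	}

Objects that must keep their workspace alive across calls store the Allocation as a member instead (NerfTracer::m_scratch_alloc) and release it explicitly, e.g. via clear(), once the result has been consumed.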