nerfstudio-project · tancik · Mar 10, 2023 · Mar 2, 2023 · Mar 2, 2023 · Mar 2, 2023
diff --git a/docs/developer_guides/pipelines/dataparsers.md b/docs/developer_guides/pipelines/dataparsers.md
@@ -67,10 +67,16 @@ class NerfstudioDataParserConfig(DataParserConfig):
     """How much to downscale images. If not set, images are chosen such that the max dimension is <1600px."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    orientation_method: Literal["pca", "up"] = "up"
+    orientation_method: Literal["pca", "up", "vertical", "none"] = "up"
     """The method to use for orientation."""
+    center_method: Literal["poses", "focus", "none"] = "poses"
+    """The method to use to center the poses."""
+    auto_scale_poses: bool = True
+    """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
     train_split_fraction: float = 0.9
     """The fraction of images to use for training. The remaining images are for eval."""
+    depth_unit_scale_factor: float = 1e-3
+    """Scales the depth values to meters. Default value is 0.001 for a millimeter to meter conversion."""
 
 @dataclass
 class Nerfstudio(DataParser):

diff --git a/nerfstudio/cameras/camera_utils.py b/nerfstudio/cameras/camera_utils.py
@@ -429,35 +429,89 @@ def rotation_matrix(a: TensorType[3], b: TensorType[3]) -> TensorType[3, 3]:
     return torch.eye(3) + skew_sym_mat + skew_sym_mat @ skew_sym_mat * ((1 - c) / (s**2 + 1e-8))
 
 
+def focus_of_attention(poses: TensorType["num_poses":..., 4, 4], initial_focus: TensorType[3]) -> TensorType[3]:
+    """Compute the focus of attention of a set of cameras. Only cameras
+    that have the focus of attention in front of them are considered.
+
+    Args:
+        poses: The camera poses whose focus of attention is computed.
+        initial_focus: The 3D point used to decide which cameras are initially active.
+
+    Returns:
+        The 3D position of the focus of attention.
+    """
+    # References to the same method in third-party code:
+    # https://github.com/google-research/multinerf/blob/1c8b1c552133cdb2de1c1f3c871b2813f6662265/internal/camera_utils.py#L145
+    # https://github.com/bmild/nerf/blob/18b8aebda6700ed659cb27a0c348b737a5f6ab60/load_llff.py#L197
+    active_directions = -poses[:, :3, 2:3]
+    active_origins = poses[:, :3, 3:4]
+    # initial value for testing if the focus_pt is in front or behind
+    focus_pt = initial_focus
+    # Prune cameras which have the current focus_pt behind them.
+    active = torch.sum(active_directions.squeeze(-1) * (focus_pt - active_origins.squeeze(-1)), dim=-1) > 0
+    done = False
+    # We need at least two active cameras, else fall back on the previous solution.
+    # This may be the "poses" solution if no cameras are active on first iteration, e.g.
+    # they are in an outward-looking configuration.
+    while torch.sum(active.int()) > 1 and not done:
+        active_directions = active_directions[active]
+        active_origins = active_origins[active]
+        # https://en.wikipedia.org/wiki/Line–line_intersection#In_more_than_two_dimensions
+        m = torch.eye(3) - active_directions * torch.transpose(active_directions, -2, -1)
+        mt_m = torch.transpose(m, -2, -1) @ m
+        focus_pt = torch.linalg.inv(mt_m.mean(0)) @ (mt_m @ active_origins).mean(0)[:, 0]
+        active = torch.sum(active_directions.squeeze(-1) * (focus_pt - active_origins.squeeze(-1)), dim=-1) > 0
+        if active.all():
+            # the set of active cameras did not change, so we're done.
+            done = True
+    return focus_pt
+
+
 def auto_orient_and_center_poses(
-    poses: TensorType["num_poses":..., 4, 4], method: Literal["pca", "up", "none"] = "up", center_poses: bool = True
+    poses: TensorType["num_poses":..., 4, 4],
+    method: Literal["pca", "up", "vertical", "none"] = "up",
+    center_method: Literal["poses", "focus", "none"] = "poses",
 ) -> Tuple[TensorType["num_poses":..., 3, 4], TensorType[4, 4]]:
     """Orients and centers the poses. We provide two methods for orientation: pca and up.
 
-    pca: Orient the poses so that the principal component of the points is aligned with the axes.
-        This method works well when all of the cameras are in the same plane.
+    pca: Orient the poses so that the principal directions of the camera centers are aligned
+        with the axes, Z corresponding to the smallest principal component.
+        This method works well when all of the cameras are in the same plane, for example when
+        images are taken using a mobile robot.
     up: Orient the poses so that the average up vector is aligned with the z axis.
         This method works well when images are not at arbitrary angles.
+    vertical: Orient the poses so that the Z 3D direction projects close to the
+        y axis in images. This method works better if cameras are not all
+        looking in the same 3D direction, which may happen in camera arrays or in LLFF.
 
+    There are two centering methods:
+    poses: The poses are centered around the origin.
+    focus: The origin is set to the focus of attention of all cameras (the
+        point closest to the cameras' optical axes). Recommended for inward-looking
+        camera configurations.
 
     Args:
         poses: The poses to orient.
         method: The method to use for orientation.
-        center_poses: If True, the poses are centered around the origin.
+        center_method: The method to use to center the poses.
 
     Returns:
         Tuple of the oriented poses and the transform matrix.
     """
 
-    translation = poses[..., :3, 3]
+    origins = poses[..., :3, 3]
 
-    mean_translation = torch.mean(translation, dim=0)
-    translation_diff = translation - mean_translation
+    mean_origin = torch.mean(origins, dim=0)
+    translation_diff = origins - mean_origin
 
-    if center_poses:
-        translation = mean_translation
+    if center_method == "poses":
+        translation = mean_origin
+    elif center_method == "focus":
+        translation = focus_of_attention(poses, mean_origin)
+    elif center_method == "none":
+        translation = torch.zeros_like(mean_origin)
     else:
-        translation = torch.zeros_like(mean_translation)
+        raise ValueError(f"Unknown value for center_method: {center_method}")
 
     if method == "pca":
         _, eigvec = torch.linalg.eigh(translation_diff.T @ translation_diff)
@@ -471,9 +525,41 @@ def auto_orient_and_center_poses(
 
         if oriented_poses.mean(axis=0)[2, 1] < 0:
             oriented_poses[:, 1:3] = -1 * oriented_poses[:, 1:3]
-    elif method == "up":
+    elif method in ("up", "vertical"):
         up = torch.mean(poses[:, :3, 1], dim=0)
         up = up / torch.linalg.norm(up)
+        if method == "vertical":
+            # If cameras are not all parallel (e.g. not in an LLFF configuration),
+            # we can find the 3D direction that most projects vertically in all
+            # cameras by minimizing ||Xu|| s.t. ||u||=1. This total least squares
+            # problem is solved by SVD.
+            x_axis_matrix = poses[:, :3, 0]
+            _, S, Vh = torch.linalg.svd(x_axis_matrix, full_matrices=False)
+            # Singular values are S_i=||Xv_i|| for each right singular vector v_i.
+            # ||S|| = sqrt(n) because the rows of X are all unit vectors and the v_i
+            # are an orthonormal basis.
+            # ||Xv_i|| = sqrt(sum(dot(x_axis_j,v_i)^2)), thus S_i/sqrt(n) is the
+            # RMS of cosines between x axes and v_i. If the second smallest singular
+            # value corresponds to an angle error less than 10° (cos(80°)=0.17),
+            # this is probably a degenerate camera configuration (typical values
+            # are around 5° average error for the true vertical). In this case,
+            # rather than taking the vector corresponding to the smallest singular
+            # value, we project the "up" vector on the plane spanned by the two
+            # best singular vectors. We could also just fall back to the "up"
+            # solution.
+            if S[1] > 0.17 * math.sqrt(poses.shape[0]):
+                # regular non-degenerate configuration
+                up_vertical = Vh[2, :]
+                # It may be pointing up or down. Use "up" to disambiguate the sign.
+                up = up_vertical if torch.dot(up_vertical, up) > 0 else -up_vertical
+            else:
+                # Degenerate configuration: project "up" on the plane spanned by
+                # the last two right singular vectors (which are orthogonal to the
+                # first). v_0 is a unit vector, no need to divide by its norm when
+                # projecting.
+                up = up - Vh[0, :] * torch.dot(up, Vh[0, :])
+                # re-normalize
+                up = up / torch.linalg.norm(up)
 
         rotation = rotation_matrix(up, torch.Tensor([0, 0, 1]))
         transform = torch.cat([rotation, rotation @ -translation[..., None]], dim=-1)
@@ -483,5 +569,7 @@ def auto_orient_and_center_poses(
         transform[:3, 3] = -translation
         transform = transform[:3, :]
         oriented_poses = transform @ poses
+    else:
+        raise ValueError(f"Unknown value for method: {method}")
 
     return oriented_poses, transform
diff --git a/nerfstudio/data/dataparsers/arkitscenes_dataparser.py b/nerfstudio/data/dataparsers/arkitscenes_dataparser.py
@@ -21,7 +21,9 @@
 import cv2
 import numpy as np
 import torch
+from typing_extensions import Literal
 
+from nerfstudio.cameras import camera_utils
 from nerfstudio.cameras.cameras import Cameras, CameraType
 from nerfstudio.data.dataparsers.base_dataparser import (
     DataParser,
@@ -70,11 +72,13 @@ class ARKitScenesDataParserConfig(DataParserConfig):
     """target class to instantiate"""
     data: Path = Path("data/ARKitScenes/3dod/Validation/41069021")
     """Path to ARKitScenes folder with densely extracted scenes."""
+    scale_factor: float = 1.0
+    """How much to scale the camera origins by."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    center_poses: bool = True
-    """Whether to center the poses."""
-    scale_poses: bool = True
+    center_method: Literal["poses", "focus", "none"] = "poses"
+    """The method to use to center the poses."""
+    auto_scale_poses: bool = True
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
     train_split_fraction: float = 0.9
     """The fraction of images to use for training. The remaining images are for eval."""
@@ -141,11 +145,19 @@ def _generate_dataparser_outputs(self, split="train"):
         poses = torch.from_numpy(np.stack(poses).astype(np.float32))
         intrinsics = torch.from_numpy(np.stack(intrinsics).astype(np.float32))
 
-        if self.config.center_poses:
-            poses[:, :3, 3] -= poses[:, :3, 3].mean(dim=0)
+        poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
+            poses,
+            method="none",
+            center_method=self.config.center_method,
+        )
+
+        # Scale poses
+        scale_factor = 1.0
+        if self.config.auto_scale_poses:
+            scale_factor /= float(torch.max(torch.abs(poses[:, :3, 3])))
+        scale_factor *= self.config.scale_factor
 
-        if self.config.scale_poses:
-            poses[:, :3, 3] /= poses[:, :3, 3].abs().max()
+        poses[:, :3, 3] *= scale_factor
 
         # Choose image_filenames and poses based on split, but after auto orient and scaling the poses.
         image_filenames = [image_filenames[i] for i in indices]
@@ -177,6 +189,8 @@ def _generate_dataparser_outputs(self, split="train"):
             image_filenames=image_filenames,
             cameras=cameras,
             scene_box=scene_box,
+            dataparser_scale=scale_factor,
+            dataparser_transform=transform_matrix,
             metadata={
                 "depth_filenames": depth_filenames if len(depth_filenames) > 0 else None,
                 "depth_unit_scale_factor": self.config.depth_unit_scale_factor,

diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -54,10 +54,10 @@ class NerfstudioDataParserConfig(DataParserConfig):
     """How much to downscale images. If not set, images are chosen such that the max dimension is <1600px."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    orientation_method: Literal["pca", "up", "none"] = "up"
+    orientation_method: Literal["pca", "up", "vertical", "none"] = "up"
     """The method to use for orientation."""
-    center_poses: bool = True
-    """Whether to center the poses."""
+    center_method: Literal["poses", "focus", "none"] = "poses"
+    """The method to use to center the poses."""
     auto_scale_poses: bool = True
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
     train_split_fraction: float = 0.9
@@ -209,7 +209,7 @@ def _generate_dataparser_outputs(self, split="train"):
         poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
             poses,
             method=orientation_method,
-            center_poses=self.config.center_poses,
+            center_method=self.config.center_method,
         )
 
         # Scale poses

diff --git a/nerfstudio/data/dataparsers/phototourism_dataparser.py b/nerfstudio/data/dataparsers/phototourism_dataparser.py
@@ -60,12 +60,12 @@ class PhototourismDataParserConfig(DataParserConfig):
     """The fraction of images to use for training. The remaining images are for eval."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    orientation_method: Literal["pca", "up", "none"] = "up"
+    orientation_method: Literal["pca", "up", "vertical", "none"] = "up"
     """The method to use for orientation."""
+    center_method: Literal["poses", "focus", "none"] = "poses"
+    """The method to use to center the poses."""
     auto_scale_poses: bool = True
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
-    center_poses: bool = True
-    """Whether to center the poses."""
 
 
 @dataclass
@@ -147,7 +147,7 @@ def _generate_dataparser_outputs(self, split="train"):
             raise ValueError(f"Unknown dataparser split {split}")
 
         poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
-            poses, method=self.config.orientation_method, center_poses=self.config.center_poses
+            poses, method=self.config.orientation_method, center_method=self.config.center_method
         )
 
         # Scale poses

diff --git a/nerfstudio/data/dataparsers/scannet_dataparser.py b/nerfstudio/data/dataparsers/scannet_dataparser.py
@@ -21,7 +21,9 @@
 import cv2
 import numpy as np
 import torch
+from typing_extensions import Literal
 
+from nerfstudio.cameras import camera_utils
 from nerfstudio.cameras.cameras import Cameras, CameraType
 from nerfstudio.data.dataparsers.base_dataparser import (
     DataParser,
@@ -47,11 +49,13 @@ class ScanNetDataParserConfig(DataParserConfig):
     """target class to instantiate"""
     data: Path = Path("data/scannet/scene0423_02")
     """Path to ScanNet folder with densely extracted scenes."""
+    scale_factor: float = 1.0
+    """How much to scale the camera origins by."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    center_poses: bool = True
-    """Whether to center the poses."""
-    scale_poses: bool = True
+    center_method: Literal["poses", "focus", "none"] = "poses"
+    """The method to use to center the poses."""
+    auto_scale_poses: bool = True
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
     train_split_fraction: float = 0.9
     """The fraction of images to use for training. The remaining images are for eval."""
@@ -115,11 +119,19 @@ def _generate_dataparser_outputs(self, split="train"):
         poses = torch.from_numpy(np.stack(poses).astype(np.float32))
         intrinsics = torch.from_numpy(np.stack(intrinsics).astype(np.float32))
 
-        if self.config.center_poses:
-            poses[:, :3, 3] -= poses[:, :3, 3].mean(dim=0)
+        poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
+            poses,
+            method="none",
+            center_method=self.config.center_method,
+        )
+
+        # Scale poses
+        scale_factor = 1.0
+        if self.config.auto_scale_poses:
+            scale_factor /= float(torch.max(torch.abs(poses[:, :3, 3])))
+        scale_factor *= self.config.scale_factor
 
-        if self.config.scale_poses:
-            poses[:, :3, 3] /= poses[:, :3, 3].abs().max()
+        poses[:, :3, 3] *= scale_factor
 
         # Choose image_filenames and poses based on split, but after auto orient and scaling the poses.
         image_filenames = [image_filenames[i] for i in indices]
@@ -151,6 +163,8 @@ def _generate_dataparser_outputs(self, split="train"):
             image_filenames=image_filenames,
             cameras=cameras,
             scene_box=scene_box,
+            dataparser_scale=scale_factor,
+            dataparser_transform=transform_matrix,
             metadata={
                 "depth_filenames": depth_filenames if len(depth_filenames) > 0 else None,
                 "depth_unit_scale_factor": self.config.depth_unit_scale_factor,

diff --git a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
@@ -116,7 +116,7 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
             camera_to_worlds, transform = camera_utils.auto_orient_and_center_poses(
                 camera_to_worlds,
                 method="up",
-                center_poses=False,
+                center_method="none",
             )
 
         # scene box from meta data