From 34bf44e7f53c0865236fabab448fcde46b9a4618 Mon Sep 17 00:00:00 2001
From: Hoanh Le <hoanhle100100@gmail.com>
Date: Wed, 14 Aug 2024 13:49:39 +0300
Subject: [PATCH 1/6] add a way to orient based on main direction of the flat
 surface

---
 .../data/dataparsers/nerfstudio_dataparser.py | 126 +++++++++++++++---
 1 file changed, 107 insertions(+), 19 deletions(-)

diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index e11902c094..088716cd2d 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -53,7 +53,7 @@ class NerfstudioDataParserConfig(DataParserConfig):
     """How much to downscale images. If not set, images are chosen such that the max dimension is <1600px."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    orientation_method: Literal["pca", "up", "vertical", "none"] = "up"
+    orientation_method: Literal["pca", "up", "vertical", "none", "align"] = "up"
     """The method to use for orientation."""
     center_method: Literal["poses", "focus", "none"] = "poses"
     """The method to use to center the poses."""
@@ -232,6 +232,8 @@ def _generate_dataparser_outputs(self, split="train"):
             CONSOLE.log(f"[yellow] Dataset is overriding orientation method to {orientation_method}")
         else:
             orientation_method = self.config.orientation_method
+            if orientation_method == "align":
+                orientation_method = "up"
 
         poses = torch.from_numpy(np.array(poses).astype(np.float32))
         poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
@@ -298,22 +300,6 @@ def _generate_dataparser_outputs(self, split="train"):
         if (camera_type in [CameraType.FISHEYE, CameraType.FISHEYE624]) and (fisheye_crop_radius is not None):
             metadata["fisheye_crop_radius"] = fisheye_crop_radius
 
-        cameras = Cameras(
-            fx=fx,
-            fy=fy,
-            cx=cx,
-            cy=cy,
-            distortion_params=distortion_params,
-            height=height,
-            width=width,
-            camera_to_worlds=poses[:, :3, :4],
-            camera_type=camera_type,
-            metadata=metadata,
-        )
-
-        assert self.downscale_factor is not None
-        cameras.rescale_output_resolution(scaling_factor=1.0 / self.downscale_factor)
-
         # The naming is somewhat confusing, but:
         # - transform_matrix contains the transformation to dataparser output coordinates from saved coordinates.
         # - dataparser_transform_matrix contains the transformation to dataparser output coordinates from original data coordinates.
@@ -348,6 +334,9 @@ def _generate_dataparser_outputs(self, split="train"):
         except AttributeError:
             self.prompted_user = False
 
+        alignment_matrix = None
+        sparse_points = None
+
         # Load 3D points
         if self.config.load_3D_points:
             if "ply_file_path" in meta:
@@ -399,10 +388,45 @@ def _generate_dataparser_outputs(self, split="train"):
 
             if ply_file_path:
                 sparse_points = self._load_3D_points(ply_file_path, transform_matrix, scale_factor)
-                if sparse_points is not None:
-                    metadata.update(sparse_points)
+
+                if orientation_method == "align":
+                    points3D_xyz = sparse_points["points3D_xyz"]
+                    aligned_points3D, alignment_matrix = self._align_points_to_target_plane(
+                        points3D_xyz, torch.tensor([0, 1, 0], dtype=torch.float32)
+                    )
+                    sparse_points["points3D_xyz"] = aligned_points3D[:, :3]
             self.prompted_user = True
 
+        if alignment_matrix is not None:
+            num_poses = poses.shape[0]
+            bottom_row = torch.tensor([0, 0, 0, 1], dtype=torch.float32).unsqueeze(0).expand(num_poses, -1, -1)
+            poses_homogeneous = torch.cat([poses, bottom_row], dim=1)  # Shape: (num_poses, 4, 4)
+
+            poses = alignment_matrix @ poses_homogeneous
+            dataparser_transform_matrix = torch.cat(
+                [dataparser_transform_matrix, torch.tensor([0, 0, 0, 1], dtype=torch.float32).unsqueeze(0)], dim=0
+            )
+            dataparser_transform_matrix = alignment_matrix @ dataparser_transform_matrix
+
+        cameras = Cameras(
+            fx=fx,
+            fy=fy,
+            cx=cx,
+            cy=cy,
+            distortion_params=distortion_params,
+            height=height,
+            width=width,
+            camera_to_worlds=poses[:, :3, :4],
+            camera_type=camera_type,
+            metadata=metadata,
+        )
+
+        assert self.downscale_factor is not None
+        cameras.rescale_output_resolution(scaling_factor=1.0 / self.downscale_factor)
+
+        if sparse_points is not None:
+            metadata.update(sparse_points)
+
         dataparser_outputs = DataparserOutputs(
             image_filenames=image_filenames,
             cameras=cameras,
@@ -458,6 +482,70 @@ def _load_3D_points(self, ply_file_path: Path, transform_matrix: torch.Tensor, s
         }
         return out
 
+    @staticmethod
+    def _align_points_to_target_plane(points: torch.Tensor, target_normal: torch.Tensor):
+        """Aligns a set of 3D points (in homogeneous coordinates) to a target plane defined by its normal vector.
+
+        Args:
+            points: A torch tensor of shape (n, 4) representing the 3D points in homogeneous coordinates.
+            target_normal: A torch tensor of shape (3, ) representing the normal vector of the target plane.
+
+        Returns:
+            A tuple containing:
+            - aligned_points: The 3D points aligned to the target plane as a torch tensor of shape (n, 4).
+            - alignment_matrix: The 4x4 alignment matrix used for alignment.
+        """
+        points_xyz = points[:, :3]  # Shape: (n, 3)
+
+        # Calculate the centroid (mean of points)
+        centroid = torch.mean(points_xyz, dim=0)  # Shape: (3,)
+
+        # Center the points around the centroid
+        centered_points = points_xyz - centroid  # Shape: (n, 3)
+
+        # Perform SVD
+        _, _, vh = torch.linalg.svd(centered_points)  # vh shape: (3, 3)
+
+        # The last right singular vector is the normal to the plane
+        normal = vh[-1]  # Shape: (3,)
+
+        # Calculate the rotation axis and angle
+        rotation_axis = torch.cross(normal, target_normal)  # Shape: (3,)
+        rotation_axis_norm = torch.norm(rotation_axis)
+
+        if rotation_axis_norm != 0:
+            rotation_axis /= rotation_axis_norm
+            cos_theta = torch.dot(normal, target_normal)
+            theta = torch.arccos(cos_theta)
+
+            # Create the rotation matrix using Rodrigues' rotation formula
+            K = torch.tensor(
+                [
+                    [0, -rotation_axis[2], rotation_axis[1]],
+                    [rotation_axis[2], 0, -rotation_axis[0]],
+                    [-rotation_axis[1], rotation_axis[0], 0],
+                ],
+                dtype=torch.float32,
+            )
+            rotation_matrix = torch.eye(3) + torch.sin(theta) * K + (1 - torch.cos(theta)) * (K @ K)
+        else:
+            rotation_matrix = torch.eye(3)  # If the normal is already aligned, no rotation needed
+
+        # Create the 4x4 alignment matrix
+        alignment_matrix = torch.eye(4, dtype=torch.float32)  # Shape: (4, 4)
+        alignment_matrix[:3, :3] = rotation_matrix  # Insert rotation part
+        alignment_matrix[:3, 3] = -rotation_matrix @ centroid  # Apply translation
+
+        # Ensure the points are in homogeneous coordinates
+        if points.shape[1] == 3:
+            points = torch.cat([points, torch.ones((points.shape[0], 1), dtype=torch.float32)], dim=1)  # Shape: (n, 4)
+
+        # Apply the alignment transformation
+        aligned_points = alignment_matrix @ points.T  # Shape: (4, n)
+        aligned_points = aligned_points.T  # Shape: (n, 4)
+
+        return aligned_points, alignment_matrix
+
     def _get_fname(self, filepath: Path, data_dir: Path, downsample_folder_prefix="images_") -> Path:
         """Get the filename of the image file.
         downsample_folder_prefix can be used to point to auxiliary image data, e.g. masks

From 50863bfc4327b1b2e1b59662af5f1cb3152efac1 Mon Sep 17 00:00:00 2001
From: Hoanh Le <hoanhle100100@gmail.com>
Date: Mon, 26 Aug 2024 13:36:43 +0300
Subject: [PATCH 2/6] add configurable target normal and outlier filtering for align

---
 .../data/dataparsers/base_dataparser.py       |  2 +-
 .../data/dataparsers/nerfstudio_dataparser.py | 89 ++++++++++++-------
 2 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/nerfstudio/data/dataparsers/base_dataparser.py b/nerfstudio/data/dataparsers/base_dataparser.py
index 5cf1e6bdbf..3e59cf2778 100644
--- a/nerfstudio/data/dataparsers/base_dataparser.py
+++ b/nerfstudio/data/dataparsers/base_dataparser.py
@@ -57,7 +57,7 @@ class DataparserOutputs:
     """Camera object storing collection of camera information in dataset."""
     alpha_color: Optional[Float[Tensor, "3"]] = None
     """Color of dataset background."""
-    scene_box: SceneBox = field(default_factory=lambda: SceneBox(aabb=torch.tensor([[-1, -1, -1], [1, 1, 1]])))
+    scene_box: SceneBox = SceneBox(aabb=torch.tensor([[-0.283, -0.032, -0.328], [0.661, 0.034, 0.370]]))
     """Scene box of dataset. Used to bound the scene or provide the scene scale depending on model."""
     mask_filenames: Optional[List[Path]] = None
     """Filenames for any masks that are required"""
diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index 088716cd2d..350cab8dee 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -53,8 +53,16 @@ class NerfstudioDataParserConfig(DataParserConfig):
     """How much to downscale images. If not set, images are chosen such that the max dimension is <1600px."""
     scene_scale: float = 1.0
     """How much to scale the region of interest by."""
-    orientation_method: Literal["pca", "up", "vertical", "none", "align"] = "up"
+    orientation_method: Literal[
+        "pca",
+        "up",
+        "vertical",
+        "align",
+        "none",
+    ] = "vertical"
     """The method to use for orientation."""
+    target_normal: Tuple[float, float, float] = (1.0, 0.0, 0.0)
+    """The normal vector to align the scene to, represented as a tuple of floats."""
     center_method: Literal["poses", "focus", "none"] = "poses"
     """The method to use to center the poses."""
     auto_scale_poses: bool = True
@@ -260,6 +268,7 @@ def _generate_dataparser_outputs(self, split="train"):
 
         # in x,y,z order
         # assumes that the scene is centered at the origin
+        # _ = self.config.scene_scale
         aabb_scale = self.config.scene_scale
         scene_box = SceneBox(
             aabb=torch.tensor(
@@ -389,10 +398,11 @@ def _generate_dataparser_outputs(self, split="train"):
             if ply_file_path:
                 sparse_points = self._load_3D_points(ply_file_path, transform_matrix, scale_factor)
 
-                if orientation_method == "align":
+                if sparse_points is not None and self.config.orientation_method == "align":
+                    target_normal_tensor = torch.tensor(self.config.target_normal, dtype=torch.float32)
                     points3D_xyz = sparse_points["points3D_xyz"]
                     aligned_points3D, alignment_matrix = self._align_points_to_target_plane(
-                        points3D_xyz, torch.tensor([0, 1, 0], dtype=torch.float32)
+                        points3D_xyz, target_normal_tensor
                     )
                     sparse_points["points3D_xyz"] = aligned_points3D[:, :3]
             self.prompted_user = True
@@ -483,53 +493,64 @@ def _load_3D_points(self, ply_file_path: Path, transform_matrix: torch.Tensor, s
         return out
 
     @staticmethod
-    def _align_points_to_target_plane(points: torch.Tensor, target_normal: torch.Tensor):
+    def _align_points_to_target_plane(
+        points: torch.Tensor,
+        target_normal: Tuple[float, float, float],
+        target_point: Tuple[float, float, float],
+        threshold: float = 1.0,
+        max_iterations: int = 5,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Aligns a set of 3D points (in homogeneous coordinates) to a target plane defined by its normal vector.
 
         Args:
             points: A torch tensor of shape (n, 4) representing the 3D points in homogeneous coordinates.
             target_normal: A torch tensor of shape (3, ) representing the normal vector of the target plane.
+            threshold: The distance threshold for identifying inliers.
+            max_iterations: The maximum number of iterations for refining inliers.
 
         Returns:
             A tuple containing:
             - aligned_points: The 3D points aligned to the target plane as a torch tensor of shape (n, 4).
             - alignment_matrix: The 4x4 alignment matrix used for alignment.
         """
-        points_xyz = points[:, :3]  # Shape: (n, 3)
 
-        # Calculate the centroid (mean of points)
-        centroid = torch.mean(points_xyz, dim=0)  # Shape: (3,)
+        def filter_outliers(points_xyz, threshold, max_iterations):
+            inlier_mask = torch.ones(points_xyz.size(0), dtype=torch.bool)
+
+            for _ in range(max_iterations):
+                current_inliers = points_xyz[inlier_mask]
+                centroid = torch.mean(current_inliers, dim=0)
+
+                centered_points = current_inliers - centroid
+                _, _, vh = torch.linalg.svd(centered_points)
+
+                normal = vh[-1]
+
+                distances = torch.abs((points_xyz - centroid) @ normal)
+                new_inlier_mask = distances < threshold
+
+                inlier_mask = inlier_mask & new_inlier_mask
 
-        # Center the points around the centroid
-        centered_points = points_xyz - centroid  # Shape: (n, 3)
+                threshold *= 0.9  # Reduce threshold for more aggressiveness
 
-        # Perform SVD
-        _, _, vh = torch.linalg.svd(centered_points)  # vh shape: (3, 3)
+            return inlier_mask
 
-        # The last right singular vector is the normal to the plane
+        points_xyz = points[:, :3]  # Shape: (n, 3)
+        inlier_mask = filter_outliers(points_xyz, threshold, max_iterations)
+        inliers = points_xyz[inlier_mask]
+
+        # Calculate the centroid using only inliers
+        centroid = torch.mean(inliers, dim=0)  # Shape: (3,)
+
+        # Center the inlier points around the centroid
+        centered_inliers = inliers - centroid  # Shape: (m, 3) where m <= n
+
+        # Perform SVD on inliers to find the normal
+        _, _, vh = torch.linalg.svd(centered_inliers)  # vh shape: (3, 3)
         normal = vh[-1]  # Shape: (3,)
 
-        # Calculate the rotation axis and angle
-        rotation_axis = torch.cross(normal, target_normal)  # Shape: (3,)
-        rotation_axis_norm = torch.norm(rotation_axis)
-
-        if rotation_axis_norm != 0:
-            rotation_axis /= rotation_axis_norm
-            cos_theta = torch.dot(normal, target_normal)
-            theta = torch.arccos(cos_theta)
-
-            # Create the rotation matrix using Rodrigues' rotation formula
-            K = torch.tensor(
-                [
-                    [0, -rotation_axis[2], rotation_axis[1]],
-                    [rotation_axis[2], 0, -rotation_axis[0]],
-                    [-rotation_axis[1], rotation_axis[0], 0],
-                ],
-                dtype=torch.float32,
-            )
-            rotation_matrix = torch.eye(3) + torch.sin(theta) * K + (1 - torch.cos(theta)) * (K @ K)
-        else:
-            rotation_matrix = torch.eye(3)  # If the normal is already aligned, no rotation needed
+        # Use the provided helper function to get the rotation matrix
+        rotation_matrix = camera_utils.rotation_matrix_between(normal, target_normal)
 
         # Create the 4x4 alignment matrix
         alignment_matrix = torch.eye(4, dtype=torch.float32)  # Shape: (4, 4)
@@ -540,7 +561,7 @@ def _align_points_to_target_plane(points: torch.Tensor, target_normal: torch.Ten
         if points.shape[1] == 3:
             points = torch.cat([points, torch.ones((points.shape[0], 1), dtype=torch.float32)], dim=1)  # Shape: (n, 4)
 
-        # Apply the alignment transformation
+        # Apply the alignment transformation to all points
         aligned_points = alignment_matrix @ points.T  # Shape: (4, n)
         aligned_points = aligned_points.T  # Shape: (n, 4)
 

From 39236adb105a38eb035208f8050992bf782d7ae7 Mon Sep 17 00:00:00 2001
From: Hoanh Le <hoanhle100100@gmail.com>
Date: Mon, 26 Aug 2024 13:37:47 +0300
Subject: [PATCH 3/6] restore default_factory default for DataparserOutputs.scene_box

---
 nerfstudio/data/dataparsers/base_dataparser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nerfstudio/data/dataparsers/base_dataparser.py b/nerfstudio/data/dataparsers/base_dataparser.py
index 3e59cf2778..5cf1e6bdbf 100644
--- a/nerfstudio/data/dataparsers/base_dataparser.py
+++ b/nerfstudio/data/dataparsers/base_dataparser.py
@@ -57,7 +57,7 @@ class DataparserOutputs:
     """Camera object storing collection of camera information in dataset."""
     alpha_color: Optional[Float[Tensor, "3"]] = None
     """Color of dataset background."""
-    scene_box: SceneBox = SceneBox(aabb=torch.tensor([[-0.283, -0.032, -0.328], [0.661, 0.034, 0.370]]))
+    scene_box: SceneBox = field(default_factory=lambda: SceneBox(aabb=torch.tensor([[-1, -1, -1], [1, 1, 1]])))
     """Scene box of dataset. Used to bound the scene or provide the scene scale depending on model."""
     mask_filenames: Optional[List[Path]] = None
     """Filenames for any masks that are required"""

From 72006e14748f98480348e6e389f8d6d6f91c55cb Mon Sep 17 00:00:00 2001
From: Hoanh Le <hoanhle100100@gmail.com>
Date: Mon, 26 Aug 2024 13:39:42 +0300
Subject: [PATCH 4/6] annotate _load_3D_points output

---
 nerfstudio/data/dataparsers/nerfstudio_dataparser.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index 350cab8dee..09d40fc38f 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -453,7 +453,9 @@ def _generate_dataparser_outputs(self, split="train"):
         )
         return dataparser_outputs
 
-    def _load_3D_points(self, ply_file_path: Path, transform_matrix: torch.Tensor, scale_factor: float):
+    def _load_3D_points(
+        self, ply_file_path: Path, transform_matrix: torch.Tensor, scale_factor: float
+    ) -> Optional[Dict[str, torch.Tensor]]:
         """Loads point clouds positions and colors from .ply
 
         Args:

From d0c97f23737cd7c18f8f7eaf4545748e59e1d520 Mon Sep 17 00:00:00 2001
From: Hoanh Le <hoanhle100100@gmail.com>
Date: Mon, 26 Aug 2024 13:44:29 +0300
Subject: [PATCH 5/6] type target_normal as a tensor and drop target_point arg

---
 nerfstudio/data/dataparsers/nerfstudio_dataparser.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index 09d40fc38f..7d2d9d543c 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -497,8 +497,7 @@ def _load_3D_points(
     @staticmethod
     def _align_points_to_target_plane(
         points: torch.Tensor,
-        target_normal: Tuple[float, float, float],
-        target_point: Tuple[float, float, float],
+        target_normal: Float[Tensor, "3"],
         threshold: float = 1.0,
         max_iterations: int = 5,
     ) -> Tuple[torch.Tensor, torch.Tensor]:

From a3fceabbb96f7096719cbb6af2662cf64dc893a5 Mon Sep 17 00:00:00 2001
From: Hoanh Le <hoanhle100100@gmail.com>
Date: Mon, 26 Aug 2024 13:52:12 +0300
Subject: [PATCH 6/6] add missing imports for Dict, Float and Tensor

---
 nerfstudio/data/dataparsers/nerfstudio_dataparser.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index 7d2d9d543c..226449e5fd 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -17,11 +17,13 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Literal, Optional, Tuple, Type
+from typing import Dict, Literal, Optional, Tuple, Type
 
 import numpy as np
 import torch
+from jaxtyping import Float
 from PIL import Image
+from torch import Tensor
 
 from nerfstudio.cameras import camera_utils
 from nerfstudio.cameras.cameras import CAMERA_MODEL_TO_TYPE, Cameras, CameraType