Implement rowsize_to_index (Cloud-Drift#273)

* Implement rowsize_to_index * Update analysis.py expand docstring and example * lint --------- Co-authored-by: Shane Elipot <selipot@miami.edu> Co-authored-by: Philippe Miron <philippe.miron@dtn.com>
philippemiron · Nov 16, 2023 · adbf94d · adbf94d
1 parent d2c3861
commit adbf94d
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 4 deletions.
diff --git a/clouddrift/analysis.py b/clouddrift/analysis.py
@@ -454,6 +454,33 @@ def segment(
         return np.concatenate(segment_sizes)
 
 
+def rowsize_to_index(rowsize: Union[list, np.ndarray, xr.DataArray]) -> np.ndarray:
+    """Convert a list of row sizes to a list of indices.
+
+    This function is typically used to obtain the indices of data rows organized
+    in a ragged array. The result holds one more element than ``rowsize``: it
+    starts at 0 and each following element adds the next row size, so that row
+    ``n`` of the ragged array spans ``index[n]:index[n + 1]``.
+
+    Parameters
+    ----------
+    rowsize : list or np.ndarray or xr.DataArray
+        A list of row sizes.
+
+    Returns
+    -------
+    np.ndarray
+        The cumulative sum of the row sizes, preceded by 0. An empty
+        ``rowsize`` yields the single-element integer array ``[0]``.
+
+    Examples
+    --------
+
+    To obtain the indices within a ragged array of three consecutive rows of sizes 100, 202, and 53:
+
+    >>> rowsize_to_index([100, 202, 53])
+    array([  0, 100, 302, 355])
+    """
+    sizes = np.asarray(rowsize)
+    if sizes.size == 0:
+        # NumPy converts an empty list to float64, which would produce
+        # array([0.]); cast so the result stays usable as integer indices.
+        sizes = sizes.astype(int)
+    return np.cumsum(np.insert(sizes, 0, 0))
+
+
 def position_from_velocity(
     u: np.ndarray,
     v: np.ndarray,
@@ -1023,7 +1050,7 @@ def subset(
             raise ValueError(f"Unknown variable '{key}'.")
 
     # remove data when trajectories are filtered
-    traj_idx = np.insert(np.cumsum(ds[rowsize_var_name].values), 0, 0)
+    traj_idx = rowsize_to_index(ds[rowsize_var_name].values)
     for i in np.where(~mask_traj)[0]:
         mask_obs[slice(traj_idx[i], traj_idx[i + 1])] = False
 
@@ -1093,5 +1120,5 @@ def unpack_ragged(
         )):
             u, v = velocity_from_position(lon, lat, time)
     """
-    indices = np.insert(np.cumsum(np.array(rowsize)), 0, 0)
+    indices = rowsize_to_index(rowsize)
     return [ragged_array[indices[n] : indices[n + 1]] for n in range(indices.size - 1)]
diff --git a/clouddrift/raggedarray.py b/clouddrift/raggedarray.py
@@ -4,6 +4,7 @@
 Datasets and Awkward Arrays.
 """
 import awkward as ak
+from clouddrift.analysis import rowsize_to_index
 import xarray as xr
 import numpy as np
 from collections.abc import Callable
@@ -316,7 +317,7 @@ def allocate(
         ds = preprocess_func(indices[0], **kwargs)
         nb_traj = len(rowsize)
         nb_obs = np.sum(rowsize).astype("int")
-        index_traj = np.insert(np.cumsum(rowsize), 0, 0)
+        index_traj = rowsize_to_index(rowsize)
 
         # allocate memory
         coords = {}
@@ -410,7 +411,7 @@ def to_awkward(self):
         ak.Array
             Awkward Array containing the ragged array and its attributes
         """
-        index_traj = np.insert(np.cumsum(self.metadata["rowsize"]), 0, 0)
+        index_traj = rowsize_to_index(self.metadata["rowsize"])
         offset = ak.index.Index64(index_traj)
 
         data = []

diff --git a/tests/analysis_tests.py b/tests/analysis_tests.py
@@ -5,6 +5,7 @@
     position_from_velocity,
     ragged_to_regular,
     regular_to_ragged,
+    rowsize_to_index,
     segment,
     subset,
     unpack_ragged,
@@ -889,3 +890,14 @@ def test_unpack_ragged(self):
         self.assertTrue(
             np.all([lon[n].size == ds["rowsize"][n] for n in range(len(lon))])
         )
+
+
+class rowsize_to_index_tests(unittest.TestCase):
+    """Check rowsize_to_index for each accepted input container type."""
+
+    def test_rowsize_to_index(self):
+        rowsize = [2, 3, 4]
+        expected = np.array([0, 2, 5, 9])
+        inputs = {
+            "list": rowsize,
+            "ndarray": np.array(rowsize),
+            "DataArray": xr.DataArray(data=rowsize),
+        }
+        for label, value in inputs.items():
+            with self.subTest(input_type=label):
+                # Unlike np.all(a == b), this also rejects shape mismatches
+                # (no silent broadcasting) and reports the differing values.
+                np.testing.assert_array_equal(rowsize_to_index(value), expected)