Cloud-Drift · kevinsantana11 · Aug 15, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/clouddrift/adapters/gdp/__init__.py b/clouddrift/adapters/gdp/__init__.py
@@ -7,10 +7,12 @@
 
 import os
 import tempfile
+import typing
 
 import numpy as np
 import pandas as pd
 import xarray as xr
+from numpy.typing import NDArray
 
 from clouddrift.adapters.utils import download_with_progress
 from clouddrift.raggedarray import DimNames
@@ -269,7 +271,9 @@ def str_to_float(value: str, default: float = np.nan) -> float:
         return default
 
 
-def cut_str(value: str, max_length: int) -> np.chararray:
+def cut_str(
+    value: str, max_length: int
+) -> np.chararray[typing.Any, np.dtype[np.bytes_]]:
     """Cut a string to a specific length and return it as a numpy chararray.
 
     Parameters
@@ -289,7 +293,7 @@ def cut_str(value: str, max_length: int) -> np.chararray:
     return charar
 
 
-def drogue_presence(lost_time, time) -> np.ndarray:
+def drogue_presence(lost_time, time) -> NDArray[typing.Any]:
     """Create drogue status from the drogue lost time and the trajectory time.
 
     Parameters

diff --git a/clouddrift/adapters/gdp/gdpsource.py b/clouddrift/adapters/gdp/gdpsource.py
@@ -5,12 +5,13 @@
 import logging
 import os
 import tempfile
+import typing
 import warnings
 from collections import defaultdict
 from concurrent.futures import Future, ProcessPoolExecutor, as_completed
-from typing import Callable
 
 import numpy as np
+import numpy.typing as np_typing
 import pandas as pd
 import xarray as xr
 from tqdm.asyncio import tqdm
@@ -56,7 +57,7 @@
     "death_code",
 ]
 
-_VARS_FILL_MAP: dict = {
+_VARS_FILL_MAP: dict[str, int | str | np.datetime64] = {
     "wmo_number": -999,
     "program_number": -999,
     "buoys_type": "N/A",
@@ -70,7 +71,7 @@
     "death_code": -999,
 }
 
-_VAR_DTYPES: dict = {
+_VAR_DTYPES: dict[str, type | np.dtype[np.datetime64]] = {
     "rowsize": np.int64,
     "wmo_number": np.int64,
     "program_number": np.int64,
@@ -126,7 +127,7 @@
 }
 
 
-VARS_ATTRS: dict = {
+VARS_ATTRS: dict[str, dict[str, str]] = {
     "id": {"long_name": "Global Drifter Program Buoy ID", "units": "-"},
     "rowsize": {
         "long_name": "Number of observations per trajectory",
@@ -334,7 +335,9 @@
     return dataset
 
 
-def _apply_remove(df: pd.DataFrame, filters: list[Callable]) -> pd.DataFrame:
+def _apply_remove(
+    df: pd.DataFrame, filters: list[typing.Callable[..., typing.Any]]
+) -> pd.DataFrame:
     temp_df = df
     for filter_ in filters:
         mask = filter_(temp_df)
@@ -344,7 +347,7 @@
 
 def _apply_transform(
     df: pd.DataFrame,
-    transforms: dict[str, tuple[list[str], Callable]],
+    transforms: dict[str, tuple[list[str], typing.Callable[..., typing.Any]]],
 ) -> pd.DataFrame:
     tmp_df = df
     for output_col in transforms.keys():
@@ -359,8 +362,10 @@
 
 
 def _parse_datetime_with_day_ratio(
-    month_series: np.ndarray, day_series: np.ndarray, year_series: np.ndarray
-) -> np.ndarray:
+    month_series: np_typing.NDArray[np.float32],
+    day_series: np_typing.NDArray[np.float32],
+    year_series: np_typing.NDArray[np.float32],
+) -> np_typing.NDArray[np.datetime64]:
     values = list()
     for month, day_with_ratio, year in zip(month_series, day_series, year_series):
         day = day_with_ratio // 1
@@ -479,7 +484,7 @@
     )
 
     sort_coord = traj_dataset.coords["obs_index"]
-    vals: np.ndarray = sort_coord.data
+    vals: np_typing.NDArray[np.int64] = sort_coord.data
     sort_coord_dim = sort_coord.dims[-1]
     sort_key = vals.argsort()
 
@@ -531,8 +536,8 @@
                 chunksize=chunk_size,
             )
 
-            joblist = list[Future]()
-            jobmap = dict[Future, pd.DataFrame]()
+            joblist = list[Future[dict[int, xr.Dataset]]]()
+            jobmap = dict[Future[dict[int, xr.Dataset]], pd.DataFrame]()
             for idx, chunk in enumerate(file_chunks):
                 if max_chunks is not None and idx >= max_chunks:
                     break
@@ -568,7 +573,7 @@
                     drifter_chunked_datasets[id_].append(drifter_ds)
                 bar.update()
 
-        combine_jobmap = dict[Future, int]()
+        combine_jobmap = dict[Future[xr.Dataset], int]()
         for id_ in drifter_chunked_datasets.keys():
             datasets = drifter_chunked_datasets[id_]
 

diff --git a/clouddrift/adapters/hurdat2.py b/clouddrift/adapters/hurdat2.py
@@ -390,7 +390,9 @@ def to_raggedarray(
         preprocess_func=lambda idx: track_data[idx].to_xarray_dataset(),
         attrs_global=TrackData.global_attrs,
         attrs_variables={
-            field.name: field.metadata
+            field.name: dict(
+                field.metadata
+            )  # copy into a plain dict for static type analysis
             for field in fields(HeaderLine) + fields(DataLine)
         },
     )

diff --git a/clouddrift/adapters/utils.py b/clouddrift/adapters/utils.py
@@ -1,15 +1,14 @@
 import concurrent.futures
 import logging
 import os
+import typing
 import urllib
 from datetime import datetime
 from io import BufferedIOBase, BufferedWriter
-from typing import Callable, Sequence
 
 import requests
 from tenacity import (
     RetryCallState,
-    WrappedFn,
     retry,
     retry_if_exception,
     stop_after_attempt,
@@ -30,7 +29,10 @@
 _CHUNK_SIZE = 1_048_576  # 1MiB
 _logger = logging.getLogger(__name__)
 
-standard_retry_protocol: Callable[[WrappedFn], WrappedFn] = retry(
+_Func = typing.Callable[..., typing.Any]
+_Wrapper = typing.Callable[[_Func], _Func]
+
+standard_retry_protocol: _Wrapper = retry(
     retry=retry_if_exception(
         lambda ex: isinstance(
             ex,
@@ -51,19 +53,19 @@
 
 
 def download_with_progress(
-    download_map: Sequence[
+    download_map: typing.Sequence[
         tuple[str, BufferedIOBase | str] | tuple[str, BufferedIOBase | str, float]
     ],
     show_list_progress: bool | None = None,
     desc: str = "Downloading files",
-    custom_retry_protocol: Callable[[WrappedFn], WrappedFn] | None = None,
+    custom_retry_protocol: _Wrapper | None = None,
 ):
     if show_list_progress is None:
         show_list_progress = len(download_map) > 20
     if custom_retry_protocol is None:
         retry_protocol = standard_retry_protocol
     else:
-        retry_protocol = custom_retry_protocol  # type: ignore
+        retry_protocol = custom_retry_protocol
 
     executor = concurrent.futures.ThreadPoolExecutor()
     futures: dict[
@@ -156,10 +158,10 @@
                 )
 
     _logger.debug(f"Downloading from {url} to {output}...")
-    bar = None
-
     with requests.get(url, timeout=5, stream=True) as response:
         buffer: BufferedWriter | BufferedIOBase | None = None
+        bar = None
+
         try:
             if isinstance(output, (str,)):
                 buffer = open(output, "wb")
@@ -179,15 +181,14 @@
                     nrows=2,
                     disable=_DISABLE_SHOW_PROGRESS,
                 )
+
             for chunk in response.iter_content(_CHUNK_SIZE):
                 if not chunk:
                     break
                 buffer.write(chunk)
                 if bar is not None:
                     bar.update(len(chunk))
         finally:
-            if response is not None:
-                response.close()
             if bar is not None:
                 bar.close()
             if buffer is not None and isinstance(output, (str,)):

diff --git a/clouddrift/kinematics.py b/clouddrift/kinematics.py
@@ -3,9 +3,10 @@
 """
 
 import numpy as np
-import pandas as pd
+import numpy.typing as np_typing
 import xarray as xr
 
+import clouddrift.typing as cd_typing
 from clouddrift.sphere import (
     EARTH_RADIUS_METERS,
     bearing,
@@ -21,9 +22,9 @@
 
 
 def kinetic_energy(
-    u: float | list | np.ndarray | xr.DataArray | pd.Series,
-    v: float | list | np.ndarray | xr.DataArray | pd.Series | None = None,
-) -> float | np.ndarray | xr.DataArray:
+    u: float | cd_typing.ArrayTypes,
+    v: float | cd_typing.ArrayTypes | None = None,
+) -> float | np_typing.NDArray[np.float64] | xr.DataArray:
     """Compute kinetic energy from zonal and meridional velocities.
 
     Parameters
@@ -531,7 +532,7 @@ def velocity_from_position(
     coord_system: str = "spherical",
     difference_scheme: str = "forward",
     time_axis: int = -1,
-) -> tuple[xr.DataArray, xr.DataArray]:
+) -> tuple[np.ndarray, np.ndarray]:
     """Compute velocity from arrays of positions and time.
 
     x and y can be provided as longitude and latitude in degrees if

diff --git a/clouddrift/ragged.py b/clouddrift/ragged.py
@@ -2,26 +2,34 @@
 Transformational and inquiry functions for ragged arrays.
 """
 
+import typing
 import warnings
 from collections.abc import Callable, Iterable
 from concurrent import futures
 from datetime import timedelta
 
 import numpy as np
+import numpy.typing as np_typing
 import pandas as pd
 import xarray as xr
 
+import clouddrift.typing as cd_typing
+
+_ArrayOutput = typing.TypeVar(
+    "_ArrayOutput", bound=tuple[np.ndarray, np.ndarray] | np.ndarray
+)
+
 
 def apply_ragged(
-    func: callable,
-    arrays: list[np.ndarray | xr.DataArray] | np.ndarray | xr.DataArray,
-    rowsize: list[int] | np.ndarray[int] | xr.DataArray,
-    *args: tuple,
+    func: Callable[..., _ArrayOutput],
+    arrays: cd_typing.ArrayTypes,
+    rowsize: cd_typing.ArrayTypes,
+    *args: typing.Any,
     rows: int | Iterable[int] = None,
     axis: int = 0,
     executor: futures.Executor = futures.ThreadPoolExecutor(max_workers=None),
-    **kwargs: dict,
-) -> tuple[np.ndarray] | np.ndarray:
+    **kwargs: typing.Any,
+) -> _ArrayOutput:
     """Apply a function to a ragged array.
 
     The function ``func`` will be applied to each contiguous row of ``arrays`` as
@@ -423,7 +431,7 @@ def regular_to_ragged(
     return array[valid], np.sum(valid, axis=1)
 
 
-def rowsize_to_index(rowsize: list | np.ndarray | xr.DataArray) -> np.ndarray:
+def rowsize_to_index(rowsize: cd_typing.ArrayTypes) -> np.ndarray:
     """Convert a list of row sizes to a list of indices.
 
     This function is typically used to obtain the indices of data rows organized
@@ -450,10 +458,10 @@ def rowsize_to_index(rowsize: list | np.ndarray | xr.DataArray) -> np.ndarray:
 
 
 def segment(
-    x: np.ndarray,
+    x: cd_typing.ArrayTypes,
     tolerance: float | np.timedelta64 | timedelta | pd.Timedelta,
-    rowsize: np.ndarray[int] = None,
-) -> np.ndarray[int]:
+    rowsize: np_typing.NDArray[np.int64] | None = None,
+) -> np_typing.NDArray[np.int64]:
     """Divide an array into segments based on a tolerance value.
 
     Parameters
@@ -787,11 +795,11 @@ def subset(
 
 
 def unpack(
-    ragged_array: np.ndarray,
-    rowsize: np.ndarray[int],
-    rows: int | Iterable[int] = None,
+    ragged_array: cd_typing.ArrayTypes,
+    rowsize: cd_typing.ArrayTypes,
+    rows: int | np.int64 | Iterable[int] | None = None,
     axis: int = 0,
-) -> list[np.ndarray]:
+) -> list[np_typing.NDArray[typing.Any]]:
     """Unpack a ragged array into a list of regular arrays.
 
     Unpacking a ``np.ndarray`` ragged array is about 2 orders of magnitude