🐛 use zarr archive for gdp6h dataset (Cloud-Drift#387)

* The `netcdf-c` library provides a feature to download URLs using a byte stream to help with lazy loading. The issue with this feature is that it seems to have a fixed 100s timeout for the HTTP calls it makes with `libcurl`. It's also slower than leveraging the zarr archive. --------- Co-authored-by: Shane Elipot <selipot@miami.edu>
kevinsantana11 · Apr 9, 2024 · 4c05591 · 4c05591
1 parent 63a0d95
commit 4c05591
Showing 1 changed file with 32 additions and 42 deletions.
diff --git a/clouddrift/datasets.py b/clouddrift/datasets.py
@@ -6,8 +6,6 @@
 """
 
 import os
-import platform
-from io import BytesIO
 from typing import Callable
 
 import xarray as xr
@@ -115,55 +113,47 @@ def gdp6h(decode_times: bool = True) -> xr.Dataset:
     >>> from clouddrift.datasets import gdp6h
     >>> ds = gdp6h()
     >>> ds
-    <xarray.Dataset>
+    <xarray.Dataset> Size: 2GB
     Dimensions:                (traj: 27647, obs: 46535470)
     Coordinates:
-        ids                    (obs) int64 7702204 7702204 ... 300234061198840
-        time                   (obs) float64 2.879e+08 2.879e+08 ... 1.697e+09
+    id                     (traj) int64 221kB ...
+    time                   (obs) datetime64[ns] 372MB ...
     Dimensions without coordinates: traj, obs
-    Data variables: (12/50)
-        ID                     (traj) int64 7702204 7702201 ... 300234061198840
-        rowsize                (traj) int32 92 1747 1943 1385 1819 ... 54 53 51 28
-        WMO                    (traj) int32 0 0 0 0 ... 6203890 6203888 4101885
-        expno                  (traj) int32 40 40 40 40 ... 31412 21421 21421 31412
-        deploy_date            (traj) float32 2.878e+08 2.878e+08 ... 1.696e+09 nan
-        deploy_lat             (traj) float32 -7.798 -4.9 -3.18 ... 9.9 11.9 nan
-        ...                     ...
-        vn                     (obs) float32 nan 0.1056 0.04974 ... 0.7384 nan
-        temp                   (obs) float32 28.35 28.3 nan ... 29.08 28.97 28.92
-        err_lat                (obs) float32 0.009737 0.007097 ... 0.001659 0.001687
-        err_lon                (obs) float32 0.00614 0.004583 ... 0.002471 0.002545
-        err_temp               (obs) float32 0.08666 0.08757 ... 0.03665 0.03665
-        drogue_status          (obs) bool False False False False ... True True True
+    Data variables: (12/49)
+    BuoyTypeManufacturer   (traj) |S20 553kB ...
+    BuoyTypeSensorArray    (traj) |S20 553kB ...
+    CurrentProgram         (traj) float64 221kB ...
+    DeployingCountry       (traj) |S20 553kB ...
+    DeployingShip          (traj) |S20 553kB ...
+    DeploymentComments     (traj) |S20 553kB ...
+    ...                     ...
+    start_lon              (traj) float32 111kB ...
+    temp                   (obs) float32 186MB ...
+    typebuoy               (traj) |S10 276kB ...
+    typedeath              (traj) int8 28kB ...
+    ve                     (obs) float32 186MB ...
+    vn                     (obs) float32 186MB ...
     Attributes: (12/18)
-        title:                Global Drifter Program drifting buoy collection
-        history:              version September 2023. Metadata from dirall.dat an...
-        Conventions:          CF-1.6
-        time_coverage_start:  1979-02-15:00:00:00Z
-        time_coverage_end:    2023-10-18:18:00:00Z
-        date_created:         2023-12-22T17:50:22.242943
-        ...                   ...
-        contributor_name:     NOAA Global Drifter Program
-        contributor_role:     Data Acquisition Center
-        institution:          NOAA Atlantic Oceanographic and Meteorological Labo...
-        acknowledgement:      Lumpkin, Rick; Centurioni, Luca (2019). NOAA Global...
-        summary:              Global Drifter Program six-hourly data
-        doi:                  10.25921/7ntx-z961
+    Conventions:          CF-1.6
+    acknowledgement:      Lumpkin, Rick; Centurioni, Luca (2019). NOAA Global...
+    contributor_name:     NOAA Global Drifter Program
+    contributor_role:     Data Acquisition Center
+    date_created:         2024-04-04T13:44:01.176967
+    doi:                  10.25921/7ntx-z961
+    ...                   ...
+    publisher_name:       GDP Drifter DAC
+    publisher_url:        https://www.aoml.noaa.gov/phod/gdp
+    summary:              Global Drifter Program six-hourly data
+    time_coverage_end:    2023-10-18:18:00:00Z
+    time_coverage_start:  1979-02-15:00:00:00Z
+    title:                Global Drifter Program drifting buoy collection
 
     See Also
     --------
     :func:`gdp1h`
     """
-    url = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/gdp6h_ragged_may23.nc#mode=bytes"
-
-    if platform.system() == "Windows":
-        buffer = BytesIO()
-        adapters.utils.download_with_progress([(f"{url}#mode=bytes", buffer, None)])
-        ds = xr.open_dataset(buffer, decode_times=decode_times)
-    else:
-        ds = xr.open_dataset(f"{url}", decode_times=decode_times)
-
-    ds = ds.rename_vars({"ID": "id"}).assign_coords({"id": ds.ID}).drop_vars(["ids"])
+    url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/experimental/gdp6h_ragged_sep23.zarr"
+    ds = xr.open_dataset(url, decode_times=decode_times, engine="zarr")
     return ds