Gdp-update (#282)

* update datasets.gdp1h()
* lint
* move GDP_VERSION
* file pattern change
* fix the fix
* typo
* adjust path with experimental url
* actually I prefer this
* forgot the default value

---------

Co-authored-by: Philippe Miron <philippe.miron@dtn.com>
Cloud-Drift · Oct 4, 2023 · af51004 · af51004
1 parent c71062b
commit af51004
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 27 deletions.
diff --git a/clouddrift/adapters/gdp.py b/clouddrift/adapters/gdp.py
@@ -12,8 +12,6 @@
 import urllib.request
 import warnings
 
-GDP_VERSION = "2.00"
-
 GDP_COORDS = [
     "ids",
     "time",

diff --git a/clouddrift/adapters/gdp1h.py b/clouddrift/adapters/gdp1h.py
@@ -18,12 +18,14 @@
 import warnings
 import xarray as xr
 
+GDP_VERSION = "2.01"
 
-GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/v2.00/netcdf/"
+GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/v2.01/netcdf/"
 GDP_DATA_URL_EXPERIMENTAL = (
     "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental/"
 )
 GDP_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp")
+GDP_TMP_PATH_EXPERIMENTAL = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp_exp")
 GDP_DATA = [
     "lon",
     "lat",
@@ -51,7 +53,7 @@ def download(
     drifter_ids: list = None,
     n_random_id: int = None,
     url: str = GDP_DATA_URL,
-    tmp_path: str = GDP_TMP_PATH,
+    tmp_path: str = None,
 ):
     """Download individual NetCDF files from the AOML server.
 
@@ -70,17 +72,21 @@ def download(
     Returns
     -------
     out : list
-        List of retrived drifters
+        List of retrieved drifters
     """
 
+    # adjust the tmp_path if using the experimental source
+    if tmp_path is None:
+        tmp_path = GDP_TMP_PATH if url == GDP_DATA_URL else GDP_TMP_PATH_EXPERIMENTAL
+
     print(f"Downloading GDP hourly data from {url} to {tmp_path}...")
 
     # Create a temporary directory if it doesn't already exist.
     os.makedirs(tmp_path, exist_ok=True)
 
     if url == GDP_DATA_URL:
-        pattern = "drifter_[0-9]*.nc"
-        filename_pattern = "drifter_{id}.nc"
+        pattern = "drifter_hourly_[0-9]*.nc"
+        filename_pattern = "drifter_hourly_{id}.nc"
     elif url == GDP_DATA_URL_EXPERIMENTAL:
         pattern = "drifter_hourly_[0-9]*.nc"
         filename_pattern = "drifter_hourly_{id}.nc"
@@ -482,7 +488,7 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
     # global attributes
     attrs = {
         "title": "Global Drifter Program hourly drifting buoy collection",
-        "history": f"version {gdp.GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
+        "history": f"version {GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
         "Conventions": "CF-1.6",
         "date_created": datetime.now().isoformat(),
         "publisher_name": "GDP Drifter DAC",
@@ -520,7 +526,7 @@ def to_raggedarray(
     drifter_ids: Optional[list[int]] = None,
     n_random_id: Optional[int] = None,
     url: Optional[str] = GDP_DATA_URL,
-    tmp_path: Optional[str] = GDP_TMP_PATH,
+    tmp_path: Optional[str] = None,
 ) -> RaggedArray:
     """Download and process individual GDP hourly files and return a RaggedArray
     instance with the data.
@@ -547,7 +553,7 @@ def to_raggedarray(
     --------
 
     Invoke `to_raggedarray` without any arguments to download all drifter data
-    from the 2.00 GDP feed:
+    from the 2.01 GDP feed:
 
     >>> from clouddrift.adapters.gdp1h import to_raggedarray
     >>> ra = to_raggedarray()
@@ -582,10 +588,15 @@ def to_raggedarray(
     >>> arr = ra.to_awkward()
     >>> arr.to_parquet("gdp1h.parquet")
     """
+
+    # adjust the tmp_path if using the experimental source
+    if tmp_path is None:
+        tmp_path = GDP_TMP_PATH if url == GDP_DATA_URL else GDP_TMP_PATH_EXPERIMENTAL
+
     ids = download(drifter_ids, n_random_id, url, tmp_path)
 
     if url == GDP_DATA_URL:
-        filename_pattern = "drifter_{id}.nc"
+        filename_pattern = "drifter_hourly_{id}.nc"
     elif url == GDP_DATA_URL_EXPERIMENTAL:
         filename_pattern = "drifter_hourly_{id}.nc"
     else:

diff --git a/clouddrift/datasets.py b/clouddrift/datasets.py
@@ -8,14 +8,12 @@
 
 
 def gdp1h() -> xr.Dataset:
-    """Returns the NOAA Global Drifter Program (GDP) hourly dataset as an Xarray
-    dataset.
+    """Returns the latest version of the NOAA Global Drifter Program (GDP) hourly
+    dataset as an Xarray dataset.
 
-    The data is accessed from a public AWS S3 bucket accessible at
-    https://registry.opendata.aws/noaa-oar-hourly-gdp/. This dataset includes
-    corrections and additional metadata since the original submission of the
-    dataset to NCEI (accessible via https://doi.org/10.25921/x46c-3620). We
-    recommend using this dataset over the one distributed via NCEI.
+    The data is accessed from a zarr archive hosted on a public AWS S3 bucket accessible at
+    https://registry.opendata.aws/noaa-oar-hourly-gdp/. The original data source from NOAA NCEI
+    is https://doi.org/10.25921/x46c-3620.
 
     Returns
     -------
@@ -28,23 +26,21 @@ def gdp1h() -> xr.Dataset:
     >>> ds = gdp1h()
     >>> ds
     <xarray.Dataset>
-    Dimensions:                (traj: 17324, obs: 165754333)
+    Dimensions:                (traj: 19396, obs: 197214787)
     Coordinates:
         ids                    (obs) int64 ...
-        lat                    (obs) float32 ...
-        lon                    (obs) float32 ...
         time                   (obs) datetime64[ns] ...
     Dimensions without coordinates: traj, obs
-    Data variables: (12/55)
+    Data variables: (12/60)
         BuoyTypeManufacturer   (traj) |S20 ...
         BuoyTypeSensorArray    (traj) |S20 ...
-        CurrentProgram         (traj) float64 ...
+        CurrentProgram         (traj) float32 ...
         DeployingCountry       (traj) |S20 ...
         DeployingShip          (traj) |S20 ...
         DeploymentComments     (traj) |S20 ...
         ...                     ...
-        sst1                   (obs) float64 ...
-        sst2                   (obs) float64 ...
+        start_lat              (traj) float32 ...
+        start_lon              (traj) float32 ...
         typebuoy               (traj) |S10 ...
         typedeath              (traj) int8 ...
         ve                     (obs) float32 ...
@@ -54,7 +50,7 @@ def gdp1h() -> xr.Dataset:
         acknowledgement:   Elipot, Shane; Sykulski, Adam; Lumpkin, Rick; Centurio...
         contributor_name:  NOAA Global Drifter Program
         contributor_role:  Data Acquisition Center
-        date_created:      2022-12-09T06:02:29.684949
+        date_created:      2023-09-08T17:05:12.130123
         doi:               10.25921/x46c-3620
         ...                ...
         processing_level:  Level 2 QC by GDP drifter DAC
@@ -68,7 +64,7 @@ def gdp1h() -> xr.Dataset:
     --------
     :func:`gdp6h`
     """
-    url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/gdp_v2.00.zarr"
+    url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/gdp-v2.01.zarr"
     return xr.open_dataset(url, engine="zarr")