More flexible gdp adapter #237

Merged · 11 commits · Aug 29, 2023
5 changes: 4 additions & 1 deletion clouddrift/adapters/gdp.py
@@ -6,10 +6,11 @@
"""

import numpy as np
import os
import pandas as pd
import xarray as xr
import urllib.request
import os
import warnings

GDP_VERSION = "2.00"

@@ -161,6 +162,8 @@ def fetch_netcdf(url: str, file: str):
"""
if not os.path.isfile(file):
urllib.request.urlretrieve(url, file)
else:
        warnings.warn(f"{file} already exists; skipping download.")


def decode_date(t):
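A minimal usage sketch of the updated fetch_netcdf behavior (the URL and destination path below are hypothetical, for illustration only): the first call downloads the file, and a repeat call with the same arguments now warns and reuses the existing copy instead of downloading again.

>>> from clouddrift.adapters import gdp
>>> url = "https://example.com/drifter_101143.nc"  # hypothetical source URL
>>> gdp.fetch_netcdf(url, "/tmp/clouddrift/gdp/drifter_101143.nc")  # downloads the file
>>> gdp.fetch_netcdf(url, "/tmp/clouddrift/gdp/drifter_101143.nc")  # warns that the file exists; download skipped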
22 changes: 16 additions & 6 deletions clouddrift/adapters/gdp1h.py
@@ -48,7 +48,10 @@


def download(
drifter_ids: list = None, n_random_id: int = None, url: str = GDP_DATA_URL
drifter_ids: list = None,
n_random_id: int = None,
url: str = GDP_DATA_URL,
tmp_path: str = GDP_TMP_PATH,
):
"""Download individual NetCDF files from the AOML server.

@@ -60,17 +63,20 @@ def download(
Randomly select n_random_id drifter IDs to download (Default: None)
url : str
URL from which to download the data (Default: GDP_DATA_URL). Alternatively, it can be GDP_DATA_URL_EXPERIMENTAL.
tmp_path : str, optional
Path to the directory where the individual NetCDF files are stored
(default /tmp/clouddrift/gdp)

Returns
-------
out : list
        List of retrieved drifters
"""

print(f"Downloading GDP hourly data to {GDP_TMP_PATH}...")
print(f"Downloading GDP hourly data from {url} to {tmp_path}...")

    # Create a temporary directory if it doesn't already exist.
os.makedirs(GDP_TMP_PATH, exist_ok=True)
os.makedirs(tmp_path, exist_ok=True)

if url == GDP_DATA_URL:
pattern = "drifter_[0-9]*.nc"
@@ -103,7 +109,7 @@
for i in drifter_ids:
file = filename_pattern.format(id=i)
urls.append(os.path.join(url, file))
files.append(os.path.join(GDP_TMP_PATH, file))
files.append(os.path.join(tmp_path, file))

# parallel retrieving of individual netCDF files
list(
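As a hedged usage sketch of the new keyword (the directory path is an assumption, not part of this PR), callers can now redirect the hourly downloads away from the default temporary directory:

>>> from clouddrift.adapters import gdp1h
>>> # download five randomly chosen hourly drifter files into a custom directory
>>> ids = gdp1h.download(n_random_id=5, tmp_path="/data/clouddrift/gdp1h")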
@@ -493,6 +499,7 @@ def to_raggedarray(
drifter_ids: Optional[list[int]] = None,
n_random_id: Optional[int] = None,
url: Optional[str] = GDP_DATA_URL,
tmp_path: Optional[str] = GDP_TMP_PATH,
) -> RaggedArray:
"""Download and process individual GDP hourly files and return a RaggedArray
instance with the data.
@@ -506,6 +513,9 @@
url : str, optional
URL from which to download the data (Default: GDP_DATA_URL).
Alternatively, it can be GDP_DATA_URL_EXPERIMENTAL.
tmp_path : str, optional
Path to the directory where the individual NetCDF files are stored
(default /tmp/clouddrift/gdp)

Returns
-------
@@ -551,7 +561,7 @@ def to_raggedarray(
>>> arr = ra.to_awkward()
>>> arr.to_parquet("gdp1h.parquet")
"""
ids = download(drifter_ids, n_random_id, url)
ids = download(drifter_ids, n_random_id, url, tmp_path)

if url == GDP_DATA_URL:
filename_pattern = "drifter_{id}.nc"
@@ -568,5 +578,5 @@
name_data=GDP_DATA,
rowsize_func=gdp.rowsize,
filename_pattern=filename_pattern,
tmp_path=GDP_TMP_PATH,
tmp_path=tmp_path,
)
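Taken together, a hedged end-to-end sketch for the hourly adapter (the tmp_path value is illustrative): to_raggedarray forwards tmp_path both to download and to RaggedArray.from_files, so the same directory is used for fetching and for reading the files back.

>>> from clouddrift.adapters.gdp1h import to_raggedarray
>>> ra = to_raggedarray(n_random_id=100, tmp_path="/data/clouddrift/gdp1h")
>>> ra.to_awkward().to_parquet("gdp1h.parquet")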
25 changes: 16 additions & 9 deletions clouddrift/adapters/gdp6h.py
@@ -35,7 +35,10 @@


def download(
drifter_ids: list = None, n_random_id: int = None, url: str = GDP_DATA_URL
drifter_ids: list = None,
n_random_id: int = None,
url: str = GDP_DATA_URL,
tmp_path: str = GDP_TMP_PATH,
):
"""Download individual NetCDF files from the AOML server.

@@ -47,17 +50,20 @@ def download(
Randomly select n_random_id drifter IDs to download (Default: None)
url : str
URL from which to download the data (Default: GDP_DATA_URL). Alternatively, it can be GDP_DATA_URL_EXPERIMENTAL.
tmp_path : str, optional
Path to the directory where the individual NetCDF files are stored
(default /tmp/clouddrift/gdp6h)

Returns
-------
out : list
        List of retrieved drifters
"""

print(f"Downloading GDP 6-hourly data to {GDP_TMP_PATH}...")
print(f"Downloading GDP 6-hourly data to {tmp_path}...")

    # Create a temporary directory if it doesn't already exist.
os.makedirs(GDP_TMP_PATH, exist_ok=True)
os.makedirs(tmp_path, exist_ok=True)

pattern = "drifter_[0-9]*.nc"
directory_list = [
@@ -95,10 +101,7 @@
executor.map(
gdp.fetch_netcdf,
drifter_urls,
[
os.path.join(GDP_TMP_PATH, os.path.basename(f))
for f in drifter_urls
],
[os.path.join(tmp_path, os.path.basename(f)) for f in drifter_urls],
),
total=len(drifter_urls),
desc="Downloading files",
@@ -424,6 +427,7 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
def to_raggedarray(
drifter_ids: Optional[list[int]] = None,
n_random_id: Optional[int] = None,
tmp_path: Optional[str] = GDP_TMP_PATH,
) -> RaggedArray:
"""Download and process individual GDP 6-hourly files and return a
RaggedArray instance with the data.
@@ -434,6 +438,9 @@
List of drifters to retrieve (Default: all)
n_random_id : list[int], optional
Randomly select n_random_id drifter NetCDF files
tmp_path : str, optional
Path to the directory where the individual NetCDF files are stored
(default /tmp/clouddrift/gdp6h)

Returns
-------
@@ -473,7 +480,7 @@
>>> arr = ra.to_awkward()
>>> arr.to_parquet("gdp6h.parquet")
"""
ids = download(drifter_ids, n_random_id, GDP_DATA_URL)
ids = download(drifter_ids, n_random_id, GDP_DATA_URL, tmp_path)

return RaggedArray.from_files(
indices=ids,
@@ -483,5 +490,5 @@
name_data=GDP_DATA,
rowsize_func=gdp.rowsize,
filename_pattern="drifter_{id}.nc",
tmp_path=GDP_TMP_PATH,
tmp_path=tmp_path,
)
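And the equivalent hedged sketch for the 6-hourly adapter (the path is again illustrative); note that, per the diff above, gdp6h.to_raggedarray always downloads from GDP_DATA_URL, with only the target directory configurable:

>>> from clouddrift.adapters.gdp6h import to_raggedarray
>>> ra = to_raggedarray(n_random_id=50, tmp_path="/data/clouddrift/gdp6h")
>>> ra.to_awkward().to_parquet("gdp6h.parquet")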