Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdp-update #282

Merged
merged 14 commits into from
Oct 4, 2023
2 changes: 0 additions & 2 deletions clouddrift/adapters/gdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
import urllib.request
import warnings

GDP_VERSION = "2.00"

GDP_COORDS = [
"ids",
"time",
Expand Down
29 changes: 20 additions & 9 deletions clouddrift/adapters/gdp1h.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
import warnings
import xarray as xr

GDP_VERSION = "2.01"

GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/v2.00/netcdf/"
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/v2.01/netcdf/"
GDP_DATA_URL_EXPERIMENTAL = (
"https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental/"
)
GDP_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp")
GDP_TMP_PATH_EXPERIMENTAL = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp_exp")
GDP_DATA = [
"lon",
"lat",
Expand Down Expand Up @@ -51,7 +53,7 @@ def download(
drifter_ids: list = None,
n_random_id: int = None,
url: str = GDP_DATA_URL,
tmp_path: str = GDP_TMP_PATH,
tmp_path: str = None,
):
"""Download individual NetCDF files from the AOML server.

Expand All @@ -70,17 +72,21 @@ def download(
Returns
-------
out : list
List of retrived drifters
List of retrieved drifters
"""

# adjust the tmp_path if using the experimental source
if tmp_path is None:
tmp_path = GDP_TMP_PATH if url == GDP_DATA_URL else GDP_TMP_PATH_EXPERIMENTAL

print(f"Downloading GDP hourly data from {url} to {tmp_path}...")

# Create a temporary directory if it doesn't already exist.
os.makedirs(tmp_path, exist_ok=True)

if url == GDP_DATA_URL:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not useful anymore but should we keep it to bring awareness of some possible changes upstream?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kept it in case there is eventually a difference between the two datasets. If we are sure it is not going to change, I would remove it.

pattern = "drifter_[0-9]*.nc"
filename_pattern = "drifter_{id}.nc"
pattern = "drifter_hourly_[0-9]*.nc"
filename_pattern = "drifter_hourly_{id}.nc"
elif url == GDP_DATA_URL_EXPERIMENTAL:
pattern = "drifter_hourly_[0-9]*.nc"
filename_pattern = "drifter_hourly_{id}.nc"
Expand Down Expand Up @@ -482,7 +488,7 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
# global attributes
attrs = {
"title": "Global Drifter Program hourly drifting buoy collection",
"history": f"version {gdp.GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
"history": f"version {GDP_VERSION}. Metadata from dirall.dat and deplog.dat",
"Conventions": "CF-1.6",
"date_created": datetime.now().isoformat(),
"publisher_name": "GDP Drifter DAC",
Expand Down Expand Up @@ -520,7 +526,7 @@ def to_raggedarray(
drifter_ids: Optional[list[int]] = None,
n_random_id: Optional[int] = None,
url: Optional[str] = GDP_DATA_URL,
tmp_path: Optional[str] = GDP_TMP_PATH,
tmp_path: Optional[str] = None,
) -> RaggedArray:
"""Download and process individual GDP hourly files and return a RaggedArray
instance with the data.
Expand All @@ -547,7 +553,7 @@ def to_raggedarray(
--------

Invoke `to_raggedarray` without any arguments to download all drifter data
from the 2.00 GDP feed:
from the 2.01 GDP feed:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make the docstring automatically include the GDP_VERSION variable here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's possible. The doc is also generated from the source code and it would have to be interpreted to get the value.


>>> from clouddrift.adapters.gdp1h import to_raggedarray
>>> ra = to_raggedarray()
Expand Down Expand Up @@ -582,10 +588,15 @@ def to_raggedarray(
>>> arr = ra.to_awkward()
>>> arr.to_parquet("gdp1h.parquet")
"""

# adjust the tmp_path if using the experimental source
if tmp_path is None:
tmp_path = GDP_TMP_PATH if url == GDP_DATA_URL else GDP_TMP_PATH_EXPERIMENTAL

ids = download(drifter_ids, n_random_id, url, tmp_path)

if url == GDP_DATA_URL:
filename_pattern = "drifter_{id}.nc"
filename_pattern = "drifter_hourly_{id}.nc"
elif url == GDP_DATA_URL_EXPERIMENTAL:
filename_pattern = "drifter_hourly_{id}.nc"
else:
Expand Down
28 changes: 12 additions & 16 deletions clouddrift/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@


def gdp1h() -> xr.Dataset:
"""Returns the NOAA Global Drifter Program (GDP) hourly dataset as an Xarray
dataset.
"""Returns the latest version of the NOAA Global Drifter Program (GDP) hourly
dataset as an Xarray dataset.

The data is accessed from a public AWS S3 bucket accessible at
https://registry.opendata.aws/noaa-oar-hourly-gdp/. This dataset includes
corrections and additional metadata since the original submission of the
dataset to NCEI (accessible via https://doi.org/10.25921/x46c-3620). We
recommend using this dataset over the one distributed via NCEI.
The data is accessed from a zarr archive hosted on a public AWS S3 bucket accessible at
https://registry.opendata.aws/noaa-oar-hourly-gdp/. The original data source from NOAA NCEI
is https://doi.org/10.25921/x46c-3620.

Returns
-------
Expand All @@ -28,23 +26,21 @@ def gdp1h() -> xr.Dataset:
>>> ds = gdp1h()
>>> ds
<xarray.Dataset>
Dimensions: (traj: 17324, obs: 165754333)
Dimensions: (traj: 19396, obs: 197214787)
Coordinates:
ids (obs) int64 ...
lat (obs) float32 ...
lon (obs) float32 ...
time (obs) datetime64[ns] ...
Dimensions without coordinates: traj, obs
Data variables: (12/55)
Data variables: (12/60)
BuoyTypeManufacturer (traj) |S20 ...
BuoyTypeSensorArray (traj) |S20 ...
CurrentProgram (traj) float64 ...
CurrentProgram (traj) float32 ...
DeployingCountry (traj) |S20 ...
DeployingShip (traj) |S20 ...
DeploymentComments (traj) |S20 ...
... ...
sst1 (obs) float64 ...
sst2 (obs) float64 ...
start_lat (traj) float32 ...
start_lon (traj) float32 ...
typebuoy (traj) |S10 ...
typedeath (traj) int8 ...
ve (obs) float32 ...
Expand All @@ -54,7 +50,7 @@ def gdp1h() -> xr.Dataset:
acknowledgement: Elipot, Shane; Sykulski, Adam; Lumpkin, Rick; Centurio...
contributor_name: NOAA Global Drifter Program
contributor_role: Data Acquisition Center
date_created: 2022-12-09T06:02:29.684949
date_created: 2023-09-08T17:05:12.130123
doi: 10.25921/x46c-3620
... ...
processing_level: Level 2 QC by GDP drifter DAC
Expand All @@ -68,7 +64,7 @@ def gdp1h() -> xr.Dataset:
--------
:func:`gdp6h`
"""
url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/gdp_v2.00.zarr"
url = "https://noaa-oar-hourly-gdp-pds.s3.amazonaws.com/latest/gdp-v2.01.zarr"
return xr.open_dataset(url, engine="zarr")


Expand Down