Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

subsurface floats dataset #321

Merged
merged 18 commits into from
Nov 15, 2023
1 change: 1 addition & 0 deletions clouddrift/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
import clouddrift.adapters.gdp6h
import clouddrift.adapters.glad
import clouddrift.adapters.mosaic
import clouddrift.adapters.subsurface_floats
202 changes: 202 additions & 0 deletions clouddrift/adapters/subsurface_floats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""
This module defines functions to adapt, as a ragged-array dataset, the collection of
2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats gathered from 52
experiments across the world between 1989 and 2015.

The dataset is hosted at https://www.aoml.noaa.gov/phod/float_traj/index.php

Example
-------
>>> from clouddrift.adapters import subsurface_floats
>>> ds = subsurface_floats.to_xarray()
"""

from datetime import datetime
import numpy as np
import os
import pandas as pd
import scipy.io
import tempfile
import urllib.request
import xarray as xr
import warnings

# Upstream MATLAB file holding all float trajectories.
SUBSURFACE_FLOATS_DATA_URL = "https://www.aoml.noaa.gov/phod/float_traj/files/allFloats_12122017.mat"

# Release label of the upstream file; stored in the dataset's "history" attribute.
SUBSURFACE_FLOATS_VERSION = "December 2017 (version 2)"

# Default directory (under the system temporary directory) for the downloaded file.
SUBSURFACE_FLOATS_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "subsurface_floats")


def download(file: str):
    """Download the upstream subsurface floats file to ``file``.

    If ``file`` already exists, nothing is downloaded and a warning is issued
    instead.

    Parameters
    ----------
    file : str
        Local path where the upstream file is stored.
    """
    if os.path.isfile(file):
        warnings.warn(f"{file} already exists; skip download.")
        return

    print(
        f"Downloading Subsurface float trajectories from {SUBSURFACE_FLOATS_DATA_URL} to {file}..."
    )

    # Download next to the target and rename only once the transfer has
    # completed. Writing directly to ``file`` would leave a truncated file
    # behind after an interrupted transfer, and the existence check above
    # would then treat it as a complete download on every later call.
    partial_file = f"{file}.part"
    try:
        urllib.request.urlretrieve(SUBSURFACE_FLOATS_DATA_URL, partial_file)
        os.replace(partial_file, file)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only removes leftovers from a failed transfer.
        if os.path.exists(partial_file):
            os.remove(partial_file)

Check warning on line 40 in clouddrift/adapters/subsurface_floats.py

View check run for this annotation

Codecov / codecov/patch

clouddrift/adapters/subsurface_floats.py#L40

Added line #L40 was not covered by tests


def to_xarray(
    tmp_path: str = None,
):
    """Return the subsurface floats data as a ragged-array Xarray dataset.

    The upstream MATLAB file is downloaded into ``tmp_path`` (unless a file
    with the same name is already there), read with ``scipy.io.loadmat``, and
    rearranged into per-trajectory variables (dimension ``traj``) and
    per-observation variables (dimension ``obs``).

    Parameters
    ----------
    tmp_path : str, optional
        Directory where the upstream file is stored. Defaults to
        ``SUBSURFACE_FLOATS_TMP_PATH``. The directory is created if needed.

    Returns
    -------
    xarray.Dataset
        Dataset with ``time`` and ``id`` set as coordinates, variable
        attributes, and global attributes.

    Raises
    ------
    ValueError
        If an observation variable does not hold as many values as the sum
        of the per-trajectory row sizes.
    """
    if tmp_path is None:
        tmp_path = SUBSURFACE_FLOATS_TMP_PATH
    # Also create a caller-supplied directory so the download cannot fail on
    # a missing parent folder.
    os.makedirs(tmp_path, exist_ok=True)

    local_file = os.path.join(tmp_path, SUBSURFACE_FLOATS_DATA_URL.split("/")[-1])
    download(local_file)
    source_data = scipy.io.loadmat(local_file)

    # metadata: keep the first element of every cell of each variable
    meta_variables = [
        "expList",
        "expName",
        "expOrg",
        "expPI",
        "fltType",
        "indexExp",
        "indexFlt",
    ]
    metadata = {
        var: np.array([v.flatten()[0] for v in source_data[var].flatten()])
        for var in meta_variables
    }

    # bring the expList to the "traj" dimension
    # NOTE(review): repeating by the counts of the sorted unique indices
    # assumes the floats are grouped by experiment in the same order as
    # expList -- confirm against the upstream file.
    _, float_per_exp = np.unique(metadata["indexExp"], return_counts=True)
    metadata["expList"] = np.repeat(metadata["expList"], float_per_exp)

    # data: concatenate the per-trajectory arrays into one array per variable
    data_variables = ["dtnum", "lon", "lat", "p", "t", "u", "v"]
    data = {
        var: np.concatenate([v.flatten() for v in source_data[var].flatten()])
        for var in data_variables
    }

    # create rowsize variable; ``size`` counts the elements of a trajectory
    # whichever way its array is oriented (``len`` would return 1 for a row
    # vector)
    rowsize = np.array([v.size for v in source_data["dtnum"].flatten()])

    # Explicit check rather than ``assert`` so that it is not stripped when
    # Python runs with -O.
    n_obs = int(np.sum(rowsize))
    for var, values in data.items():
        if len(values) != n_obs:
            raise ValueError(
                f"Variable {var} has {len(values)} observations but the row "
                f"sizes add up to {n_obs}."
            )

    # Unix epoch start (1970-01-01)
    # NOTE(review): 719529 is the MATLAB datenum of that date; this presumes
    # "dtnum" holds MATLAB datenums (days) -- confirm.
    origin_datenum = 719529

    ds = xr.Dataset(
        {
            "expList": (["traj"], metadata["expList"]),
            "expName": (["traj"], metadata["expName"]),
            "expOrg": (["traj"], metadata["expOrg"]),
            "expPI": (["traj"], metadata["expPI"]),
            "indexExp": (["traj"], metadata["indexExp"]),
            "fltType": (["traj"], metadata["fltType"]),
            "id": (["traj"], metadata["indexFlt"]),
            "rowsize": (["traj"], rowsize),
            "time": (
                ["obs"],
                pd.to_datetime(data["dtnum"] - origin_datenum, unit="D"),
            ),
            "lon": (["obs"], data["lon"]),
            "lat": (["obs"], data["lat"]),
            "pres": (["obs"], data["p"]),
            "temp": (["obs"], data["t"]),
            "ve": (["obs"], data["u"]),
            "vn": (["obs"], data["v"]),
        }
    )

    # Cast double floats to singles, except for the positions
    double_vars = ["lat", "lon"]
    for var in [v for v in ds.variables if v not in double_vars]:
        if ds[var].dtype == "float64":
            ds[var] = ds[var].astype("float32")

    # define attributes
    vars_attrs = {
        "expList": {
            "long_name": "Experiment list",
            "units": "-",
        },
        "expName": {
            "long_name": "Experiment name",
            "units": "-",
        },
        "expOrg": {
            "long_name": "Experiment organization",
            "units": "-",
        },
        "expPI": {
            "long_name": "Experiment principal investigator",
            "units": "-",
        },
        "indexExp": {
            "long_name": "Experiment index number",
            "units": "-",
            "comment": "The index matches the float with its experiment metadata",
        },
        "fltType": {
            "long_name": "Float type",
            "units": "-",
        },
        "id": {"long_name": "Float ID", "units": "-"},
        "lon": {
            "long_name": "Longitude",
            "standard_name": "longitude",
            "units": "degrees_east",
        },
        "lat": {
            "long_name": "Latitude",
            "standard_name": "latitude",
            "units": "degrees_north",
        },
        "rowsize": {
            "long_name": "Number of observations per trajectory",
            "sample_dimension": "obs",
            "units": "-",
        },
        "pres": {
            "long_name": "Pressure",
            "standard_name": "sea_water_pressure",
            "units": "dbar",
        },
        "temp": {
            "long_name": "Temperature",
            "standard_name": "sea_water_temperature",
            "units": "degree_C",
        },
        "ve": {
            "long_name": "Eastward velocity",
            "standard_name": "eastward_sea_water_velocity",
            "units": "m s-1",
        },
        "vn": {
            "long_name": "Northward velocity",
            "standard_name": "northward_sea_water_velocity",
            "units": "m s-1",
        },
    }

    # global attributes
    attrs = {
        "title": "Subsurface float trajectories dataset",
        "history": SUBSURFACE_FLOATS_VERSION,
        "date_created": datetime.now().isoformat(),
        "publisher_name": "WOCE Subsurface Float Data Assembly Center and NOAA AOML",
        "publisher_url": "https://www.aoml.noaa.gov/phod/float_traj/data.php",
        "licence": "freely available",
        "acknowledgement": "Maintained by Andree Ramsey and Heather Furey from the Woods Hole Oceanographic Institution",
    }

    # set attributes
    for var, var_attrs in vars_attrs.items():
        if var in ds.keys():
            ds[var].attrs = var_attrs
        else:
            warnings.warn(f"Variable {var} not found in upstream data; skipping.")
    ds.attrs = attrs

    # set coordinates
    ds = ds.set_coords(["time", "id"])

    return ds
102 changes: 95 additions & 7 deletions clouddrift/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,13 @@
* time (obs) datetime64[ns] 2012-07-20T01:15:00.143960 ... 2012-...
* id (traj) object 'CARTHE_001' 'CARTHE_002' ... 'CARTHE_451'
Data variables:
latitude (obs) float32 28.56 28.56 28.56 28.56 ... 26.33 26.33 26.33
longitude (obs) float32 -87.21 -87.21 -87.21 ... -87.09 -87.09 -87.08
position_error (obs) float32 10.0 10.0 10.0 10.0 ... 227.7 228.2 228.6
u (obs) float32 0.023 0.022 0.021 0.021 ... 0.501 0.465 0.425
v (obs) float32 -0.247 -0.23 -0.213 ... -0.268 -0.248 -0.226
velocity_error (obs) float32 0.033 0.033 0.033 0.033 ... 0.033 0.033 0.033
rowsize (traj) int64 7696 1385 2965 3729 ... 1749 1535 3077 2631
latitude (obs) float32 ...
longitude (obs) float32 ...
position_error (obs) float32 ...
u (obs) float32 ...
v (obs) float32 ...
velocity_error (obs) float32 ...
rowsize (traj) int64 ...
Attributes:
title: GLAD experiment CODE-style drifter trajectories (low-pass f...
institution: Consortium for Advanced Research on Transport of Hydrocarbo...
Expand Down Expand Up @@ -253,3 +253,91 @@
else:
ds = xr.open_dataset(mosaic_path)
return ds


def subsurface_floats() -> xr.Dataset:
    """Returns the subsurface floats dataset as an Xarray dataset.

    The function will first look for the ragged-array dataset on the local
    filesystem. If it is not found, the dataset will be downloaded using the
    corresponding adapter function and stored for later access.

    The local copy is stored as ``data/subsurface_floats.nc`` under the
    directory given by the ``CLOUDDRIFT_PATH`` environment variable, or under
    ``~/.clouddrift`` when that variable is unset or empty.

    The upstream data is available at
    https://www.aoml.noaa.gov/phod/float_traj/files/allFloats_12122017.mat.

    This dataset of subsurface float observations was compiled by the WOCE Subsurface
    Float Data Assembly Center (WFDAC) in Woods Hole maintained by Andree Ramsey and
    Heather Furey and copied to NOAA/AOML in October 2014 (version 1) and in December
    2017 (version 2). Subsequent updates will be included as additional appropriate
    float data, quality controlled by the appropriate principal investigators, is
    submitted for inclusion.

    Note that these observations are collected by ALACE/RAFOS/Eurofloat-style
    acoustically-tracked, neutrally-buoyant subsurface floats which collect data while
    drifting beneath the ocean surface. These data are the result of the effort and
    resources of many individuals and institutions. You are encouraged to acknowledge
    the work of the data originators and Data Centers in publications arising from use
    of these data.

    The float data were originally divided by project at the WFDAC. Here they have been
    compiled in a single Matlab data set. See the upstream website listed under
    References for more information on the variables contained in these files.

    Returns
    -------
    xarray.Dataset
        Subsurface floats dataset as a ragged array

    Examples
    --------
    >>> from clouddrift.datasets import subsurface_floats
    >>> ds = subsurface_floats()
    >>> ds
    <xarray.Dataset>
    Dimensions:   (traj: 2193, obs: 1402840)
    Coordinates:
        id        (traj) uint16 ...
        time      (obs) datetime64[ns] ...
    Dimensions without coordinates: traj, obs
    Data variables: (12/13)
        expList   (traj) object ...
        expName   (traj) object ...
        expOrg    (traj) object ...
        expPI     (traj) object ...
        indexExp  (traj) uint8 ...
        fltType   (traj) object ...
        ...        ...
        lon       (obs) float64 ...
        lat       (obs) float64 ...
        pres      (obs) float32 ...
        temp      (obs) float32 ...
        ve        (obs) float32 ...
        vn        (obs) float32 ...
    Attributes:
        title:            Subsurface float trajectories dataset
        history:          December 2017 (version 2)
        date_created:     2023-11-14T22:30:38.831656
        publisher_name:   WOCE Subsurface Float Data Assembly Center and NOAA AOML
        publisher_url:    https://www.aoml.noaa.gov/phod/float_traj/data.php
        licence:          freely available
        acknowledgement:  Maintained by Andree Ramsey and Heather Furey from the ...

    References
    ----------
    WOCE Subsurface Float Data Assembly Center (WFDAC) https://www.aoml.noaa.gov/phod/float_traj/index.php
    """
    # An unset or empty CLOUDDRIFT_PATH falls back to the per-user default.
    clouddrift_path = os.getenv("CLOUDDRIFT_PATH") or os.path.expanduser(
        "~/.clouddrift"
    )
    local_file = os.path.join(clouddrift_path, "data", "subsurface_floats.nc")

    if os.path.exists(local_file):
        return xr.open_dataset(local_file)

    print(f"{local_file} not found; download from upstream repository.")
    ds = adapters.subsurface_floats.to_xarray()
    os.makedirs(os.path.dirname(local_file), exist_ok=True)
    ds.to_netcdf(local_file)
    return ds
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Auto-generated summary of CloudDrift's API. For more details and examples, refer
adapters.gdp6h
adapters.glad
adapters.mosaic
adapters.subsurface_floats
datasets
kinematics
ragged
Expand Down
10 changes: 7 additions & 3 deletions docs/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ Currently available datasets are:
- :func:`clouddrift.datasets.mosaic`: MOSAiC sea-ice drift dataset as a ragged
array processed from the upstream dataset hosted at the
`NSF's Arctic Data Center <https://doi.org/10.18739/A2KP7TS83>`_.
- :func:`clouddrift.datasets.subsurface_floats`: The subsurface float trajectories dataset as
hosted by
`NOAA's Atlantic Oceanographic and Meteorological Laboratory (AOML) <https://www.aoml.noaa.gov/phod/float_traj/index.php>`_
and maintained by Andree Ramsey and Heather Furey from the Woods Hole Oceanographic Institution.

The GDP datasets are accessed lazily, so the data is only downloaded when
specific array values are referenced. The GLAD and MOSAiC datasets are
downloaded in their entirety when the function is called for the first time and
stored locally for later use.
specific array values are referenced. The GLAD, MOSAiC, and Subsurface floats
datasets are downloaded in their entirety when the function is called for the first
time and stored locally for later use.
4 changes: 4 additions & 0 deletions tests/datasets_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,7 @@ def test_glad_subset_and_apply_ragged_work(self):
self.assertTrue(ds_sub)
mean_lon = apply_ragged(np.mean, [ds_sub.longitude], ds_sub.rowsize)
self.assertTrue(mean_lon.size == 2)

def test_subsurface_floats_opens(self):
    """The subsurface floats dataset can be obtained and evaluates as true."""
    floats = datasets.subsurface_floats()
    self.assertTrue(floats)