Convert NetCDF to Zarr
jrbourbeau committed Sep 18, 2024
1 parent 624979c commit 8c6482b
Showing 3 changed files with 95 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python_version: ["3.10"]
-        pytest_args: [tests --ignore=tests/tpch]
+        pytest_args: [tests/geospatial/test_netcdf_to_zarr.py --ignore=tests/tpch]
         extra-env: [""]
         name_prefix: [tests]
         include:
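Note that with this matrix change the default CI entry exercises only the new geospatial benchmark: the job's pytest arguments become tests/geospatial/test_netcdf_to_zarr.py --ignore=tests/tpch, so a local reproduction would invoke pytest with those same arguments (the surrounding workflow steps are not shown in this diff).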
1 change: 1 addition & 0 deletions ci/environment.yml
@@ -48,6 +48,7 @@ dependencies:
 - gilknocker ==0.4.1
 - openssl >1.1.0g
 - rioxarray ==0.17.0
+- h5netcdf ==1.3.0

 ########################################################
 # PLEASE READ:
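The new h5netcdf pin backs the engine="h5netcdf" call in the test below. As a minimal, hedged sketch (the object key here is illustrative and not taken from the diff, and the bucket is assumed to allow anonymous access), reading a single NEX-GDDP-CMIP6 file with that engine looks roughly like:

import s3fs
import xarray as xr

# Anonymous access to the public NEX-GDDP-CMIP6 bucket
s3 = s3fs.S3FileSystem(anon=True)
with s3.open(
    "s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/hurs/"
    "hurs_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc"  # illustrative key
) as f:
    # Requires the h5netcdf package pinned above (plus h5py)
    ds = xr.open_dataset(f, engine="h5netcdf")
    print(ds)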
93 changes: 93 additions & 0 deletions tests/geospatial/test_netcdf_to_zarr.py
@@ -0,0 +1,93 @@
import xarray as xr
from dask.utils import format_bytes


def test_netcdf_to_zarr(
    scale,
    s3,
    s3_url,
    client_factory,
    cluster_kwargs={
        "workspace": "dask-engineering",
        "region": "us-west-2",
        "wait_for_workers": True,
    },
    scale_kwargs={
        "small": {"n_workers": 10},
        "medium": {"n_workers": 100},
        "large": {"n_workers": 200},
    },
):
    with client_factory(
        **scale_kwargs[scale], **cluster_kwargs
    ) as client:  # noqa: F841
        # Define models and variables of interest
        models = [
            "ACCESS-CM2",
            "ACCESS-ESM1-5",
            "CMCC-ESM2",
            "CNRM-CM6-1",
            "CNRM-ESM2-1",
            "CanESM5",
            "EC-Earth3",
            "EC-Earth3-Veg-LR",
            "FGOALS-g3",
            "GFDL-ESM4",
            "GISS-E2-1-G",
            "INM-CM4-8",
            "INM-CM5-0",
            "KACE-1-0-G",
            "MIROC-ES2L",
            "MPI-ESM1-2-HR",
            "MPI-ESM1-2-LR",
            "MRI-ESM2-0",
            "NorESM2-LM",
            "NorESM2-MM",
            "TaiESM1",
            "UKESM1-0-LL",
        ]
        variables = [
            "hurs",
            "huss",
            "pr",
            "rlds",
            "rsds",
            "sfcWind",
            "tas",
            "tasmax",
            "tasmin",
        ]

        if scale == "small":
            # 130 files (152.83 GiB)
            # One model and one variable
            models = models[:1]
            variables = variables[:1]
        elif scale == "medium":
            # 715 files (XX TiB)
            # One model and all variables
            models = models[:1]
        else:
            # 11635 files (XX TiB)
            # All models and variables
            pass

        # Get netCDF data files -- see https://registry.opendata.aws/nex-gddp-cmip6
        # for dataset details.
        file_list = []
        for model in models:
            for variable in variables:
                source_directory = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1f1/{variable}/*.nc"
                file_list += [f"s3://{path}" for path in s3.glob(source_directory)]
        files = [s3.open(f) for f in file_list]
        print(f"Processing {len(files)} NetCDF files")

        ds = xr.open_mfdataset(
            files,
            engine="h5netcdf",
            combine="nested",
            concat_dim="time",
            parallel=True,
        )
        print(f"Converting {format_bytes(ds.nbytes)} from NetCDF to Zarr")
        ds.to_zarr(s3_url)
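After the conversion, a quick way to sanity-check the output is to open the Zarr store that was written to s3_url. This is a hedged sketch rather than part of the test; the URL below is illustrative, and it assumes zarr and s3fs are installed with access configured for the target bucket.

import xarray as xr

# Illustrative target; the test writes to whatever the s3_url fixture points at
ds = xr.open_zarr("s3://example-bucket/netcdf-to-zarr-output.zarr")
print(ds)  # should show the concatenated time dimension and the selected variables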
