diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 69d483925d..5345b189b8 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python_version: ["3.10"]
-        pytest_args: [tests --ignore=tests/tpch]
+        pytest_args: [tests/geospatial/test_netcdf_to_zarr.py --ignore=tests/tpch]
         extra-env: [""]
         name_prefix: [tests]
         include:
diff --git a/ci/environment.yml b/ci/environment.yml
index 68f2e7177e..eb1031506d 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -48,6 +48,7 @@ dependencies:
   - gilknocker ==0.4.1
   - openssl >1.1.0g
   - rioxarray ==0.17.0
+  - h5netcdf ==1.3.0
 
 ########################################################
 # PLEASE READ:
diff --git a/tests/geospatial/test_netcdf_to_zarr.py b/tests/geospatial/test_netcdf_to_zarr.py
new file mode 100644
index 0000000000..0ab73b9388
--- /dev/null
+++ b/tests/geospatial/test_netcdf_to_zarr.py
@@ -0,0 +1,93 @@
+import xarray as xr
+from dask.utils import format_bytes
+
+
+def test_netcdf_to_zarr(
+    scale,
+    s3,
+    s3_url,
+    client_factory,
+    cluster_kwargs={
+        "workspace": "dask-engineering",
+        "region": "us-west-2",
+        "wait_for_workers": True,
+    },
+    scale_kwargs={
+        "small": {"n_workers": 10},
+        "medium": {"n_workers": 100},
+        "large": {"n_workers": 200},
+    },
+):
+    with client_factory(
+        **scale_kwargs[scale], **cluster_kwargs
+    ) as client:  # noqa: F841
+        # Define models and variables of interest
+        models = [
+            "ACCESS-CM2",
+            "ACCESS-ESM1-5",
+            "CMCC-ESM2",
+            "CNRM-CM6-1",
+            "CNRM-ESM2-1",
+            "CanESM5",
+            "EC-Earth3",
+            "EC-Earth3-Veg-LR",
+            "FGOALS-g3",
+            "GFDL-ESM4",
+            "GISS-E2-1-G",
+            "INM-CM4-8",
+            "INM-CM5-0",
+            "KACE-1-0-G",
+            "MIROC-ES2L",
+            "MPI-ESM1-2-HR",
+            "MPI-ESM1-2-LR",
+            "MRI-ESM2-0",
+            "NorESM2-LM",
+            "NorESM2-MM",
+            "TaiESM1",
+            "UKESM1-0-LL",
+        ]
+        variables = [
+            "hurs",
+            "huss",
+            "pr",
+            "rlds",
+            "rsds",
+            "sfcWind",
+            "tas",
+            "tasmax",
+            "tasmin",
+        ]
+
+        if scale == "small":
+            # 130 files (152.83 GiB)
+            # One model and one variable
+            models = models[:1]
+            variables = variables[:1]
+        elif scale == "medium":
+            # 715 files (XX TiB)
+            # One model and all variables
+            models = models[:1]
+        else:
+            # 11635 files (XX TiB)
+            # All models and variables
+            pass
+
+        # Get netCDF data files -- see https://registry.opendata.aws/nex-gddp-cmip6
+        # for dataset details.
+        file_list = []
+        for model in models:
+            for variable in variables:
+                source_directory = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1f1/{variable}/*.nc"
+                file_list += [f"s3://{path}" for path in s3.glob(source_directory)]
+        files = [s3.open(f) for f in file_list]
+        print(f"Processing {len(files)} NetCDF files")
+
+        ds = xr.open_mfdataset(
+            files,
+            engine="h5netcdf",
+            combine="nested",
+            concat_dim="time",
+            parallel=True,
+        )
+        print(f"Converting {format_bytes(ds.nbytes)} from NetCDF to Zarr")
+        ds.to_zarr(s3_url)
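
Running the new benchmark locally mirrors the updated CI matrix entry (a sketch, assuming AWS credentials for the buckets involved and the `scale`, `s3`, `s3_url`, and `client_factory` fixtures supplied by the surrounding geospatial test suite):

    pytest tests/geospatial/test_netcdf_to_zarr.py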
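A quick read-back of the Zarr output is one way to sanity-check the conversion (a hypothetical snippet, not part of this diff; `s3_url` is the same store location the test wrote to):

    import xarray as xr

    # Lazily open the Zarr store produced by ds.to_zarr(s3_url) and confirm
    # the NEX-GDDP-CMIP6 dimensions (time/lat/lon) survived the conversion.
    ds = xr.open_zarr(s3_url)
    print(ds.sizes)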