From fbdefbdd833d7b09b9bba954e8aa5c50aa12f2f8 Mon Sep 17 00:00:00 2001 From: jbusecke Date: Wed, 5 Apr 2023 13:03:27 -0400 Subject: [PATCH 1/5] stash commit to run local tests on base branch --- tests/conftest.py | 4 ++++ tests/data_generation.py | 8 ++++++-- tests/test_end_to_end.py | 19 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bc100f28..f87511ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -215,6 +215,10 @@ def daily_xarray_dataset_with_coordinateless_dimension(daily_xarray_dataset): del ds["lon"] return ds +# @pytest.fixture(scope="session") +# def daily_xarray_dataset_with_extra_dimension_coordinates(): +# return make_ds(add_extra_dim_coords=True) + @pytest.fixture(scope="session") def netcdf_local_paths_sequential_1d(daily_xarray_dataset, tmpdir_factory): diff --git a/tests/data_generation.py b/tests/data_generation.py index 729d4267..e519918e 100644 --- a/tests/data_generation.py +++ b/tests/data_generation.py @@ -3,12 +3,12 @@ import xarray as xr -def make_ds(nt=10, non_dim_coords=False): +def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False): """Return a synthetic random xarray dataset.""" np.random.seed(2) # TODO: change nt to 11 in order to catch the edge case where # items_per_input does not evenly divide the length of the sequence dimension - ny, nx = 18, 36 + ny, nx, ne = 18, 36, 2 time = pd.date_range(start="2010-01-01", periods=nt, freq="D") lon = (np.arange(nx) + 0.5) * 360 / nx lon_attrs = {"units": "degrees_east", "long_name": "longitude"} @@ -28,6 +28,10 @@ def make_ds(nt=10, non_dim_coords=False): if non_dim_coords: coords["timestep"] = ("time", np.arange(nt)) coords["baz"] = (("lat", "lon"), np.random.rand(ny, nx)) + + if add_extra_dim_coords: + # introduce a coordinate with a dimension not used in the data variables + coords["extra_dim_coord"] = (("extra_dim", "time"), np.random.rand(ne, nt)) ds = xr.Dataset( {"bar": (dims, bar, bar_attrs), "foo": (dims, foo, foo_attrs)}, diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index a7e8a78c..ccad4f14 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -67,3 +67,22 @@ def test_xarray_zarr_subpath( ds = xr.open_dataset(os.path.join(tmp_target_url, "subpath"), engine="zarr") xr.testing.assert_equal(ds.load(), daily_xarray_dataset) + +# from .data_generation import make_ds +# def test_failure_chunk_regions(): +# ds = make_ds(non_dim_coords=True, add_extra_dim_coords=True) +# print(ds) +# assert False + + # # create a dummy dataset similar to https://github.com/pangeo-forge/pangeo-forge-recipes/issues/504 + # nx, ny, nt, nb = 3, 5, 10, 2 + # data = xr.DataArray(np.random.rand(nx, ny, nt), dims=["x", "y", "time"]) + # true_coord = xr.DataArray(np.random.rand(nx, ny), dims=["x", "y"]) + # issue_coord = xr.DataArray(np.random.rand(nt, nb), dims=["time", 'bnds']) + # # ds = xr.Dataset( + # # {'data': data, 'issue_coord': issue_coord}, coords={'true_coord': true_coord} + # # ) + # ds = xr.Dataset({'data': data}, coords={'true_coord': true_coord, 'issue_coord': issue_coord}) + # schema = dataset_to_schema(ds) + # print(determine_target_chunks(schema, specified_chunks={'time': 1, 'x': nx, 'y': ny, 'bnds': nb})) + # print(ds) \ No newline at end of file From 8e57c06450765c98c7e8c977cfa886ae325862d8 Mon Sep 17 00:00:00 2001 From: jbusecke Date: Thu, 6 Apr 2023 14:47:31 -0400 Subject: [PATCH 2/5] Add all new tests --- tests/conftest.py | 57 +++++++++++++++++++++++++++++-- tests/data_generation.py | 9 +++-- tests/test_end_to_end.py | 72 ++++++++++++++++++++++++++++++---------- 3 files changed, 115 insertions(+), 23 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f87511ed..4379a4a7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -203,6 +203,9 @@ def make_http_paths(netcdf_local_paths, request): def daily_xarray_dataset(): return make_ds(nt=10) +@pytest.fixture(scope="session") +def daily_xarray_dataset_cftime(): + return make_ds(nt=10, use_cftime=True) @pytest.fixture(scope="session") def daily_xarray_dataset_with_coordinateless_dimension(daily_xarray_dataset): @@ -215,9 +218,11 @@ def daily_xarray_dataset_with_coordinateless_dimension(daily_xarray_dataset): del ds["lon"] return ds -# @pytest.fixture(scope="session") -# def daily_xarray_dataset_with_extra_dimension_coordinates(): -# return make_ds(add_extra_dim_coords=True) +@pytest.fixture(scope="session") +def daily_xarray_dataset_with_extra_dimension_coordinates(): + ds = make_ds(nt=11, add_extra_dim_coords=True) + ds['extra_dim_var'] = ds['extra_dim_coord'] + return ds @pytest.fixture(scope="session") @@ -226,6 +231,12 @@ def netcdf_local_paths_sequential_1d(daily_xarray_dataset, tmpdir_factory): daily_xarray_dataset, tmpdir_factory, "D", split_up_files_by_day, file_type="netcdf4" ) +@pytest.fixture(scope="session") +def netcdf_local_paths_sequential_1d_cftime(daily_xarray_dataset_cftime, tmpdir_factory): + return make_local_paths( + daily_xarray_dataset, tmpdir_factory, "D", split_up_files_by_day, file_type="netcdf4" + ) + @pytest.fixture(scope="session") def netcdf3_local_paths_sequential_1d(daily_xarray_dataset, tmpdir_factory): @@ -259,6 +270,15 @@ def netcdf_local_paths_sequential_2d(daily_xarray_dataset, tmpdir_factory): def netcdf_local_paths_sequential(request): return request.param +@pytest.fixture(scope="session") +def netcdf_local_paths_sequential_cftime(daily_xarray_dataset_cftime, tmpdir_factory): + return make_local_paths( + daily_xarray_dataset_cftime, + tmpdir_factory, + "D", + split_up_files_by_day, + file_type="netcdf4", + ) @pytest.fixture(scope="session") def netcdf_local_paths_sequential_multivariable_1d(daily_xarray_dataset, tmpdir_factory): @@ -307,6 +327,28 @@ def netcdf_local_paths_sequential_multivariable_with_coordinateless_dimension( file_type="netcdf4", ) +@pytest.fixture(scope='session') +def netcdf_local_paths_sequential_with_extra_dimension_coordinate( + daily_xarray_dataset_with_extra_dimension_coordinates, tmpdir_factory + ): + return make_local_paths( + daily_xarray_dataset_with_extra_dimension_coordinates, + tmpdir_factory, + "D", + split_up_files_by_day, + file_type="netcdf4", + ) + +@pytest.fixture( + scope="session", + params=[ + lazy_fixture("netcdf_local_paths_sequential_with_extra_dimension_coordinate"), + ], +) +def netcdf_local_paths_sequential_extra_dimension_coordinate(request): + return request.param + + @pytest.fixture( scope="session", @@ -384,6 +426,9 @@ def netcdf_local_paths_sequential_with_coordinateless_dimension( def netcdf_local_file_pattern_sequential(netcdf_local_paths_sequential): return make_file_pattern(netcdf_local_paths_sequential) +@pytest.fixture(scope="session") +def netcdf_local_file_pattern_sequential_cftime(netcdf_local_paths_sequential_cftime): + return make_file_pattern(netcdf_local_paths_sequential_cftime) @pytest.fixture(scope="session") def netcdf_local_file_pattern_sequential_multivariable( @@ -422,6 +467,12 @@ def netcdf_local_file_pattern_sequential_with_coordinateless_dimension( """ return make_file_pattern(netcdf_local_paths_sequential_with_coordinateless_dimension) +@pytest.fixture(scope='session') +def netcdf_local_file_pattern_sequential_extra_dimension_coordinate( + netcdf_local_paths_sequential_extra_dimension_coordinate, + ): + return make_file_pattern(netcdf_local_paths_sequential_extra_dimension_coordinate) + # Storage fixtures -------------------------------------------------------------------------------- diff --git a/tests/data_generation.py b/tests/data_generation.py index e519918e..ede86454 100644 --- a/tests/data_generation.py +++ b/tests/data_generation.py @@ -3,13 +3,16 @@ import xarray as xr -def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False): +def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False, use_cftime=False): """Return a synthetic random xarray dataset.""" np.random.seed(2) # TODO: change nt to 11 in order to catch the edge case where # items_per_input does not evenly divide the length of the sequence dimension ny, nx, ne = 18, 36, 2 - time = pd.date_range(start="2010-01-01", periods=nt, freq="D") + if use_cftime: + time = xr.cftime_range(start="2010-01-01", periods=nt, freq="D") + else: + time = pd.date_range(start="2010-01-01", periods=nt, freq="D") lon = (np.arange(nx) + 0.5) * 360 / nx lon_attrs = {"units": "degrees_east", "long_name": "longitude"} lat = (np.arange(ny) + 0.5) * 180 / ny @@ -32,6 +35,7 @@ def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False): if add_extra_dim_coords: # introduce a coordinate with a dimension not used in the data variables coords["extra_dim_coord"] = (("extra_dim", "time"), np.random.rand(ne, nt)) + coords["extra_dim"] = ("extra_dim", np.arange(ne)) ds = xr.Dataset( {"bar": (dims, bar, bar_attrs), "foo": (dims, foo, foo_attrs)}, @@ -41,6 +45,7 @@ def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False): # Add time coord encoding # Remove "%H:%M:%s" as it will be dropped when time is 0:0:0 + # if not use_cftime: ds.time.encoding = { "units": f"days since {time[0].strftime('%Y-%m-%d')}", "calendar": "proleptic_gregorian", diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index ccad4f14..ed05faf7 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -19,7 +19,7 @@ def pipeline(): yield p -@pytest.mark.parametrize("target_chunks", [{"time": 1}, {"time": 2}, {"time": 3}]) +@pytest.mark.parametrize("target_chunks", [{"time": 1}, {"time": 2}, {"time": 3}, {'time':1, 'lon': 18}]) def test_xarray_zarr( daily_xarray_dataset, netcdf_local_file_pattern_sequential, @@ -45,6 +45,31 @@ def test_xarray_zarr( assert ds.time.encoding["chunks"] == (target_chunks["time"],) xr.testing.assert_equal(ds.load(), daily_xarray_dataset) +@pytest.mark.parametrize("target_chunks", [{"time": 1}, {"time": 2}, {"time": 3}]) +def test_xarray_zarr_cftime( + daily_xarray_dataset_cftime, + netcdf_local_file_pattern_sequential_cftime, + pipeline, + tmp_target_url, + target_chunks, +): + pattern = netcdf_local_file_pattern_sequential_cftime + with pipeline as p: + ( + p + | beam.Create(pattern.items()) + | OpenWithXarray(file_type=pattern.file_type) + | StoreToZarr( + target_root=tmp_target_url, + store_name="store", + target_chunks=target_chunks, + combine_dims=pattern.combine_dim_keys, + ) + ) + + ds = xr.open_dataset(os.path.join(tmp_target_url, "store"), engine="zarr", use_cftime=True) + assert ds.time.encoding["chunks"] == (target_chunks["time"],) + xr.testing.assert_equal(ds.load(), daily_xarray_dataset_cftime) def test_xarray_zarr_subpath( daily_xarray_dataset, @@ -68,21 +93,32 @@ def test_xarray_zarr_subpath( ds = xr.open_dataset(os.path.join(tmp_target_url, "subpath"), engine="zarr") xr.testing.assert_equal(ds.load(), daily_xarray_dataset) -# from .data_generation import make_ds -# def test_failure_chunk_regions(): -# ds = make_ds(non_dim_coords=True, add_extra_dim_coords=True) -# print(ds) -# assert False +@pytest.mark.parametrize("target_chunks", [{"time": 1}, {"time": 2}, {"time": 3}]) +def test_xarray_zarr_extra_dimension_coordinate( + daily_xarray_dataset_with_extra_dimension_coordinates, + netcdf_local_file_pattern_sequential_extra_dimension_coordinate, + pipeline, + tmp_target_url, + target_chunks, +): + # triggers https://github.com/pangeo-forge/pangeo-forge-recipes/issues/504 + target_chunks['extra_dim'] = 2 + + pattern = netcdf_local_file_pattern_sequential_extra_dimension_coordinate + + with pipeline as p: + ( + p + | beam.Create(pattern.items()) + | OpenWithXarray(file_type=pattern.file_type) + | StoreToZarr( + target_root=tmp_target_url, + store_name="store", + target_chunks=target_chunks, + combine_dims=pattern.combine_dim_keys, + ) + ) - # # create a dummy dataset similar to https://github.com/pangeo-forge/pangeo-forge-recipes/issues/504 - # nx, ny, nt, nb = 3, 5, 10, 2 - # data = xr.DataArray(np.random.rand(nx, ny, nt), dims=["x", "y", "time"]) - # true_coord = xr.DataArray(np.random.rand(nx, ny), dims=["x", "y"]) - # issue_coord = xr.DataArray(np.random.rand(nt, nb), dims=["time", 'bnds']) - # # ds = xr.Dataset( - # # {'data': data, 'issue_coord': issue_coord}, coords={'true_coord': true_coord} - # # ) - # ds = xr.Dataset({'data': data}, coords={'true_coord': true_coord, 'issue_coord': issue_coord}) - # schema = dataset_to_schema(ds) - # print(determine_target_chunks(schema, specified_chunks={'time': 1, 'x': nx, 'y': ny, 'bnds': nb})) - # print(ds) \ No newline at end of file + ds = xr.open_dataset(os.path.join(tmp_target_url, "store"), engine="zarr") + assert ds.time.encoding["chunks"] == (target_chunks["time"],) + xr.testing.assert_equal(ds.load(), daily_xarray_dataset_with_extra_dimension_coordinates) \ No newline at end of file From cd298065a2896c9d6b10a3c4130682e97e45481a Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Thu, 6 Apr 2023 16:05:37 -0400 Subject: [PATCH 3/5] remove time based tests --- tests/conftest.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4379a4a7..7a75847e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -203,10 +203,6 @@ def make_http_paths(netcdf_local_paths, request): def daily_xarray_dataset(): return make_ds(nt=10) -@pytest.fixture(scope="session") -def daily_xarray_dataset_cftime(): - return make_ds(nt=10, use_cftime=True) - @pytest.fixture(scope="session") def daily_xarray_dataset_with_coordinateless_dimension(daily_xarray_dataset): """ @@ -231,12 +227,6 @@ def netcdf_local_paths_sequential_1d(daily_xarray_dataset, tmpdir_factory): daily_xarray_dataset, tmpdir_factory, "D", split_up_files_by_day, file_type="netcdf4" ) -@pytest.fixture(scope="session") -def netcdf_local_paths_sequential_1d_cftime(daily_xarray_dataset_cftime, tmpdir_factory): - return make_local_paths( - daily_xarray_dataset, tmpdir_factory, "D", split_up_files_by_day, file_type="netcdf4" - ) - @pytest.fixture(scope="session") def netcdf3_local_paths_sequential_1d(daily_xarray_dataset, tmpdir_factory): @@ -270,15 +260,6 @@ def netcdf_local_paths_sequential_2d(daily_xarray_dataset, tmpdir_factory): def netcdf_local_paths_sequential(request): return request.param -@pytest.fixture(scope="session") -def netcdf_local_paths_sequential_cftime(daily_xarray_dataset_cftime, tmpdir_factory): - return make_local_paths( - daily_xarray_dataset_cftime, - tmpdir_factory, - "D", - split_up_files_by_day, - file_type="netcdf4", - ) @pytest.fixture(scope="session") def netcdf_local_paths_sequential_multivariable_1d(daily_xarray_dataset, tmpdir_factory): @@ -426,10 +407,6 @@ def netcdf_local_paths_sequential_with_coordinateless_dimension( def netcdf_local_file_pattern_sequential(netcdf_local_paths_sequential): return make_file_pattern(netcdf_local_paths_sequential) -@pytest.fixture(scope="session") -def netcdf_local_file_pattern_sequential_cftime(netcdf_local_paths_sequential_cftime): - return make_file_pattern(netcdf_local_paths_sequential_cftime) - @pytest.fixture(scope="session") def netcdf_local_file_pattern_sequential_multivariable( netcdf_local_paths_sequential_multivariable, From 1956e20b107af4294962ad76f425f44ea79a868a Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Thu, 6 Apr 2023 16:06:48 -0400 Subject: [PATCH 4/5] remove more cftime stuff --- tests/data_generation.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/data_generation.py b/tests/data_generation.py index ede86454..d7c10e46 100644 --- a/tests/data_generation.py +++ b/tests/data_generation.py @@ -3,16 +3,13 @@ import xarray as xr -def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False, use_cftime=False): +def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False): """Return a synthetic random xarray dataset.""" np.random.seed(2) # TODO: change nt to 11 in order to catch the edge case where # items_per_input does not evenly divide the length of the sequence dimension ny, nx, ne = 18, 36, 2 - if use_cftime: - time = xr.cftime_range(start="2010-01-01", periods=nt, freq="D") - else: - time = pd.date_range(start="2010-01-01", periods=nt, freq="D") + time = pd.date_range(start="2010-01-01", periods=nt, freq="D") lon = (np.arange(nx) + 0.5) * 360 / nx lon_attrs = {"units": "degrees_east", "long_name": "longitude"} lat = (np.arange(ny) + 0.5) * 180 / ny @@ -45,7 +42,6 @@ def make_ds(nt=10, non_dim_coords=False, add_extra_dim_coords=False, use_cftime= # Add time coord encoding # Remove "%H:%M:%s" as it will be dropped when time is 0:0:0 - # if not use_cftime: ds.time.encoding = { "units": f"days since {time[0].strftime('%Y-%m-%d')}", "calendar": "proleptic_gregorian", From d5c7d6ada3ab57c992b10f0d5363cfffbe827769 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Thu, 6 Apr 2023 16:08:23 -0400 Subject: [PATCH 5/5] and even more cftime stuff removed --- tests/test_end_to_end.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py index ed05faf7..54eb9183 100644 --- a/tests/test_end_to_end.py +++ b/tests/test_end_to_end.py @@ -45,32 +45,6 @@ def test_xarray_zarr( assert ds.time.encoding["chunks"] == (target_chunks["time"],) xr.testing.assert_equal(ds.load(), daily_xarray_dataset) -@pytest.mark.parametrize("target_chunks", [{"time": 1}, {"time": 2}, {"time": 3}]) -def test_xarray_zarr_cftime( - daily_xarray_dataset_cftime, - netcdf_local_file_pattern_sequential_cftime, - pipeline, - tmp_target_url, - target_chunks, -): - pattern = netcdf_local_file_pattern_sequential_cftime - with pipeline as p: - ( - p - | beam.Create(pattern.items()) - | OpenWithXarray(file_type=pattern.file_type) - | StoreToZarr( - target_root=tmp_target_url, - store_name="store", - target_chunks=target_chunks, - combine_dims=pattern.combine_dim_keys, - ) - ) - - ds = xr.open_dataset(os.path.join(tmp_target_url, "store"), engine="zarr", use_cftime=True) - assert ds.time.encoding["chunks"] == (target_chunks["time"],) - xr.testing.assert_equal(ds.load(), daily_xarray_dataset_cftime) - def test_xarray_zarr_subpath( daily_xarray_dataset, netcdf_local_file_pattern_sequential, @@ -121,4 +95,4 @@ def test_xarray_zarr_extra_dimension_coordinate( ds = xr.open_dataset(os.path.join(tmp_target_url, "store"), engine="zarr") assert ds.time.encoding["chunks"] == (target_chunks["time"],) - xr.testing.assert_equal(ds.load(), daily_xarray_dataset_with_extra_dimension_coordinates) \ No newline at end of file + xr.testing.assert_equal(ds.load(), daily_xarray_dataset_with_extra_dimension_coordinates)