diff --git a/pyaerocom/io/cams2_83/reader.py b/pyaerocom/io/cams2_83/reader.py index 375c47e4b..d57338263 100644 --- a/pyaerocom/io/cams2_83/reader.py +++ b/pyaerocom/io/cams2_83/reader.py @@ -14,6 +14,7 @@ from pyaerocom import const from pyaerocom.griddeddata import GriddedData from pyaerocom.io.cams2_83.models import ModelData, ModelName, RunType +from pyaerocom.io.gridded_reader import GriddedReader # from pyaerocom.units_helpers import UALIASES @@ -220,7 +221,7 @@ def check_files(paths: list[Path]) -> list[Path]: return new_paths -class ReadCAMS2_83: +class ReadCAMS2_83(GriddedReader): FREQ_CODES = dict(hour="hourly", day="daily", month="monthly", fullrun="yearly") REVERSE_FREQ_CODES = {val: key for key, val in FREQ_CODES.items()} @@ -291,6 +292,20 @@ def data_id(self, val): self.model = ModelName[model] self.forecast_day = int(day) + @property + def years_avail(self): + return np.unique( + self.daterange.values.astype("datetime64[Y]").astype("int") + 1970 + ).astype("str") + + @property + def ts_types(self): + return self.REVERSE_FREQ_CODES.keys() + + @property + def vars_provided(self): + return AEROCOM_NAMES.values() + @property def run_type(self): if self._run_type is None: @@ -443,6 +458,11 @@ def read_var(self, var_name: str, ts_type: str | None = None, **kwargs) -> Gridd data_id = "CAMS2-83.EMEP.day0.AN" reader = ReadCAMS2_83(data_dir=data_dir, data_id=data_id) reader.daterange = ("2021-12-01", "2021-12-04") + print( + np.unique(reader.daterange.values.astype("datetime64[Y]").astype("int") + 1970).astype( + "str" + ) + ) print(reader.filepaths) # dates = ("2021-12-01", "2021-12-04") diff --git a/pyaerocom/io/gridded_reader.py b/pyaerocom/io/gridded_reader.py new file mode 100644 index 000000000..38d839d58 --- /dev/null +++ b/pyaerocom/io/gridded_reader.py @@ -0,0 +1,82 @@ +import abc +from typing import Iterator + +from pyaerocom.griddeddata import GriddedData + + +class GriddedReader(abc.ABC): + """Abstract base class for gridded model reader used 
for collocation""" + + @property + @abc.abstractmethod + def data_id(self) -> str: + """ + Data ID of dataset + """ + pass + + @property + @abc.abstractmethod + def ts_types(self) -> Iterator[str]: + """ + List of available frequencies + + Raises + ------ + AttributeError + if :attr:`data_dir` is not set. + + Returns + ------- + list + list of available frequencies + + """ + pass + + @property + @abc.abstractmethod + def years_avail(self) -> Iterator[str]: + """ + Years available in dataset + """ + pass + + @property + @abc.abstractmethod + def vars_provided(self) -> Iterator[str]: + """Variables provided by this dataset""" + pass + + @abc.abstractmethod + def has_var(self, var_name) -> bool: + """Check if variable is supported + + Parameters + ---------- + var_name : str + variable to be checked + + Returns + ------- + bool + """ + pass + + @abc.abstractmethod + def read_var(self, var_name, ts_type=None, **kwargs) -> GriddedData: + """Load data for given variable. + + Parameters + ---------- + var_name : str + Variable to be read + ts_type : str + Temporal resolution of data to read. Supported are + "hourly", "daily", "monthly" , "yearly". 
+ + Returns + ------- + GriddedData + """ + pass diff --git a/pyaerocom/io/mscw_ctm/reader.py b/pyaerocom/io/mscw_ctm/reader.py index 3f07b2262..d4da15b04 100755 --- a/pyaerocom/io/mscw_ctm/reader.py +++ b/pyaerocom/io/mscw_ctm/reader.py @@ -4,6 +4,7 @@ import os import re import warnings +from collections import namedtuple import numpy as np import xarray as xr @@ -11,6 +12,7 @@ from pyaerocom import const from pyaerocom.exceptions import VarNotAvailableError from pyaerocom.griddeddata import GriddedData +from pyaerocom.io.gridded_reader import GriddedReader from pyaerocom.projection_information import ProjectionInformation from pyaerocom.units_helpers import UALIASES @@ -46,7 +48,7 @@ logger = logging.getLogger(__name__) -class ReadMscwCtm: +class ReadMscwCtm(GriddedReader): """ Class for reading model output from the EMEP MSC-W chemical transport model. @@ -175,15 +177,17 @@ class ReadMscwCtm: #: pattern for 4-digit years for 19XX and 20XX used for trend subdirectories YEAR_PATTERN = r".*((?:19|20)\d\d).*" + class _PrivateFields: + filename: str | None = None + filedata: xr.Dataset | None = None + filepaths: list[str] | None = None + files: list[str] | None = None + data_dir: str | None = None + def __init__(self, data_id=None, data_dir=None, **kwargs): - self._data_dir = None # opened dataset (for performance boost), will be reset if data_dir is # changed - self._filename = None - self._filedata = None - self._filepaths = None - - self._files = None + self._private = self._PrivateFields() self.var_map = emep_variables() if "emep_vars" in kwargs: @@ -197,14 +201,14 @@ def __init__(self, data_id=None, data_dir=None, **kwargs): if not isinstance(data_dir, str) or not os.path.exists(data_dir): raise FileNotFoundError(f"{data_dir}") - self.data_dir = data_dir + self._data_dir = data_dir - self.data_id = data_id - self._filename = self.DEFAULT_FILE_NAME + self._data_id = data_id + self._private.filename = self.DEFAULT_FILE_NAME - def search_all_files(self): + def 
_search_all_files(self): folders = self._get_trend_folders_from_folder() - self.filepaths = self._get_files_from_folders(folders) + self._filepaths = self._get_files_from_folders(folders) def _get_files_from_folders(self, folders): files = [] @@ -235,7 +239,7 @@ def _get_trend_folders_from_folder(self): List of the names of the subfolder """ - dd = self.data_dir + dd = self._data_dir mscwfiles = [] for freq in self.FREQ_CODES.keys(): @@ -278,7 +282,7 @@ def _get_yrs_from_filepaths(self) -> list[str]: :return: list of years as str """ - fps = self.filepaths + fps = self._filepaths yrs = [] for fp in fps: try: @@ -332,67 +336,71 @@ def _clean_filepaths(self, filepaths, yrs, ts_type): return [d for _, d in sorted(zip(found_yrs, clean_paths))] @property - def data_dir(self): + def data_id(self) -> str | None: + return self._data_id + + @property + def _data_dir(self) -> str | None: """ Directory containing netcdf files """ - if self._data_dir is None: + if self._private.data_dir is None: raise AttributeError(f"data_dir needs to be set before accessing") - return self._data_dir + return self._private.data_dir - @data_dir.setter - def data_dir(self, val): + @_data_dir.setter + def _data_dir(self, val: str): if val is None: raise ValueError(f"Data dir {val} needs to be a dictionary or a file") if not os.path.isdir(val): raise FileNotFoundError(val) - self._data_dir = val - self._filedata = None - self.search_all_files() - self._files = self.filepaths + self._private.data_dir = val + self._private.filedata = None + self._search_all_files() + self._private.files = self._filepaths @property - def filename(self): + def _filename(self) -> str | None: """ Name of latest netcdf file read """ - return self._filename + return self._private.filename - @filename.setter - def filename(self, val): + @_filename.setter + def _filename(self, val): """ Name of netcdf file """ if not isinstance(val, str): # pragma: no cover raise ValueError("needs str") - elif val == self._filename: + elif 
val == self._private.filename: return - self._filename = val - self._filedata = None + self._private.filename = val + self._private.filedata = None @property - def filepaths(self): + def _filepaths(self) -> list[str]: """ - Path to data file + Paths to data file """ - if self.data_dir is None and self._filepaths is None: # pragma: no cover + if self._data_dir is None and self._private.filepaths is None: # pragma: no cover raise AttributeError("data_dir or filepaths needs to be set before accessing") - return self._filepaths + return self._private.filepaths - @filepaths.setter - def filepaths(self, value): + @_filepaths.setter + def _filepaths(self, value: list[str]): if not isinstance(value, list): # pragma: no cover raise ValueError("needs to be list of strings") - self._filepaths = value + self._private.filepaths = value @property - def filedata(self): + def _filedata(self) -> xr.Dataset: """ Loaded netcdf file (:class:`xarray.Dataset`) """ - if self._filedata is None: - self.open_file() - return self._filedata + if self._private.filedata is None: + self._open_file() + return self._private.filedata def _check_files_in_data_dir(self, data_dir): """ @@ -431,7 +439,7 @@ def _check_files_in_data_dir(self, data_dir): return matches @property - def ts_type(self): + def _ts_type(self): """ Frequency of time dimension of current data file @@ -446,7 +454,7 @@ def ts_type(self): current ts_type. 
""" - return self.ts_type_from_filename(self.filename) + return self._ts_type_from_filename(self._filename) @property def ts_types(self): @@ -464,11 +472,11 @@ def ts_types(self): list of available frequencies """ - if not isinstance(self._files, list): + if not isinstance(self._private.files, list): raise AttributeError("please set data_dir first") tsts = [] - for file in self._files: - tsts.append(self.ts_type_from_filename(file)) + for file in self._private.files: + tsts.append(self._ts_type_from_filename(file)) return list(set(tsts)) @property @@ -476,7 +484,6 @@ def years_avail(self): """ Years available in loaded dataset """ - data = self.filepaths years = self._get_yrs_from_filepaths() years = list(np.unique(years)) @@ -487,7 +494,7 @@ def vars_provided(self): """Variables provided by this dataset""" return list(self.var_map) + list(self.AUX_REQUIRES) - def open_file(self): + def _open_file(self): """ Open current netcdf file @@ -497,19 +504,19 @@ def open_file(self): Dict with years as keys and Datasets as items """ - fps = self.filepaths + fps = self._filepaths ds = {} yrs = self._get_yrs_from_filepaths() - ts_type = self.ts_type_from_filename(self.filename) + ts_type = self._ts_type_from_filename(self._filename) fps = self._clean_filepaths(fps, yrs, ts_type) if len(fps) > 1 and ts_type == "hourly": raise ValueError(f"ts_type {ts_type} can not be hourly when using multiple years") logger.info(f"Opening {fps}") ds = xr.open_mfdataset(fps, chunks={"time": 24}) - self._filedata = ds + self._private.filedata = ds return ds @@ -536,7 +543,7 @@ def has_var(self, var_name): return True return False - def ts_type_from_filename(self, filename): + def _ts_type_from_filename(self, filename): """ Get ts_type from filename @@ -559,7 +566,7 @@ def ts_type_from_filename(self, filename): return tstype raise ValueError(f"Failed to retrieve ts_type from filename {filename}") - def filename_from_ts_type(self, ts_type): + def _filename_from_ts_type(self, ts_type): """ Infer 
file name of data based on input ts_type @@ -671,16 +678,16 @@ def read_var(self, var_name, ts_type=None, **kwargs): var = const.VARS[var_name] var_name_aerocom = var.var_name_aerocom - if self.data_dir is None: # pragma: no cover + if self._data_dir is None: # pragma: no cover raise ValueError("data_dir must be set before reading.") - elif self.filename is None and ts_type is None: # pragma: no cover + elif self._filename is None and ts_type is None: # pragma: no cover raise ValueError("please specify ts_type") elif ts_type is not None: # filename and ts_type are set. update filename if ts_type suggests # that current file has different resolution - self.filename = self.filename_from_ts_type(ts_type) + self._filename = self._filename_from_ts_type(ts_type) - ts_type = self.ts_type + ts_type = self._ts_type arr, proj_info = self._load_var(var_name_aerocom, ts_type) if arr.units in UALIASES: @@ -707,8 +714,8 @@ def read_var(self, var_name, ts_type=None, **kwargs): # At this point a GriddedData object with name gridded should exist - gridded.metadata["data_id"] = self.data_id - gridded.metadata["from_files"] = self.filepaths + gridded.metadata["data_id"] = self._data_id + gridded.metadata["from_files"] = self._filepaths # Remove unneccessary metadata. Better way to do this? 
for metadata in ["current_date_first", "current_date_last"]: @@ -745,22 +752,22 @@ def _read_var_from_file(self, var_name_aerocom, ts_type): emep_var = self.var_map[var_name_aerocom] try: - filedata = self.filedata + filedata = self._filedata data = filedata[emep_var] proj_info = ProjectionInformation.from_xarray(filedata, emep_var) except KeyError: raise VarNotAvailableError( - f"{var_name_aerocom} ({emep_var}) not available in {self.filename}" + f"{var_name_aerocom} ({emep_var}) not available in {self._filename}" ) data.attrs["long_name"] = var_name_aerocom data.time.attrs["long_name"] = "time" data.time.attrs["standard_name"] = "time" prefix = emep_var.split("_")[0] - data.attrs["units"] = self.preprocess_units(data.units, prefix) + data.attrs["units"] = self._preprocess_units(data.units, prefix) return data, proj_info @staticmethod - def preprocess_units(units, prefix): + def _preprocess_units(units, prefix): """ Update units for certain variables diff --git a/pyaerocom/io/readgridded.py b/pyaerocom/io/readgridded.py index a514a7283..84773f9de 100755 --- a/pyaerocom/io/readgridded.py +++ b/pyaerocom/io/readgridded.py @@ -45,6 +45,7 @@ subtract_cubes, ) from pyaerocom.io.fileconventions import FileConventionRead +from pyaerocom.io.gridded_reader import GriddedReader from pyaerocom.io.helpers import add_file_to_log from pyaerocom.io.iris_io import concatenate_iris_cubes, load_cubes_custom from pyaerocom.metastandards import AerocomDataID @@ -54,7 +55,7 @@ logger = logging.getLogger(__name__) -class ReadGridded: +class ReadGridded(GriddedReader): """Class for reading gridded files using AeroCom file conventions Attributes diff --git a/pyaerocom/io/uemep/__init__.py b/pyaerocom/io/uemep/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pyaerocom/io/uemep/uemep_variables.ini b/pyaerocom/io/uemep/uemep_variables.ini new file mode 100644 index 000000000..8c701f52a --- /dev/null +++ b/pyaerocom/io/uemep/uemep_variables.ini @@ -0,0 +1,6 @@ 
+[uemep_variables] +concno2 = no2_concentration +conco3 = o3_concentration +concpm25 = pm25_concentration +concpm10 = pm10_concentration +prmm = precipitation diff --git a/tests/colocation/test_colocator.py b/tests/colocation/test_colocator.py index 1431ffd7d..27ba371ca 100644 --- a/tests/colocation/test_colocator.py +++ b/tests/colocation/test_colocator.py @@ -371,7 +371,7 @@ def test_colocator_instantiate_gridded_reader_model_data_dir(setup, path_emep): col = Colocator(col_stp) r = col._instantiate_gridded_reader(what="model") assert isinstance(r, ReadMscwCtm) - assert r.data_dir == model_data_dir + assert r._data_dir == model_data_dir assert r.data_id == model_id diff --git a/tests/io/mscw_ctm/test_reader.py b/tests/io/mscw_ctm/test_reader.py index 55db3d747..944409e66 100644 --- a/tests/io/mscw_ctm/test_reader.py +++ b/tests/io/mscw_ctm/test_reader.py @@ -148,7 +148,7 @@ def test_ReadMscwCtm__get_year_from_nc(data_dir: str): def test_ReadMscwCtm__init__(data_dir: str): reader = ReadMscwCtm("EMEP_2017", data_dir) assert getattr(reader, "data_id") == "EMEP_2017" - assert getattr(reader, "data_dir") == data_dir + assert getattr(reader, "_data_dir") == data_dir def test_ReadMscwCtm__init___error(): @@ -160,8 +160,8 @@ def test_ReadMscwCtm__init___error(): def test_ReadMscwCtm_data_dir(data_dir: str): reader = ReadMscwCtm() - reader.data_dir = data_dir - assert Path(reader.data_dir) == Path(data_dir) + reader._data_dir = data_dir + assert Path(reader._data_dir) == Path(data_dir) @pytest.mark.parametrize( @@ -174,7 +174,7 @@ def test_ReadMscwCtm_data_dir(data_dir: str): def test_ReadMscwCtm_data_dir_error(value, exception, error: str): reader = ReadMscwCtm(value) with pytest.raises(exception) as e: - reader.data_dir = value + reader._data_dir = value assert str(e.value) == error @@ -192,7 +192,7 @@ def test__ReadMscwCtm__check_files_in_data_dir_error(): def test_ReadMscwCtm_ts_type(): reader = ReadMscwCtm() - assert reader.ts_type == "daily" + assert reader._ts_type 
== "daily" def test_ReadMscwCtm_var_map(): @@ -211,7 +211,7 @@ def test_ReadMscwCtm_read_var(var_name: str, ts_type: str, data_dir: str): if ts_type is not None: assert data.ts_type == ts_type assert data.ts_type is not None - assert data.ts_type == reader.ts_type + assert data.ts_type == reader._ts_type @pytest.mark.parametrize( @@ -276,12 +276,12 @@ def test_ReadMscwCtm_data(data_dir: str): def test_ReadMscwCtm_directory(data_dir: str): reader = ReadMscwCtm(data_dir=data_dir) - assert reader.data_dir == data_dir + assert reader._data_dir == data_dir vars_provided = reader.vars_provided assert "vmro3" in vars_provided assert "concpm10" in vars_provided assert "concno2" in vars_provided - paths = reader.filepaths + paths = reader._filepaths assert len(paths) == 3 @@ -295,12 +295,12 @@ def test_ReadMscwCtm_directory(data_dir: str): ], ) def test_ReadMscwCtm_ts_type_from_filename(reader, filename, ts_type): - assert reader.ts_type_from_filename(filename) == ts_type + assert reader._ts_type_from_filename(filename) == ts_type def test_ReadMscwCtm_ts_type_from_filename_error(reader): with pytest.raises(ValueError) as e: - reader.ts_type_from_filename("blaaa") + reader._ts_type_from_filename("blaaa") assert str(e.value) == "Failed to retrieve ts_type from filename blaaa" @@ -314,12 +314,12 @@ def test_ReadMscwCtm_ts_type_from_filename_error(reader): ], ) def test_ReadMscwCtm_filename_from_ts_type(reader, filename, ts_type): - assert reader.filename_from_ts_type(ts_type) == filename + assert reader._filename_from_ts_type(ts_type) == filename def test_ReadMscwCtm_filename_from_ts_type_error(reader): with pytest.raises(ValueError) as e: - reader.filename_from_ts_type("blaaa") + reader._filename_from_ts_type("blaaa") assert str(e.value) == "unknown ts_type=blaaa" @@ -331,15 +331,15 @@ def test_ReadMscwCtm_years_avail(data_dir: str): def test_ReadMscwCtm_preprocess_units(): units = "" prefix = "AOD" - assert ReadMscwCtm().preprocess_units(units, prefix) == "1" + assert 
ReadMscwCtm()._preprocess_units(units, prefix) == "1" def test_ReadMscwCtm_open_file(data_dir: str): reader = ReadMscwCtm() with pytest.raises(AttributeError): - reader.open_file() - reader.data_dir = data_dir - data = reader.open_file() + reader._open_file() + reader._data_dir = data_dir + data = reader._open_file() assert isinstance(data, xr.Dataset) assert reader._filedata is data @@ -490,7 +490,7 @@ def test_read_emep_clean_filepaths(data_path: Path, year, years: list[int], freq reader = ReadMscwCtm(data_dir=str(data_path / year)) tst = reader.FREQ_CODES[freq] - filepaths = reader.filepaths + filepaths = reader._filepaths cleaned_paths = reader._clean_filepaths(filepaths, years, tst) assert len(cleaned_paths) == len(years) @@ -517,7 +517,7 @@ def test_read_emep_clean_filepaths(data_path: Path, year, years: list[int], freq def test_read_emep_clean_filepaths_error(data_path: Path, years, freq: str, error: str): reader = ReadMscwCtm(data_dir=str(data_path)) tst = reader.FREQ_CODES[freq] - filepaths = reader.filepaths + filepaths = reader._filepaths with pytest.raises(ValueError) as e: reader._clean_filepaths(filepaths, years, tst) @@ -535,7 +535,7 @@ def test_read_emep_clean_filepaths_error(data_path: Path, years, freq: str, erro def test_read_emep_wrong_filenames(data_path: Path, freq: str, wrong_name: str): reader = ReadMscwCtm(data_dir=str(data_path)) tst = reader.FREQ_CODES[freq] - filepaths = reader.filepaths + filepaths = reader._filepaths years = reader._get_yrs_from_filepaths() cleaned_paths = reader._clean_filepaths(filepaths, years, tst) @@ -559,7 +559,7 @@ def test_read_emep_wrong_filenames(data_path: Path, freq: str, wrong_name: str): def test_read_emep_wrong_tst(data_path: Path, wrong_tst: str): reader = ReadMscwCtm(data_dir=str(data_path)) with pytest.raises(ValueError) as e: - filepaths = reader.filepaths + filepaths = reader._filepaths wrong_path = Path(filepaths[0]).with_name(f"Base_{wrong_tst}.nc") reader._get_tst_from_file(str(wrong_path)) @@ 
-570,7 +570,7 @@ def test_read_emep_LF_tst(tmp_path: Path): data_path = emep_data_path(tmp_path, "month", vars_and_units={"prmm": "mm"}) reader = ReadMscwCtm(data_dir=str(data_path)) with pytest.raises(ValueError) as e: - filepaths = reader.filepaths + filepaths = reader._filepaths wrong_path = Path(filepaths[0]).with_name(f"Base_LF_month.nc") reader._get_tst_from_file(str(wrong_path)) @@ -580,7 +580,7 @@ def test_read_emep_LF_tst(tmp_path: Path): def test_read_emep_year_defined_twice(tmp_path: Path): data_path = emep_data_path(tmp_path, "day", vars_and_units={"prmm": "mm"}) reader = ReadMscwCtm(data_dir=str(data_path)) - filepaths = reader.filepaths + filepaths = reader._filepaths wrong_path = Path(filepaths[0]).with_name(f"Base_day.nc") filepaths.append(str(wrong_path)) new_yrs = reader._get_yrs_from_filepaths() @@ -600,8 +600,8 @@ def test_read_emep_year_defined_twice(tmp_path: Path): @pytest.mark.parametrize("vars_and_units", [{"prmm": "mm"}]) def test_read_emep_multiple_dirs(data_path: Path, year: str, freq: str, num: int): reader = ReadMscwCtm(data_dir=str(data_path / year)) - assert len(reader.filepaths) == num - assert all(freq in Path(file).stem for file in reader.filepaths) + assert len(reader._filepaths) == num + assert all(freq in Path(file).stem for file in reader._filepaths) tst = reader.FREQ_CODES[freq] data = reader.read_var("prmm", ts_type=tst) @@ -628,15 +628,15 @@ def test_read_emep_multiple_dirs_hour_error(tmp_path: Path): def test_search_all_files(data_path: Path, year: str, num: int): reader = ReadMscwCtm() with pytest.raises(AttributeError): - reader.search_all_files() + reader._search_all_files() with pytest.raises(AttributeError): - reader.filepaths + reader._filepaths reader._data_dir = str(data_path / year) - reader.search_all_files() - assert len(reader.filepaths) == num + reader._search_all_files() + assert len(reader._filepaths) == num @pytest.mark.parametrize( @@ -654,7 +654,7 @@ def test_ts_types(data_path: Path, year: str, freq: 
list[str]): with pytest.raises(AttributeError): reader.ts_types - reader.data_dir = str(data_path / year) + reader._data_dir = str(data_path / year) ts_types = reader.ts_types assert len(ts_types) == len(freq)