From 7a9bdf690304f380bea10a928fc663ab4188b896 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 23 Sep 2024 17:15:54 +0200 Subject: [PATCH 01/17] feat: add era5 download modules --- openhexa/toolbox/era5/__init__.py | 15 ++ openhexa/toolbox/era5/cds.py | 142 +++++++++++ openhexa/toolbox/era5/google.py | 121 +++++++++ openhexa/toolbox/era5/variables.json | 352 +++++++++++++++++++++++++++ 4 files changed, 630 insertions(+) create mode 100644 openhexa/toolbox/era5/__init__.py create mode 100644 openhexa/toolbox/era5/cds.py create mode 100644 openhexa/toolbox/era5/google.py create mode 100644 openhexa/toolbox/era5/variables.json diff --git a/openhexa/toolbox/era5/__init__.py b/openhexa/toolbox/era5/__init__.py new file mode 100644 index 0000000..71dee32 --- /dev/null +++ b/openhexa/toolbox/era5/__init__.py @@ -0,0 +1,15 @@ +import logging + +import cdsapi + +logging.basicConfig(level=logging.DEBUG, format="%(name)s %(asctime)s %(levelname)s %(message)s") +log = logging.getLogger(__name__) + +BASE_URL = "https://cds-beta.climate.copernicus.eu/api" + + +class ERA5: + def __init__(self, key: str): + self.client = cdsapi.Client( + key=key, + ) diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py new file mode 100644 index 0000000..eef6db2 --- /dev/null +++ b/openhexa/toolbox/era5/cds.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import importlib.resources +import json +import logging +import shutil +import tempfile +from calendar import monthrange +from datetime import datetime +from pathlib import Path + +import cads_api_client +import cdsapi + +with importlib.resources.open_text("openhexa.toolbox.era5", "variables.json") as f: + VARIABLES = json.load(f) + +DATASET = "reanalysis-era5-land" + +logging.basicConfig(level=logging.DEBUG, format="%(name)s %(asctime)s %(levelname)s %(message)s") +log = logging.getLogger(__name__) + +URL = "https://cds-beta.climate.copernicus.eu/api" + + +class ParameterError(ValueError): + pass + + +class 
Client: + def __init__(self, key: str): + self.client = cdsapi.Client(url=URL, key=key, wait_until_complete=True, quiet=True, progress=False) + self.cads_api_client = cads_api_client.ApiClient(key=key, url=URL) + + def latest(self) -> datetime: + """Get date of latest available product.""" + collection = self.cads_api_client.collection(DATASET) + _, end = collection.json["extent"]["temporal"]["interval"][0] + end = datetime.strptime(end, "%Y-%m-%dT00:00:00Z") + return end + + @staticmethod + def build_request( + variable: str, + year: int, + month: int, + days: list[int] = None, + time: list[str] = None, + data_format: str = "grib", + area: list[float] = None, + ) -> dict: + """Build request payload. + + Parameters + ---------- + variable : str + Climate data store variable name (ex: "2m_temperature"). + year : int + Year of interest. + month : int + Month of interest. + days : list[int] + Days of interest. Defauls to None (all days). + time : list[str] + Hours of interest (ex: ["01:00", "06:00", "18:00"]). Defaults to None (all hours). + data_format : str + Output data format ("grib" or "netcdf"). Defaults to "grib". + area : list[float] + Area of interest (north, west, south, east). Defaults to None (world). + + Returns + ------- + dict + Request payload. + + Raises + ------ + ParameterError + Request parameters are not valid. 
+ """ + if variable not in VARIABLES: + raise ParameterError("Variable %s not supported", variable) + + if data_format not in ["grib", "netcdf"]: + raise ParameterError("Data format %s not supported", data_format) + + if area: + n, w, s, e = area + if ((abs(n) > 90) or (abs(s) > 90)) or ((abs(w) > 180) or (abs(e) > 180)): + raise ParameterError("Invalid area of interest") + if (n < s) or (e < w): + raise ParameterError("Invalid area of interest") + + if not days: + dmax = monthrange(year, month)[1] + days = [day for day in range(1, dmax + 1)] + + if not time: + time = [f"{hour:02}:00" for hour in range(0, 24)] + + year = str(year) + month = f"{month:02}" + days = [f"{day:02}" for day in days] + + payload = { + "variable": [variable], + "year": year, + "month": month, + "day": days, + "time": time, + "data_format": data_format, + } + + if area: + payload["area"] = area + + return payload + + def download(self, request: dict, dst_file: str | Path, overwrite: bool = False): + """Download Era5 product. + + Parameters + ---------- + request : dict + Request payload as returned by the build_request() method. + dst_file : Path + Output file path. + overwrite : bool, optional + Overwrite existing file (default=False). 
+ """ + dst_file = Path(dst_file) + dst_file.parent.mkdir(parents=True, exist_ok=True) + + if dst_file.exists() and not overwrite: + log.debug("File %s already exists, skipping download", str(dst_file.absolute())) + return + + with tempfile.NamedTemporaryFile() as tmp: + self.client.retrieve(name=DATASET, request=request, target=tmp.name) + shutil.move(tmp.name, dst_file) + + log.debug("Downloaded Era5 product to %s", str(dst_file.absolute())) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py new file mode 100644 index 0000000..3f4347f --- /dev/null +++ b/openhexa/toolbox/era5/google.py @@ -0,0 +1,121 @@ +"""Download raw historical Era5 products from Google Cloud: +https://console.cloud.google.com/storage/browser/gcp-public-data-arco-era5 + +Products are provided as raw NetCDF files and are usually available with a ~3 month lag. +""" + +from __future__ import annotations + +import importlib.resources +import json +import logging +import shutil +import tempfile +from datetime import datetime +from functools import cached_property +from pathlib import Path + +import requests +from google.cloud import storage + +with importlib.resources.open_text("openhexa.toolbox.era5", "variables.json") as f: + VARIABLES = json.load(f) + +logging.basicConfig(level=logging.DEBUG, format="%(name)s %(asctime)s %(levelname)s %(message)s") +log = logging.getLogger(__name__) + + +class NotFoundError(Exception): + pass + + +class ParameterError(ValueError): + pass + + +class Client: + def __init__(self): + self.client = storage.Client.create_anonymous_client() + self.bucket = self.client.bucket("gcp-public-data-arco-era5") + + @staticmethod + def prefix(variable: str, date: datetime) -> str: + """Build key prefix for a given product.""" + return f"raw/date-variable-single_level/{date.year}/{date.month:02}/{date.day:02}/{variable}/surface.nc" + + def _subdirs(self, prefix: str) -> list[str]: + """List subdirs.""" + blobs = self.client.list_blobs(self.bucket, 
prefix=prefix, delimiter="/") + prefixes = [] + for page in blobs.pages: + prefixes += page.prefixes + return prefixes + + @cached_property + def latest(self) -> datetime: + """Get date of latest available product.""" + root = "raw/date-variable-single_level/" + subdirs = self._subdirs(root) # years + subdirs = self._subdirs(max(subdirs)) # months + subdirs = self._subdirs(max(subdirs)) # days + subdir = max(subdirs).split("/") + year = int(subdir[-4]) + month = int(subdir[-3]) + day = int(subdir[-2]) + return datetime(year, month, day) + + def find(self, variable: str, date: datetime) -> str | None: + """Find public URL of product. Return None if not found.""" + prefix = self.prefix(variable, date) + blobs = self.client.list_blobs(self.bucket, prefix=prefix, max_results=1) + blobs = list(blobs) + if blobs: + return blobs[0].public_url + else: + return None + + def download(self, variable: str, date: datetime, dst_file: str | Path, overwrite=False): + """Download an Era5 NetCDF product for a given day. + + Parameters + ---------- + variable : str + Climate data store variable name (ex: "2m_temperature"). + date : datetime + Product date (year, month, day). + dst_file : str | Path + Output file. + overwrite : bool, optional + Overwrite existing file (default=False). + + Raises + ------ + ParameterError + Product request parameters are invalid. + NotFoundError + Product not found in bucket. 
+ """ + dst_file = Path(dst_file) + dst_file.parent.mkdir(parents=True, exist_ok=True) + + if dst_file.exists() and not overwrite: + log.debug("Skipping download of %s because file already exists", str(dst_file.absolute())) + return + + if variable not in VARIABLES: + raise ParameterError("%s is not a valid climate data store variable name", variable) + + url = self.find(variable, date) + if not url: + raise NotFoundError("%s product not found for date %s", variable, date.strftime("%Y-%m-%d")) + + with tempfile.NamedTemporaryFile() as tmp: + with open(tmp.name, "wb") as f: + with requests.get(url, stream=True) as r: + for chunk in r.iter_content(chunk_size=1024**2): + if chunk: + f.write(chunk) + + shutil.move(tmp.name, dst_file) + + log.debug("Downloaded %s", str(dst_file.absolute())) diff --git a/openhexa/toolbox/era5/variables.json b/openhexa/toolbox/era5/variables.json new file mode 100644 index 0000000..fbbd894 --- /dev/null +++ b/openhexa/toolbox/era5/variables.json @@ -0,0 +1,352 @@ +{ + "lake_mix_layer_temperature": { + "name": "Lake mix-layer temperature", + "shortname": "lmlt", + "units": "K", + "grib1": true, + "grib2": false + }, + "lake_mix_layer_depth": { + "name": "Lake mix-layer depth", + "shortname": "lmld", + "units": "m", + "grib1": true, + "grib2": false + }, + "lake_bottom_temperature": { + "name": "Lake bottom temperature", + "shortname": "lblt", + "units": "K", + "grib1": true, + "grib2": false + }, + "lake_total_layer_temperature": { + "name": "Lake total layer temperature", + "shortname": "ltlt", + "units": "K", + "grib1": true, + "grib2": false + }, + "lake_shape_factor": { + "name": "Lake shape factor", + "shortname": "lshf", + "units": "dimensionless", + "grib1": true, + "grib2": false + }, + "lake_ice_temperature": { + "name": "Lake ice temperature", + "shortname": "lict", + "units": "K", + "grib1": true, + "grib2": false + }, + "lake_ice_depth": { + "name": "Lake ice depth", + "shortname": "licd", + "units": "m", + "grib1": true, + 
"grib2": false + }, + "snow_cover": { + "name": "Snow cover", + "shortname": "snowc", + "units": "%", + "grib1": false, + "grib2": true + }, + "snow_depth": { + "name": "Snow depth", + "shortname": "sde", + "units": "m", + "grib1": false, + "grib2": true + }, + "snow_albedo": { + "name": "Snow albedo", + "shortname": "asn", + "units": "(0 - 1)", + "grib1": true, + "grib2": false + }, + "snow_density": { + "name": "Snow density", + "shortname": "rsn", + "units": "kg m**-3", + "grib1": true, + "grib2": false + }, + "volumetric_soil_water_layer_1": { + "name": "Volumetric soil water layer 11", + "shortname": "swvl1", + "units": "m**3 m**-3", + "grib1": true, + "grib2": false + }, + "volumetric_soil_water_layer_2": { + "name": "Volumetric soil water layer 21", + "shortname": "swvl2", + "units": "m**3 m**-3", + "grib1": true, + "grib2": false + }, + "volumetric_soil_water_layer_3": { + "name": "Volumetric soil water layer 31", + "shortname": "swvl3", + "units": "m**3 m**-3", + "grib1": true, + "grib2": false + }, + "volumetric_soil_water_layer_4": { + "name": "Volumetric soil water layer 41", + "shortname": "swvl4", + "units": "m**3 m**-3", + "grib1": true, + "grib2": false + }, + "leaf_area_index_low_vegetation": { + "name": "Leaf area index, low vegetation2", + "shortname": "lai_lv", + "units": "m**2 m**-2", + "grib1": true, + "grib2": false + }, + "leaf_area_index_high_vegetation": { + "name": "Leaf area index, high vegetation2", + "shortname": "lai_hv", + "units": "m**2 m**-2", + "grib1": true, + "grib2": false + }, + "surface_pressure": { + "name": "Surface pressure", + "shortname": "sp", + "units": "Pa", + "grib1": true, + "grib2": false + }, + "soil_temperature_level_1": { + "name": "Soil temperature level 11", + "shortname": "stl1", + "units": "K", + "grib1": true, + "grib2": false + }, + "snow_depth_water_equivalent": { + "name": "Snow depth water equivalent", + "shortname": "sd", + "units": "m of water equivalent", + "grib1": true, + "grib2": false + }, + 
"10m_u_component_of_wind": { + "name": "10 metre U wind component", + "shortname": "u10", + "units": "m s**-1", + "grib1": true, + "grib2": false + }, + "10m_v_component_of_wind": { + "name": "10 metre V wind component", + "shortname": "v10", + "units": "m s**-1", + "grib1": true, + "grib2": false + }, + "2m_temperature": { + "name": "2 metre temperature", + "shortname": "2t", + "units": "K", + "grib1": true, + "grib2": false + }, + "2m_dewpoint_temperature": { + "name": "2 metre dewpoint temperature", + "shortname": "2d", + "units": "K", + "grib1": true, + "grib2": false + }, + "soil_temperature_level_2": { + "name": "Soil temperature level 21", + "shortname": "stl2", + "units": "K", + "grib1": true, + "grib2": false + }, + "soil_temperature_level_3": { + "name": "Soil temperature level 31", + "shortname": "stl3", + "units": "K", + "grib1": true, + "grib2": false + }, + "skin_reservoir_content": { + "name": "Skin reservoir content", + "shortname": "src", + "units": "m of water equivalent", + "grib1": false, + "grib2": false + }, + "skin_temperature": { + "name": "Skin temperature", + "shortname": "skt", + "units": "K", + "grib1": true, + "grib2": false + }, + "soil_temperature_level_4": { + "name": "Soil temperature level 41", + "shortname": "stl4", + "units": "K", + "grib1": true, + "grib2": false + }, + "temperature_of_snow_layer": { + "name": "Temperature of snow layer", + "shortname": "tsn", + "units": "K", + "grib1": true, + "grib2": false + }, + "forecast_albedo": { + "name": "Forecast albedo", + "shortname": "fal", + "units": "(0 - 1)", + "grib1": true, + "grib2": false + }, + "surface_runoff": { + "name": "Surface runoff", + "shortname": "sro", + "units": "m", + "grib1": true, + "grib2": false + }, + "sub_surface_runoff": { + "name": "Sub-surface runoff", + "shortname": "ssro", + "units": "m", + "grib1": true, + "grib2": false + }, + "\u00a0snow_evaporation": { + "name": "Snow evaporation", + "shortname": "es", + "units": "m of water equivalent", + 
"grib1": true, + "grib2": false + }, + "snowmelt": { + "name": "Snowmelt", + "shortname": "smlt", + "units": "m of water equivalent", + "grib1": true, + "grib2": false + }, + "snowfall": { + "name": "Snowfall", + "shortname": "sf", + "units": "m of water equivalent", + "grib1": true, + "grib2": false + }, + "surface_sensible_heat_flux": { + "name": "Surface sensible heat flux", + "shortname": "sshf", + "units": "J m**-2", + "grib1": true, + "grib2": false + }, + "surface_latent_heat_flux": { + "name": "Surface latent heat flux", + "shortname": "slhf", + "units": "J m**-2", + "grib1": true, + "grib2": false + }, + "surface_solar_radiation_downwards": { + "name": "Surface solar radiation downwards", + "shortname": "ssrd", + "units": "J m**-2", + "grib1": true, + "grib2": false + }, + "surface_thermal_radiation_downwards": { + "name": "Surface thermal radiation downwards", + "shortname": "strd", + "units": "J m**-2", + "grib1": true, + "grib2": false + }, + "surface_net_solar_radiation": { + "name": "Surface net solar radiation", + "shortname": "ssr", + "units": "J m**-2", + "grib1": true, + "grib2": false + }, + "surface_net_thermal_radiation": { + "name": "Surface net thermal radiation", + "shortname": "str", + "units": "J m**-2", + "grib1": true, + "grib2": false + }, + "total_evaporation": { + "name": "Total Evaporation", + "shortname": "e", + "units": "m of water equivalent", + "grib1": true, + "grib2": false + }, + "runoff": { + "name": "Runoff", + "shortname": "ro", + "units": "m", + "grib1": true, + "grib2": false + }, + "total_precipitation": { + "name": "Total precipitation", + "shortname": "tp", + "units": "m", + "grib1": true, + "grib2": false + }, + "evaporation_from_the_top_of_canopy": { + "name": "Evaporation from the top of canopy", + "shortname": "evatc", + "units": "m of water equivalent", + "grib1": false, + "grib2": true + }, + "evaporation_from_bare_soil": { + "name": "Evaporation from bare soil", + "shortname": "evabs", + "units": "m of water 
equivalent", + "grib1": false, + "grib2": true + }, + "evaporation_from_open_water_surfaces_excluding_oceans": { + "name": "Evaporation from open water surfaces excluding oceans", + "shortname": "evaow", + "units": "m of water equivalent", + "grib1": false, + "grib2": true + }, + "evaporation_from_vegetation_transpiration": { + "name": "Evaporation from vegetation transpiration", + "shortname": "evavt", + "units": "m of water equivalent", + "grib1": false, + "grib2": true + }, + "potential_evaporation": { + "name": "Potential evaporation", + "shortname": "pev", + "units": "m", + "grib1": true, + "grib2": false + } +} \ No newline at end of file From 1c2e4298ea0eb2cb509a0bced0cbf7de62e36f49 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 23 Sep 2024 18:30:42 +0200 Subject: [PATCH 02/17] feat: add era5 sync function --- openhexa/toolbox/era5/google.py | 40 ++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index 3f4347f..37978eb 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -11,7 +11,7 @@ import logging import shutil import tempfile -from datetime import datetime +from datetime import datetime, timedelta from functools import cached_property from pathlib import Path @@ -119,3 +119,41 @@ def download(self, variable: str, date: datetime, dst_file: str | Path, overwrit shutil.move(tmp.name, dst_file) log.debug("Downloaded %s", str(dst_file.absolute())) + + def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: str | Path): + """Download all products for a given variable and date range. + + If products are already present in the destination directory, they will be skipped. + Expects file names to be formatted as "YYYY-MM-DD_VARIABLE.nc". + + Parameters + ---------- + variable : str + Climate data store variable name (ex: "2m_temperature"). 
+ start_date : datetime + Start date (year, month, day). + end_date : datetime + End date (year, month, day). + dst_dir : str | Path + Output directory. + """ + dst_dir = Path(dst_dir) + dst_dir.mkdir(parents=True, exist_ok=True) + + date = start_date + if end_date > self.latest: + log.info("End date is in the future, setting it to the latest available date: %s", self.latest) + end_date = self.latest + + while date <= end_date: + expected_filename = f"{date.strftime('%Y-%m-%d')}_{variable}.nc" + fpath = Path(dst_dir, expected_filename) + fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib")) + if fpath.exists() or fpath_grib.exists(): + log.debug("%s already exists, skipping download", expected_filename) + continue + else: + self.download(variable, date, fpath, overwrite=False) + log.debug("Downloaded %s", expected_filename) + + date += timedelta(days=1) From 0cc44728bc7c4ef798694a520d2292eaaa31d703 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 23 Sep 2024 18:40:15 +0200 Subject: [PATCH 03/17] fix: update package dependencies --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ee548c4..d3ee45f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,9 @@ dependencies = [ "geopandas", "polars", "diskcache", - "pyjwt" + "pyjwt", + "cdsapi >=0.7.3", + "cads-api-client >=1.4.0", ] [project.optional-dependencies] From a92097f2d5a3c6468500b96663960297301809d7 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 23 Sep 2024 19:05:41 +0200 Subject: [PATCH 04/17] fix: add json file as package data --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d3ee45f..92410c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,10 +40,16 @@ dev = [ "responses", ] +[tool.setuptools] +include-package-data = true + [tool.setuptools.packages.find] where = ["."] namespaces = true +[tool.setuptools.package-data] 
+"openhexa.toolbox.era5" = ["*.json"] + [project.urls] "Homepage" = "https://github.com/blsq/openhexa-toolbox" "Bug Tracker" = "https://github.com/blsq/openhexa-toolbox/issues" From c1c3bae9bc34284af1146a75d67c02c7ac6620a3 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Tue, 24 Sep 2024 17:18:56 +0200 Subject: [PATCH 05/17] fix: do not move tmp file before closing --- openhexa/toolbox/era5/cds.py | 2 +- openhexa/toolbox/era5/google.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py index eef6db2..46e1323 100644 --- a/openhexa/toolbox/era5/cds.py +++ b/openhexa/toolbox/era5/cds.py @@ -137,6 +137,6 @@ def download(self, request: dict, dst_file: str | Path, overwrite: bool = False) with tempfile.NamedTemporaryFile() as tmp: self.client.retrieve(name=DATASET, request=request, target=tmp.name) - shutil.move(tmp.name, dst_file) + shutil.copy(tmp.name, dst_file) log.debug("Downloaded Era5 product to %s", str(dst_file.absolute())) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index 37978eb..871b9df 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -116,7 +116,7 @@ def download(self, variable: str, date: datetime, dst_file: str | Path, overwrit if chunk: f.write(chunk) - shutil.move(tmp.name, dst_file) + shutil.copy(tmp.name, dst_file) log.debug("Downloaded %s", str(dst_file.absolute())) From 56c6a32c6292aec3719822bfaf4dcd17c5d10daa Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Wed, 25 Sep 2024 10:45:51 +0200 Subject: [PATCH 06/17] fix: infinite loop in sync function --- openhexa/toolbox/era5/google.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index 871b9df..dec2ab4 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -151,9 +151,8 @@ def sync(self, variable: str, start_date: datetime, end_date: 
datetime, dst_dir: fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib")) if fpath.exists() or fpath_grib.exists(): log.debug("%s already exists, skipping download", expected_filename) - continue + date += timedelta(days=1) else: self.download(variable, date, fpath, overwrite=False) log.debug("Downloaded %s", expected_filename) - - date += timedelta(days=1) + date += timedelta(days=1) From 5b982e655b0e7181962b604dd73fc8ec15257b3e Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Wed, 25 Sep 2024 10:51:04 +0200 Subject: [PATCH 07/17] fix: better log message --- openhexa/toolbox/era5/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index dec2ab4..82d71b4 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -142,7 +142,7 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: date = start_date if end_date > self.latest: - log.info("End date is in the future, setting it to the latest available date: %s", self.latest) + log.info("Setting `end_date` to the latest available date: %s", self.latest) end_date = self.latest while date <= end_date: From f0a9b68e89031a20f73480ba0d44e0fde8d8f8a6 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Wed, 25 Sep 2024 11:00:19 +0200 Subject: [PATCH 08/17] fix: better log message --- openhexa/toolbox/era5/google.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index 82d71b4..1799616 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -142,7 +142,7 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: date = start_date if end_date > self.latest: - log.info("Setting `end_date` to the latest available date: %s", self.latest) + log.info("Setting `end_date` to the latest available date: %s" % self.latest) end_date = 
self.latest while date <= end_date: @@ -150,9 +150,9 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: fpath = Path(dst_dir, expected_filename) fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib")) if fpath.exists() or fpath_grib.exists(): - log.debug("%s already exists, skipping download", expected_filename) + log.debug("%s already exists, skipping download" % expected_filename) date += timedelta(days=1) else: self.download(variable, date, fpath, overwrite=False) - log.debug("Downloaded %s", expected_filename) + log.debug("Downloaded %s" % expected_filename) date += timedelta(days=1) From b8af3fb08c8abd6d68bbc71b80346fe806e5af43 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Wed, 25 Sep 2024 11:05:52 +0200 Subject: [PATCH 09/17] fix: infinite loop in sync function --- openhexa/toolbox/era5/google.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index 1799616..0b88380 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -149,10 +149,12 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: expected_filename = f"{date.strftime('%Y-%m-%d')}_{variable}.nc" fpath = Path(dst_dir, expected_filename) fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib")) + if fpath.exists() or fpath_grib.exists(): log.debug("%s already exists, skipping download" % expected_filename) date += timedelta(days=1) - else: - self.download(variable, date, fpath, overwrite=False) - log.debug("Downloaded %s" % expected_filename) - date += timedelta(days=1) + continue + + self.download(variable, date, fpath, overwrite=False) + log.debug("Downloaded %s" % expected_filename) + date += timedelta(days=1) From 23824a8b12e5f50ffc7b2823879469a033b5c94c Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Wed, 25 Sep 2024 11:40:27 +0200 Subject: [PATCH 10/17] fix: infinite loop in sync 
function --- openhexa/toolbox/era5/google.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py index 0b88380..dc4b8d8 100644 --- a/openhexa/toolbox/era5/google.py +++ b/openhexa/toolbox/era5/google.py @@ -140,21 +140,20 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: dst_dir = Path(dst_dir) dst_dir.mkdir(parents=True, exist_ok=True) + if start_date > end_date: + raise ParameterError("`start_date` must be before `end_date`") + date = start_date if end_date > self.latest: - log.info("Setting `end_date` to the latest available date: %s" % self.latest) + log.info("Setting `end_date` to the latest available date: %s" % date.strftime("%Y-%m-%d")) end_date = self.latest while date <= end_date: expected_filename = f"{date.strftime('%Y-%m-%d')}_{variable}.nc" fpath = Path(dst_dir, expected_filename) fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib")) - if fpath.exists() or fpath_grib.exists(): log.debug("%s already exists, skipping download" % expected_filename) - date += timedelta(days=1) - continue - - self.download(variable, date, fpath, overwrite=False) - log.debug("Downloaded %s" % expected_filename) + else: + self.download(variable=variable, date=date, dst_file=fpath, overwrite=False) date += timedelta(days=1) From d0a4af30a1def1560aadcdcc73d13d9f937da4fb Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Fri, 11 Oct 2024 19:18:37 +0200 Subject: [PATCH 11/17] feat(era5): download all CDS products for a given period --- openhexa/toolbox/era5/cds.py | 108 ++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py index 46e1323..7747fba 100644 --- a/openhexa/toolbox/era5/cds.py +++ b/openhexa/toolbox/era5/cds.py @@ -6,11 +6,15 @@ import shutil import tempfile from calendar import monthrange -from datetime import datetime +from 
datetime import datetime, timedelta +from functools import cached_property +from math import ceil from pathlib import Path import cads_api_client import cdsapi +import geopandas as gpd +from dateutil.relativedelta import relativedelta with importlib.resources.open_text("openhexa.toolbox.era5", "variables.json") as f: VARIABLES = json.load(f) @@ -27,11 +31,36 @@ class ParameterError(ValueError): pass +def bounds_from_file(fp: Path, buffer: float = 0.5) -> list[float]: + """Get bounds from file. + + Parameters + ---------- + fp : Path + File path. + buffer : float, optional + Buffer to add to the bounds (default=0.5). + + Returns + ------- + list[float] + Bounds (north, west, south, east). + """ + boundaries = gpd.read_parquet(fp) + xmin, ymin, xmax, ymax = boundaries.total_bounds + xmin = ceil(xmin - 0.5) + ymin = ceil(ymin - 0.5) + xmax = ceil(xmax + 0.5) + ymax = ceil(ymax + 0.5) + return ymax, xmin, ymin, xmax + + class Client: def __init__(self, key: str): self.client = cdsapi.Client(url=URL, key=key, wait_until_complete=True, quiet=True, progress=False) self.cads_api_client = cads_api_client.ApiClient(key=key, url=URL) + @cached_property def latest(self) -> datetime: """Get date of latest available product.""" collection = self.cads_api_client.collection(DATASET) @@ -140,3 +169,80 @@ def download(self, request: dict, dst_file: str | Path, overwrite: bool = False) shutil.copy(tmp.name, dst_file) log.debug("Downloaded Era5 product to %s", str(dst_file.absolute())) + + @staticmethod + def _period_chunks(start: datetime, end: datetime) -> list[dict]: + """Generate list of period chunks to prepare CDS API requests. + + If we can, prepare requests for full months to optimize wait times. If we can't, prepare + daily requests. + + Parameters + ---------- + start : datetime + Start date. + end : datetime + End date. + + Returns + ------- + list[dict] + List of period chunks as dicts with `year`, `month` and `days` keys. 
+ """ + chunks = [] + date = start + while date <= end: + last_day_in_month = datetime(date.year, date.month, monthrange(date.year, date.month)[1]) + if last_day_in_month <= end: + chunks.append( + {"year": date.year, "month": date.month, "days": [day for day in range(1, last_day_in_month.day)]} + ) + date += relativedelta(months=1) + else: + chunks.append({"year": date.year, "month": date.month, "days": [date.day]}) + date += timedelta(days=1) + return chunks + + def download_between( + self, + variable: str, + start: datetime, + end: datetime, + dst_dir: str | Path, + area: list[float] = None, + overwrite: bool = False, + ): + """Download all ERA5 products between two dates. + + Parameters + ---------- + variable : str + Climate data store variable name (ex: "2m_temperature"). + start : datetime + Start date. + end : datetime + End date. + dst_dir : Path + Output directory. + area : list[float], optional + Area of interest (north, west, south, east). Defaults to None (world). + overwrite : bool, optional + Overwrite existing files (default=False). 
+ """ + if end > self.latest: + end = self.latest + log.debug("End date is after latest available product, setting end date to %s", end) + + chunks = self._period_chunks(start, end) + + for chunk in chunks: + request = self.build_request( + variable=variable, year=chunk["year"], month=chunk["month"], days=chunk["days"], area=area + ) + + if len(chunk["days"]) == 1: + dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}-{chunk['days']:02}.grib" + else: + dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}.nc" + + self.download(request=request, dst_file=dst_file, overwrite=overwrite) From 4da06c480f0d43c12b951a3ca5a504b95adb4257 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 14 Oct 2024 11:53:54 +0200 Subject: [PATCH 12/17] fix(era5): do not download daily data if monthly file is present --- openhexa/toolbox/era5/cds.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py index 7747fba..a37558d 100644 --- a/openhexa/toolbox/era5/cds.py +++ b/openhexa/toolbox/era5/cds.py @@ -145,6 +145,15 @@ def build_request( return payload + @staticmethod + def _filename(variable: str, year: int, month: int, day: int = None, data_format: str = "grib") -> str: + """Get filename from variable name and date.""" + EXTENSION = {"grib": "grib", "netcdf": "nc"} + if day is not None: + return f"{variable}_{year}-{month:02}-{day:02}.{EXTENSION[data_format]}" + else: + return f"{variable}_{year}-{month:02}.{EXTENSION[data_format]}" + def download(self, request: dict, dst_file: str | Path, overwrite: bool = False): """Download Era5 product. 
@@ -164,6 +173,14 @@ def download(self, request: dict, dst_file: str | Path, overwrite: bool = False) log.debug("File %s already exists, skipping download", str(dst_file.absolute())) return + # if we request daily data while a monthly file is already present, also skip download + if len(request["day"]) == 1: + dst_file_monthly = Path( + dst_file.parent, self._filename(request["variable"], request["year"], request["month"]) + ) + if dst_file_monthly.exists() and not overwrite: + log.debug("Monthly file `{}` already exists, skipping download".format(dst_file_monthly.name)) + with tempfile.NamedTemporaryFile() as tmp: self.client.retrieve(name=DATASET, request=request, target=tmp.name) shutil.copy(tmp.name, dst_file) @@ -241,8 +258,8 @@ def download_between( ) if len(chunk["days"]) == 1: - dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}-{chunk['days']:02}.grib" + dst_file = Path(dst_dir, self._filename(variable, chunk["year"], chunk["month"], chunk["days"][0])) else: - dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}.nc" + dst_file = Path(dst_dir, self._filename(variable, chunk["year"], chunk["month"])) self.download(request=request, dst_file=dst_file, overwrite=overwrite) From 7cbf347841be259365f96f2ac8b04e05f34f633c Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 14 Oct 2024 12:00:59 +0200 Subject: [PATCH 13/17] fix(era5): compatibility with cads_api_client>=1.4.5 --- openhexa/toolbox/era5/cds.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py index a37558d..ccc1cf1 100644 --- a/openhexa/toolbox/era5/cds.py +++ b/openhexa/toolbox/era5/cds.py @@ -63,10 +63,8 @@ def __init__(self, key: str): @cached_property def latest(self) -> datetime: """Get date of latest available product.""" - collection = self.cads_api_client.collection(DATASET) - _, end = collection.json["extent"]["temporal"]["interval"][0] - end = 
datetime.strptime(end, "%Y-%m-%dT00:00:00Z") - return end + collection = self.cads_api_client.get_collection(DATASET) + return collection.end_datetime @staticmethod def build_request( From 6c97e5b0c8e9faec0f155d8a976188d38910e757 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Mon, 14 Oct 2024 15:16:17 +0200 Subject: [PATCH 14/17] fix(era5): make datetime unaware of timezones for comparability --- openhexa/toolbox/era5/cds.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py index ccc1cf1..4ce46b0 100644 --- a/openhexa/toolbox/era5/cds.py +++ b/openhexa/toolbox/era5/cds.py @@ -64,7 +64,10 @@ def __init__(self, key: str): def latest(self) -> datetime: """Get date of latest available product.""" collection = self.cads_api_client.get_collection(DATASET) - return collection.end_datetime + dt = collection.end_datetime + # make datetime unaware of timezone for comparability with other datetimes + dt = datetime(dt.year, dt.month, dt.day) + return dt @staticmethod def build_request( From aea133ff4bb201c7b2563289bf0b9b9a55f9050b Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Tue, 15 Oct 2024 17:45:21 +0200 Subject: [PATCH 15/17] feat(era5): add era5 aggregate module --- openhexa/toolbox/era5/aggregate.py | 213 +++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 openhexa/toolbox/era5/aggregate.py diff --git a/openhexa/toolbox/era5/aggregate.py b/openhexa/toolbox/era5/aggregate.py new file mode 100644 index 0000000..fb00fe2 --- /dev/null +++ b/openhexa/toolbox/era5/aggregate.py @@ -0,0 +1,213 @@ +"""Module for spatial and temporal aggregation of ERA5 data.""" + +from datetime import datetime +from pathlib import Path + +import geopandas as gpd +import numpy as np +import polars as pl +import rasterio +import rasterio.transform +import xarray as xr + + +def clip_dataset(ds: xr.Dataset, xmin: float, ymin: float, xmax: float, ymax: float) -> xr.Dataset: + """Clip 
input xarray dataset according to the provided bounding box. + + Assumes lat & lon dimensions are named "latitude" and "longitude". Longitude in the + source dataset is expected to be in the range [0, 360], and will be converted to + [-180, 180]. + + Parameters + ---------- + ds : xr.Dataset + Input xarray dataset. + xmin : float + Minimum longitude. + ymin : float + Minimum latitude. + xmax : float + Maximum longitude. + ymax : float + Maximum latitude. + + Returns + ------- + xr.Dataset + Clipped xarray dataset. + """ + ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180)).sortby("longitude") + ds = ds.where((ds.longitude >= xmin) & (ds.longitude <= xmax), drop=True) + ds = ds.where((ds.latitude >= ymin) & (ds.latitude <= ymax), drop=True) + return ds + + +def get_transform(ds: xr.Dataset) -> rasterio.transform.Affine: + """Get rasterio affine transform from xarray dataset. + + Parameters + ---------- + ds : xr.Dataset + Input xarray dataset. + + Returns + ------- + rasterio.transform.Affine + Rasterio affine transform. + """ + transform = rasterio.transform.from_bounds( + ds.longitude.values.min(), + ds.latitude.values.min(), + ds.longitude.values.max(), + ds.latitude.values.max(), + len(ds.longitude), + len(ds.latitude), + ) + return transform + + +def build_masks( + boundaries: gpd.GeoDataFrame, height: int, width: int, transform: rasterio.Affine +) -> tuple[np.ndarray, rasterio.Affine]: + """Build binary masks for all geometries in a dataframe. + + We build a raster of shape (n_boundaries, n_height, n_width) in order to store one binary mask + per boundary. Boundaries shapes cannot be stored in a single array as we want masks to overlap + if needed. + + Parameters + ---------- + boundaries : gpd.GeoDataFrame + Input GeoDataFrame containing the boundaries. 
+ height : int + Height of the raster (number of pixels) + width : int + Width of the raster (number of pixels) + transform : rasterio.Affine + Raster affine transform + + Returns + ------- + np.ndarray + Binary masks as a numpy ndarray of shape (n_boundaries, height, width) + """ + masks = np.ndarray(shape=(len(boundaries), height, width), dtype=np.bool_) + for i, geom in enumerate(boundaries.geometry): + mask = rasterio.features.rasterize( + shapes=[geom.__geo_interface__], + out_shape=(height, width), + fill=0, + default_value=1, + all_touched=True, + transform=transform, + ) + masks[i, :, :] = mask == 1 + return masks + + +def merge(data_dir: Path | str) -> xr.Dataset: + """Merge all .grib files in a directory into a single xarray dataset. + + If multiple values are available for a given time, step, longitude & latitude dimensions, the + maximum value is kept. + + Parameters + ---------- + data_dir : Path | str + Directory containing the .grib files. + + Returns + ------- + xr.Dataset + Merged xarray dataset with time, step, longitude and latitude dimensions. + """ + if isinstance(data_dir, str): + data_dir = Path(data_dir) + + datasets = [] + for fp in data_dir.glob("*.grib"): + datasets.append(xr.open_dataset(fp, engine="cfgrib")) + + ds = xr.concat(datasets, dim="tmp_dim").max(dim="tmp_dim") + return ds + + +def _np_to_datetime(dt64: np.datetime64) -> datetime: + epoch = np.datetime64(0, "s") + one_second = np.timedelta64(1, "s") + seconds_since_epoch = (dt64 - epoch) / one_second + return datetime.fromtimestamp(seconds_since_epoch) + + +def _has_missing_data(da: xr.DataArray) -> bool: + """A DataArray is considered to have missing data if not all hours have measurements.""" + missing = False + for step in da.step: + if da.sel(step=step).isnull().all(): + missing = True + return missing + + +def aggregate(ds: xr.Dataset, var: str, masks: np.ndarray, boundaries_id: list[str]) -> pl.DataFrame: + """Aggregate hourly measurements in space and time. 
+ + Parameters + ---------- + ds : xr.Dataset + Input xarray dataset with time, step, longitude and latitude dimensions + var : str + Variable to aggregate (ex: "t2m" or "tp") + masks : np.ndarray + Binary masks as a numpy ndarray of shape (n_boundaries, height, width) + boundaries_id : list[str] + List of boundary IDs (same order as n_boundaries dimension in masks) + + Notes + ----- + The function aggregates hourly measurements to daily values for each boundary. + + Temporal aggregation is applied first. 3 statistics are computed for each day: daily mean, + daily min, and daily max. + + Spatial aggregation is then applied. For each boundary, 3 statistics are computed: average of + daily means, average of daily min, and average of daily max. These 3 statistics are stored in + the "mean", "min", and "max" columns of the output dataframe. + """ + rows = [] + + for day in ds.time.values: + da = ds[var].sel(time=day) + + if _has_missing_data(da): + continue + + da_mean = da.mean(dim="step").values + da_min = da.min(dim="step").values + da_max = da.max(dim="step").values + + for i, uid in enumerate(boundaries_id): + v_mean = da_mean[masks[i, :, :]].mean() + v_min = da_min[masks[i, :, :]].mean() + v_max = da_max[masks[i, :, :]].mean() + + rows.append( + { + "boundary_id": uid, + "date": _np_to_datetime(day).date(), + "mean": v_mean, + "min": v_min, + "max": v_max, + } + ) + + SCHEMA = { + "boundary_id": pl.String, + "date": pl.Date, + "mean": pl.Float64, + "min": pl.Float64, + "max": pl.Float64, + } + + df = pl.DataFrame(data=rows, schema=SCHEMA) + + return df From 7674307e1d0b3ffb1a1fddfb4dcbd9cddd4e7cb3 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Tue, 15 Oct 2024 17:45:37 +0200 Subject: [PATCH 16/17] docs(era5): add era5 README --- openhexa/toolbox/era5/README.md | 163 ++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 openhexa/toolbox/era5/README.md diff --git a/openhexa/toolbox/era5/README.md b/openhexa/toolbox/era5/README.md 
new file mode 100644 index 0000000..3a7b6f3 --- /dev/null +++ b/openhexa/toolbox/era5/README.md @@ -0,0 +1,163 @@ +# OpenHEXA Toolbox ERA5 + +The package contains ETL classes and functions to acquire and process ERA5-Land data. ERA5-Land +provides hourly information of surface variables from 1950 to 5 days before the current date, with +a ~9 km spatial resolution. See [ERA5-Land: data +documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation) for more +information. + +## Usage + +The package contains 3 modules: +* `openhexa.toolbox.era5.cds`: download ERA5-land products from the Copernicus [Climate Data Store](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=overview) +* `openhexa.toolbox.era5.google`: download ERA5 products from Google Cloud [Public Datasets](https://cloud.google.com/storage/docs/public-datasets/era5) +* `openhexa.toolbox.era5.aggregate`: aggregate ERA5 data in space and time + +### Download from CDS + +To download products from the Climate Data Store, you will need to create an account and generate an API key in ECMWF (see [CDS](https://cds.climate.copernicus.eu/)). + +```python +from openhexa.toolbox.era5.cds import Client + +cds = Client(key="") + +request = cds.build_request( + variable="2m_temperature", + year=2024, + month=4 +) + +cds.download( + request=request, + dst_file="data/product.grib" +) +``` + +The module also contains helper functions to use bounds from a geoparquet file as an area of interest. 
+ +```python +bounds = bounds_from_file(fp=Path("data/districts.parquet"), buffer=0.5) + +request = cds.build_request( + variable="total_precipitation", + year=2023, + month=10, + days=[1, 2, 3, 4, 5], + area=bounds +) + +cds.download( + request=request, + dst_file="data/product.grib" +) +``` + +To download multiple products for a given period, use `Client.download_between()`: + +```python +cds.download_between( + variable="2m_temperature", + start=datetime(2020, 1, 1), + end=datetime(2021, 6, 1), + dst_dir="data/raw/2m_temperature", + area=bounds +) +``` + +Checking latest available date in the ERA5-Land dataset: + +```python +cds = Client("") + +cds.latest +``` +``` +>>> datetime(2024, 10, 8) +``` + +### Download from Google Cloud + +```python +from openhexa.toolbox.era5.google import Client + +google = Client() + +google.download( + variable="2m_temperature", + date=datetime(2024, 6, 15), + dst_file="data/product.nc" +) +``` + +Or to download all products for a given period: + +```python +# if products are already present in dst_dir, they will be skipped +google.sync( + variable="2m_temperature", + start_date=datetime(2022, 1, 1), + end_date=datetime(2022, 6, 1), + dst_dir="data" +) +``` + +### Aggregation + +```python +from pathlib import Path + +import geopandas as gpd +from openhexa.toolbox.era5.aggregate import build_masks, merge, aggregate, get_transform + +boundaries = gpd.read_parquet("districts.parquet") +data_dir = Path("data/era5/total_precipitation") + +ds = merge(data_dir) + +ncols = len(ds.longitude) +nrows = len(ds.latitude) +transform = get_transform(ds) +masks = build_masks(boundaries, nrows, ncols, transform) + +df = aggregate( + ds=ds, + var="tp", + masks=masks, + boundaries_id=[uid for uid in boundaries["district_id"]] +) + +print(df) +``` +``` +shape: (18_410, 5) +┌─────────────┬────────────┬───────────┬──────────┬───────────┐ +│ boundary_id ┆ date ┆ mean ┆ min ┆ max │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ date ┆ f64 ┆ f64 ┆ f64 │ 
+╞═════════════╪════════════╪═══════════╪══════════╪═══════════╡ +│ mPenE8ZIBFC ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ TPgpGxUBU9y ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ AhST5ZpuCDJ ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ Lp2BjBVT63s ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ EdfRX9b9vEb ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ yhs1ecKsLOc ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ iHSJypSwlo5 ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ CTtB0TPRvWc ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ eVFAuZOzogt ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ WVEJjdJ2S15 ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ rbYGKFgupK9 ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ Nml6rVDElLh ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ E0hd8TD1M0q ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ PCg4pLGmKSM ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ C6EBhE8OnfW ┆ 2024-01-01 ┆ 0.000462 ┆ 0.0 ┆ 0.00086 │ +│ … ┆ … ┆ … ┆ … ┆ … │ +│ CkpfOFkMyrd ┆ 2024-10-07 ┆ 1.883121 ┆ 0.001785 ┆ 2.700447 │ +│ tMXsltjzzmR ┆ 2024-10-07 ┆ 3.579136 ┆ 0.105436 ┆ 4.702504 │ +│ F0ytkh0RExg ┆ 2024-10-07 ┆ 8.415455 ┆ 0.838535 ┆ 17.08884 │ +... +│ TTSmaRnHa82 ┆ 2024-10-07 ┆ 1.724243 ┆ 0.007809 ┆ 5.692989 │ +│ jbmw2gdrrTV ┆ 2024-10-07 ┆ 1.176629 ┆ 0.110173 ┆ 1.582995 │ +│ eKYyXbBdvmB ┆ 2024-10-07 ┆ 0.599976 ┆ 0.037771 ┆ 1.189411 │ +└─────────────┴────────────┴───────────┴──────────┴───────────┘ +``` \ No newline at end of file From 4799b01efd96efec88aa462927b87276723c7544 Mon Sep 17 00:00:00 2001 From: Yann Forget Date: Tue, 15 Oct 2024 17:56:47 +0200 Subject: [PATCH 17/17] docs(era5): update era5 README --- openhexa/toolbox/era5/README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/openhexa/toolbox/era5/README.md b/openhexa/toolbox/era5/README.md index 3a7b6f3..0b0e651 100644 --- a/openhexa/toolbox/era5/README.md +++ b/openhexa/toolbox/era5/README.md @@ -6,6 +6,19 @@ a ~9 km spatial resolution. 
See [ERA5-Land: data documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation) for more information. +Available variables include: +* 2 metre temperature +* Wind components +* Leaf area index +* Volumetric soil water layer +* Total precipitation + +See [ERA5-Land data +documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation#ERA5Land:datadocumentation-parameterlistingParameterlistings) +for a full list of available parameters. + +In addition to download clients for the Copernicus [Climate Data Store](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=overview) and [Google Public Datasets](https://cloud.google.com/storage/docs/public-datasets/era5), the package includes an `aggregate` module to aggregate ERA5 measurements in space (geographic boundaries) and time (hourly to daily). + ## Usage The package contains 3 modules: @@ -34,7 +47,9 @@ cds.download( ) ``` -The module also contains helper functions to use bounds from a geoparquet file as an area of interest. +The module also contains helper functions to use bounds from a geoparquet file as an area of +interest. Source bounds are buffered and rounded by default to make sure the required data is +downloaded. ```python bounds = bounds_from_file(fp=Path("data/districts.parquet"), buffer=0.5) @@ -76,6 +91,8 @@ cds.latest >>> datetime(2024, 10, 8) ``` +NB: End dates in product requests are automatically replaced by the latest available date when they fall after it. + ### Download from Google Cloud ```python