From 7a9bdf690304f380bea10a928fc663ab4188b896 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 23 Sep 2024 17:15:54 +0200
Subject: [PATCH 01/17] feat: add era5 download modules

---
 openhexa/toolbox/era5/__init__.py    |  15 ++
 openhexa/toolbox/era5/cds.py         | 142 +++++++++++
 openhexa/toolbox/era5/google.py      | 121 +++++++++
 openhexa/toolbox/era5/variables.json | 352 +++++++++++++++++++++++++++
 4 files changed, 630 insertions(+)
 create mode 100644 openhexa/toolbox/era5/__init__.py
 create mode 100644 openhexa/toolbox/era5/cds.py
 create mode 100644 openhexa/toolbox/era5/google.py
 create mode 100644 openhexa/toolbox/era5/variables.json

diff --git a/openhexa/toolbox/era5/__init__.py b/openhexa/toolbox/era5/__init__.py
new file mode 100644
index 0000000..71dee32
--- /dev/null
+++ b/openhexa/toolbox/era5/__init__.py
@@ -0,0 +1,15 @@
+import logging
+
+import cdsapi
+
+logging.basicConfig(level=logging.DEBUG, format="%(name)s %(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+BASE_URL = "https://cds-beta.climate.copernicus.eu/api"
+
+
+class ERA5:
+    def __init__(self, key: str):
+        self.client = cdsapi.Client(
+            key=key,
+        )
diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py
new file mode 100644
index 0000000..eef6db2
--- /dev/null
+++ b/openhexa/toolbox/era5/cds.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+import importlib.resources
+import json
+import logging
+import shutil
+import tempfile
+from calendar import monthrange
+from datetime import datetime
+from pathlib import Path
+
+import cads_api_client
+import cdsapi
+
+with importlib.resources.open_text("openhexa.toolbox.era5", "variables.json") as f:
+    VARIABLES = json.load(f)
+
+DATASET = "reanalysis-era5-land"
+
+logging.basicConfig(level=logging.DEBUG, format="%(name)s %(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+URL = "https://cds-beta.climate.copernicus.eu/api"
+
+
+class ParameterError(ValueError):
+    pass
+
+
+class Client:
+    def __init__(self, key: str):
+        self.client = cdsapi.Client(url=URL, key=key, wait_until_complete=True, quiet=True, progress=False)
+        self.cads_api_client = cads_api_client.ApiClient(key=key, url=URL)
+
+    def latest(self) -> datetime:
+        """Get date of latest available product."""
+        collection = self.cads_api_client.collection(DATASET)
+        _, end = collection.json["extent"]["temporal"]["interval"][0]
+        end = datetime.strptime(end, "%Y-%m-%dT00:00:00Z")
+        return end
+
+    @staticmethod
+    def build_request(
+        variable: str,
+        year: int,
+        month: int,
+        days: list[int] = None,
+        time: list[str] = None,
+        data_format: str = "grib",
+        area: list[float] = None,
+    ) -> dict:
+        """Build request payload.
+
+        Parameters
+        ----------
+        variable : str
+            Climate data store variable name (ex: "2m_temperature").
+        year : int
+            Year of interest.
+        month : int
+            Month of interest.
+        days : list[int]
+            Days of interest. Defaults to None (all days).
+        time : list[str]
+            Hours of interest (ex: ["01:00", "06:00", "18:00"]). Defaults to None (all hours).
+        data_format : str
+            Output data format ("grib" or "netcdf"). Defaults to "grib".
+        area : list[float]
+            Area of interest (north, west, south, east). Defaults to None (world).
+
+        Returns
+        -------
+        dict
+            Request payload.
+
+        Raises
+        ------
+        ParameterError
+            Request parameters are not valid.
+        """
+        if variable not in VARIABLES:
+            raise ParameterError("Variable %s not supported", variable)
+
+        if data_format not in ["grib", "netcdf"]:
+            raise ParameterError("Data format %s not supported", data_format)
+
+        if area:
+            n, w, s, e = area
+            if ((abs(n) > 90) or (abs(s) > 90)) or ((abs(w) > 180) or (abs(e) > 180)):
+                raise ParameterError("Invalid area of interest")
+            if (n < s) or (e < w):
+                raise ParameterError("Invalid area of interest")
+
+        if not days:
+            dmax = monthrange(year, month)[1]
+            days = [day for day in range(1, dmax + 1)]
+
+        if not time:
+            time = [f"{hour:02}:00" for hour in range(0, 24)]
+
+        year = str(year)
+        month = f"{month:02}"
+        days = [f"{day:02}" for day in days]
+
+        payload = {
+            "variable": [variable],
+            "year": year,
+            "month": month,
+            "day": days,
+            "time": time,
+            "data_format": data_format,
+        }
+
+        if area:
+            payload["area"] = area
+
+        return payload
+
+    def download(self, request: dict, dst_file: str | Path, overwrite: bool = False):
+        """Download Era5 product.
+
+        Parameters
+        ----------
+        request : dict
+            Request payload as returned by the build_request() method.
+        dst_file : Path
+            Output file path.
+        overwrite : bool, optional
+            Overwrite existing file (default=False).
+        """
+        dst_file = Path(dst_file)
+        dst_file.parent.mkdir(parents=True, exist_ok=True)
+
+        if dst_file.exists() and not overwrite:
+            log.debug("File %s already exists, skipping download", str(dst_file.absolute()))
+            return
+
+        with tempfile.NamedTemporaryFile() as tmp:
+            self.client.retrieve(name=DATASET, request=request, target=tmp.name)
+            shutil.move(tmp.name, dst_file)
+
+        log.debug("Downloaded Era5 product to %s", str(dst_file.absolute()))
diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
new file mode 100644
index 0000000..3f4347f
--- /dev/null
+++ b/openhexa/toolbox/era5/google.py
@@ -0,0 +1,121 @@
+"""Download raw historical Era5 products from Google Cloud:
+https://console.cloud.google.com/storage/browser/gcp-public-data-arco-era5
+
+Products are provided as raw NetCDF files and are usually available with a ~3 month lag.
+"""
+
+from __future__ import annotations
+
+import importlib.resources
+import json
+import logging
+import shutil
+import tempfile
+from datetime import datetime
+from functools import cached_property
+from pathlib import Path
+
+import requests
+from google.cloud import storage
+
+with importlib.resources.open_text("openhexa.toolbox.era5", "variables.json") as f:
+    VARIABLES = json.load(f)
+
+logging.basicConfig(level=logging.DEBUG, format="%(name)s %(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+
+class NotFoundError(Exception):
+    pass
+
+
+class ParameterError(ValueError):
+    pass
+
+
+class Client:
+    def __init__(self):
+        self.client = storage.Client.create_anonymous_client()
+        self.bucket = self.client.bucket("gcp-public-data-arco-era5")
+
+    @staticmethod
+    def prefix(variable: str, date: datetime) -> str:
+        """Build key prefix for a given product."""
+        return f"raw/date-variable-single_level/{date.year}/{date.month:02}/{date.day:02}/{variable}/surface.nc"
+
+    def _subdirs(self, prefix: str) -> list[str]:
+        """List subdirs."""
+        blobs = self.client.list_blobs(self.bucket, prefix=prefix, delimiter="/")
+        prefixes = []
+        for page in blobs.pages:
+            prefixes += page.prefixes
+        return prefixes
+
+    @cached_property
+    def latest(self) -> datetime:
+        """Get date of latest available product."""
+        root = "raw/date-variable-single_level/"
+        subdirs = self._subdirs(root)  # years
+        subdirs = self._subdirs(max(subdirs))  # months
+        subdirs = self._subdirs(max(subdirs))  # days
+        subdir = max(subdirs).split("/")
+        year = int(subdir[-4])
+        month = int(subdir[-3])
+        day = int(subdir[-2])
+        return datetime(year, month, day)
+
+    def find(self, variable: str, date: datetime) -> str | None:
+        """Find public URL of product. Return None if not found."""
+        prefix = self.prefix(variable, date)
+        blobs = self.client.list_blobs(self.bucket, prefix=prefix, max_results=1)
+        blobs = list(blobs)
+        if blobs:
+            return blobs[0].public_url
+        else:
+            return None
+
+    def download(self, variable: str, date: datetime, dst_file: str | Path, overwrite=False):
+        """Download an Era5 NetCDF product for a given day.
+
+        Parameters
+        ----------
+        variable : str
+            Climate data store variable name (ex: "2m_temperature").
+        date : datetime
+            Product date (year, month, day).
+        dst_file : str | Path
+            Output file.
+        overwrite : bool, optional
+            Overwrite existing file (default=False).
+
+        Raises
+        ------
+        ParameterError
+            Product request parameters are invalid.
+        NotFoundError
+            Product not found in bucket.
+        """
+        dst_file = Path(dst_file)
+        dst_file.parent.mkdir(parents=True, exist_ok=True)
+
+        if dst_file.exists() and not overwrite:
+            log.debug("Skipping download of %s because file already exists", str(dst_file.absolute()))
+            return
+
+        if variable not in VARIABLES:
+            raise ParameterError("%s is not a valid climate data store variable name", variable)
+
+        url = self.find(variable, date)
+        if not url:
+            raise NotFoundError("%s product not found for date %s", variable, date.strftime("%Y-%m-%d"))
+
+        with tempfile.NamedTemporaryFile() as tmp:
+            with open(tmp.name, "wb") as f:
+                with requests.get(url, stream=True) as r:
+                    for chunk in r.iter_content(chunk_size=1024**2):
+                        if chunk:
+                            f.write(chunk)
+
+            shutil.move(tmp.name, dst_file)
+
+        log.debug("Downloaded %s", str(dst_file.absolute()))
diff --git a/openhexa/toolbox/era5/variables.json b/openhexa/toolbox/era5/variables.json
new file mode 100644
index 0000000..fbbd894
--- /dev/null
+++ b/openhexa/toolbox/era5/variables.json
@@ -0,0 +1,352 @@
+{
+    "lake_mix_layer_temperature": {
+        "name": "Lake mix-layer temperature",
+        "shortname": "lmlt",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "lake_mix_layer_depth": {
+        "name": "Lake mix-layer depth",
+        "shortname": "lmld",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    },
+    "lake_bottom_temperature": {
+        "name": "Lake bottom temperature",
+        "shortname": "lblt",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "lake_total_layer_temperature": {
+        "name": "Lake total layer temperature",
+        "shortname": "ltlt",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "lake_shape_factor": {
+        "name": "Lake shape factor",
+        "shortname": "lshf",
+        "units": "dimensionless",
+        "grib1": true,
+        "grib2": false
+    },
+    "lake_ice_temperature": {
+        "name": "Lake ice temperature",
+        "shortname": "lict",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "lake_ice_depth": {
+        "name": "Lake ice depth",
+        "shortname": "licd",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    },
+    "snow_cover": {
+        "name": "Snow cover",
+        "shortname": "snowc",
+        "units": "%",
+        "grib1": false,
+        "grib2": true
+    },
+    "snow_depth": {
+        "name": "Snow depth",
+        "shortname": "sde",
+        "units": "m",
+        "grib1": false,
+        "grib2": true
+    },
+    "snow_albedo": {
+        "name": "Snow albedo",
+        "shortname": "asn",
+        "units": "(0 - 1)",
+        "grib1": true,
+        "grib2": false
+    },
+    "snow_density": {
+        "name": "Snow density",
+        "shortname": "rsn",
+        "units": "kg m**-3",
+        "grib1": true,
+        "grib2": false
+    },
+    "volumetric_soil_water_layer_1": {
+        "name": "Volumetric soil water layer 1",
+        "shortname": "swvl1",
+        "units": "m**3 m**-3",
+        "grib1": true,
+        "grib2": false
+    },
+    "volumetric_soil_water_layer_2": {
+        "name": "Volumetric soil water layer 2",
+        "shortname": "swvl2",
+        "units": "m**3 m**-3",
+        "grib1": true,
+        "grib2": false
+    },
+    "volumetric_soil_water_layer_3": {
+        "name": "Volumetric soil water layer 3",
+        "shortname": "swvl3",
+        "units": "m**3 m**-3",
+        "grib1": true,
+        "grib2": false
+    },
+    "volumetric_soil_water_layer_4": {
+        "name": "Volumetric soil water layer 4",
+        "shortname": "swvl4",
+        "units": "m**3 m**-3",
+        "grib1": true,
+        "grib2": false
+    },
+    "leaf_area_index_low_vegetation": {
+        "name": "Leaf area index, low vegetation",
+        "shortname": "lai_lv",
+        "units": "m**2 m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "leaf_area_index_high_vegetation": {
+        "name": "Leaf area index, high vegetation",
+        "shortname": "lai_hv",
+        "units": "m**2 m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_pressure": {
+        "name": "Surface pressure",
+        "shortname": "sp",
+        "units": "Pa",
+        "grib1": true,
+        "grib2": false
+    },
+    "soil_temperature_level_1": {
+        "name": "Soil temperature level 1",
+        "shortname": "stl1",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "snow_depth_water_equivalent": {
+        "name": "Snow depth water equivalent",
+        "shortname": "sd",
+        "units": "m of water equivalent",
+        "grib1": true,
+        "grib2": false
+    },
+    "10m_u_component_of_wind": {
+        "name": "10 metre U wind component",
+        "shortname": "u10",
+        "units": "m s**-1",
+        "grib1": true,
+        "grib2": false
+    },
+    "10m_v_component_of_wind": {
+        "name": "10 metre V wind component",
+        "shortname": "v10",
+        "units": "m s**-1",
+        "grib1": true,
+        "grib2": false
+    },
+    "2m_temperature": {
+        "name": "2 metre temperature",
+        "shortname": "2t",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "2m_dewpoint_temperature": {
+        "name": "2 metre dewpoint temperature",
+        "shortname": "2d",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "soil_temperature_level_2": {
+        "name": "Soil temperature level 2",
+        "shortname": "stl2",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "soil_temperature_level_3": {
+        "name": "Soil temperature level 3",
+        "shortname": "stl3",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "skin_reservoir_content": {
+        "name": "Skin reservoir content",
+        "shortname": "src",
+        "units": "m of water equivalent",
+        "grib1": false,
+        "grib2": false
+    },
+    "skin_temperature": {
+        "name": "Skin temperature",
+        "shortname": "skt",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "soil_temperature_level_4": {
+        "name": "Soil temperature level 4",
+        "shortname": "stl4",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "temperature_of_snow_layer": {
+        "name": "Temperature of snow layer",
+        "shortname": "tsn",
+        "units": "K",
+        "grib1": true,
+        "grib2": false
+    },
+    "forecast_albedo": {
+        "name": "Forecast albedo",
+        "shortname": "fal",
+        "units": "(0 - 1)",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_runoff": {
+        "name": "Surface runoff",
+        "shortname": "sro",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    },
+    "sub_surface_runoff": {
+        "name": "Sub-surface runoff",
+        "shortname": "ssro",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    },
+    "snow_evaporation": {
+        "name": "Snow evaporation",
+        "shortname": "es",
+        "units": "m of water equivalent",
+        "grib1": true,
+        "grib2": false
+    },
+    "snowmelt": {
+        "name": "Snowmelt",
+        "shortname": "smlt",
+        "units": "m of water equivalent",
+        "grib1": true,
+        "grib2": false
+    },
+    "snowfall": {
+        "name": "Snowfall",
+        "shortname": "sf",
+        "units": "m of water equivalent",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_sensible_heat_flux": {
+        "name": "Surface sensible heat flux",
+        "shortname": "sshf",
+        "units": "J m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_latent_heat_flux": {
+        "name": "Surface latent heat flux",
+        "shortname": "slhf",
+        "units": "J m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_solar_radiation_downwards": {
+        "name": "Surface solar radiation downwards",
+        "shortname": "ssrd",
+        "units": "J m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_thermal_radiation_downwards": {
+        "name": "Surface thermal radiation downwards",
+        "shortname": "strd",
+        "units": "J m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_net_solar_radiation": {
+        "name": "Surface net solar radiation",
+        "shortname": "ssr",
+        "units": "J m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "surface_net_thermal_radiation": {
+        "name": "Surface net thermal radiation",
+        "shortname": "str",
+        "units": "J m**-2",
+        "grib1": true,
+        "grib2": false
+    },
+    "total_evaporation": {
+        "name": "Total Evaporation",
+        "shortname": "e",
+        "units": "m of water equivalent",
+        "grib1": true,
+        "grib2": false
+    },
+    "runoff": {
+        "name": "Runoff",
+        "shortname": "ro",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    },
+    "total_precipitation": {
+        "name": "Total precipitation",
+        "shortname": "tp",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    },
+    "evaporation_from_the_top_of_canopy": {
+        "name": "Evaporation from the top of canopy",
+        "shortname": "evatc",
+        "units": "m of water equivalent",
+        "grib1": false,
+        "grib2": true
+    },
+    "evaporation_from_bare_soil": {
+        "name": "Evaporation from bare soil",
+        "shortname": "evabs",
+        "units": "m of water equivalent",
+        "grib1": false,
+        "grib2": true
+    },
+    "evaporation_from_open_water_surfaces_excluding_oceans": {
+        "name": "Evaporation from open water surfaces excluding oceans",
+        "shortname": "evaow",
+        "units": "m of water equivalent",
+        "grib1": false,
+        "grib2": true
+    },
+    "evaporation_from_vegetation_transpiration": {
+        "name": "Evaporation from vegetation transpiration",
+        "shortname": "evavt",
+        "units": "m of water equivalent",
+        "grib1": false,
+        "grib2": true
+    },
+    "potential_evaporation": {
+        "name": "Potential evaporation",
+        "shortname": "pev",
+        "units": "m",
+        "grib1": true,
+        "grib2": false
+    }
+}
\ No newline at end of file

From 1c2e4298ea0eb2cb509a0bced0cbf7de62e36f49 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 23 Sep 2024 18:30:42 +0200
Subject: [PATCH 02/17] feat: add era5 sync function

---
 openhexa/toolbox/era5/google.py | 40 ++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index 3f4347f..37978eb 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -11,7 +11,7 @@
 import logging
 import shutil
 import tempfile
-from datetime import datetime
+from datetime import datetime, timedelta
 from functools import cached_property
 from pathlib import Path
 
@@ -119,3 +119,41 @@ def download(self, variable: str, date: datetime, dst_file: str | Path, overwrit
             shutil.move(tmp.name, dst_file)
 
         log.debug("Downloaded %s", str(dst_file.absolute()))
+
+    def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir: str | Path):
+        """Download all products for a given variable and date range.
+
+        If products are already present in the destination directory, they will be skipped.
+        Expects file names to be formatted as "YYYY-MM-DD_VARIABLE.nc".
+
+        Parameters
+        ----------
+        variable : str
+            Climate data store variable name (ex: "2m_temperature").
+        start_date : datetime
+            Start date (year, month, day).
+        end_date : datetime
+            End date (year, month, day).
+        dst_dir : str | Path
+            Output directory.
+        """
+        dst_dir = Path(dst_dir)
+        dst_dir.mkdir(parents=True, exist_ok=True)
+
+        date = start_date
+        if end_date > self.latest:
+            log.info("End date is in the future, setting it to the latest available date: %s", self.latest)
+            end_date = self.latest
+
+        while date <= end_date:
+            expected_filename = f"{date.strftime('%Y-%m-%d')}_{variable}.nc"
+            fpath = Path(dst_dir, expected_filename)
+            fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib"))
+            if fpath.exists() or fpath_grib.exists():
+                log.debug("%s already exists, skipping download", expected_filename)
+                continue
+            else:
+                self.download(variable, date, fpath, overwrite=False)
+                log.debug("Downloaded %s", expected_filename)
+
+            date += timedelta(days=1)

From 0cc44728bc7c4ef798694a520d2292eaaa31d703 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 23 Sep 2024 18:40:15 +0200
Subject: [PATCH 03/17] fix: update package dependencies

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ee548c4..d3ee45f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,9 @@ dependencies = [
     "geopandas",
     "polars",
     "diskcache",
-    "pyjwt"
+    "pyjwt",
+    "cdsapi >=0.7.3",
+    "cads-api-client >=1.4.0",
 ]
 
 [project.optional-dependencies]

From a92097f2d5a3c6468500b96663960297301809d7 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 23 Sep 2024 19:05:41 +0200
Subject: [PATCH 04/17] fix: add json file as package data

---
 pyproject.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index d3ee45f..92410c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,10 +40,16 @@ dev = [
     "responses",
 ]
 
+[tool.setuptools]
+include-package-data = true
+
 [tool.setuptools.packages.find]
 where = ["."]
 namespaces = true
 
+[tool.setuptools.package-data]
+"openhexa.toolbox.era5" = ["*.json"]
+
 [project.urls]
 "Homepage" = "https://github.com/blsq/openhexa-toolbox"
 "Bug Tracker" = "https://github.com/blsq/openhexa-toolbox/issues"

From c1c3bae9bc34284af1146a75d67c02c7ac6620a3 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Tue, 24 Sep 2024 17:18:56 +0200
Subject: [PATCH 05/17] fix: do not move tmp file before closing

---
 openhexa/toolbox/era5/cds.py    | 2 +-
 openhexa/toolbox/era5/google.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py
index eef6db2..46e1323 100644
--- a/openhexa/toolbox/era5/cds.py
+++ b/openhexa/toolbox/era5/cds.py
@@ -137,6 +137,6 @@ def download(self, request: dict, dst_file: str | Path, overwrite: bool = False)
 
         with tempfile.NamedTemporaryFile() as tmp:
             self.client.retrieve(name=DATASET, request=request, target=tmp.name)
-            shutil.move(tmp.name, dst_file)
+            shutil.copy(tmp.name, dst_file)
 
         log.debug("Downloaded Era5 product to %s", str(dst_file.absolute()))
diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index 37978eb..871b9df 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -116,7 +116,7 @@ def download(self, variable: str, date: datetime, dst_file: str | Path, overwrit
                         if chunk:
                             f.write(chunk)
 
-            shutil.move(tmp.name, dst_file)
+            shutil.copy(tmp.name, dst_file)
 
         log.debug("Downloaded %s", str(dst_file.absolute()))
 

From 56c6a32c6292aec3719822bfaf4dcd17c5d10daa Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Wed, 25 Sep 2024 10:45:51 +0200
Subject: [PATCH 06/17] fix: infinite loop in sync function

---
 openhexa/toolbox/era5/google.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index 871b9df..dec2ab4 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -151,9 +151,8 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir:
             fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib"))
             if fpath.exists() or fpath_grib.exists():
                 log.debug("%s already exists, skipping download", expected_filename)
-                continue
+                date += timedelta(days=1)
             else:
                 self.download(variable, date, fpath, overwrite=False)
                 log.debug("Downloaded %s", expected_filename)
-
-            date += timedelta(days=1)
+                date += timedelta(days=1)

From 5b982e655b0e7181962b604dd73fc8ec15257b3e Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Wed, 25 Sep 2024 10:51:04 +0200
Subject: [PATCH 07/17] fix: better log message

---
 openhexa/toolbox/era5/google.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index dec2ab4..82d71b4 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -142,7 +142,7 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir:
 
         date = start_date
         if end_date > self.latest:
-            log.info("End date is in the future, setting it to the latest available date: %s", self.latest)
+            log.info("Setting `end_date` to the latest available date: %s", self.latest)
             end_date = self.latest
 
         while date <= end_date:

From f0a9b68e89031a20f73480ba0d44e0fde8d8f8a6 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Wed, 25 Sep 2024 11:00:19 +0200
Subject: [PATCH 08/17] fix: better log message

---
 openhexa/toolbox/era5/google.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index 82d71b4..1799616 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -142,7 +142,7 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir:
 
         date = start_date
         if end_date > self.latest:
-            log.info("Setting `end_date` to the latest available date: %s", self.latest)
+            log.info("Setting `end_date` to the latest available date: %s" % self.latest)
             end_date = self.latest
 
         while date <= end_date:
@@ -150,9 +150,9 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir:
             fpath = Path(dst_dir, expected_filename)
             fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib"))
             if fpath.exists() or fpath_grib.exists():
-                log.debug("%s already exists, skipping download", expected_filename)
+                log.debug("%s already exists, skipping download" % expected_filename)
                 date += timedelta(days=1)
             else:
                 self.download(variable, date, fpath, overwrite=False)
-                log.debug("Downloaded %s", expected_filename)
+                log.debug("Downloaded %s" % expected_filename)
                 date += timedelta(days=1)

From b8af3fb08c8abd6d68bbc71b80346fe806e5af43 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Wed, 25 Sep 2024 11:05:52 +0200
Subject: [PATCH 09/17] fix: infinite loop in sync function

---
 openhexa/toolbox/era5/google.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index 1799616..0b88380 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -149,10 +149,12 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir:
             expected_filename = f"{date.strftime('%Y-%m-%d')}_{variable}.nc"
             fpath = Path(dst_dir, expected_filename)
             fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib"))
+
             if fpath.exists() or fpath_grib.exists():
                 log.debug("%s already exists, skipping download" % expected_filename)
                 date += timedelta(days=1)
-            else:
-                self.download(variable, date, fpath, overwrite=False)
-                log.debug("Downloaded %s" % expected_filename)
-                date += timedelta(days=1)
+                continue
+
+            self.download(variable, date, fpath, overwrite=False)
+            log.debug("Downloaded %s" % expected_filename)
+            date += timedelta(days=1)

From 23824a8b12e5f50ffc7b2823879469a033b5c94c Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Wed, 25 Sep 2024 11:40:27 +0200
Subject: [PATCH 10/17] fix: infinite loop in sync function

---
 openhexa/toolbox/era5/google.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/openhexa/toolbox/era5/google.py b/openhexa/toolbox/era5/google.py
index 0b88380..dc4b8d8 100644
--- a/openhexa/toolbox/era5/google.py
+++ b/openhexa/toolbox/era5/google.py
@@ -140,21 +140,20 @@ def sync(self, variable: str, start_date: datetime, end_date: datetime, dst_dir:
         dst_dir = Path(dst_dir)
         dst_dir.mkdir(parents=True, exist_ok=True)
 
+        if start_date > end_date:
+            raise ParameterError("`start_date` must be before `end_date`")
+
         date = start_date
         if end_date > self.latest:
-            log.info("Setting `end_date` to the latest available date: %s" % self.latest)
+            log.info("Setting `end_date` to the latest available date: %s" % self.latest.strftime("%Y-%m-%d"))
             end_date = self.latest
 
         while date <= end_date:
             expected_filename = f"{date.strftime('%Y-%m-%d')}_{variable}.nc"
             fpath = Path(dst_dir, expected_filename)
             fpath_grib = Path(dst_dir, expected_filename.replace(".nc", ".grib"))
-
             if fpath.exists() or fpath_grib.exists():
                 log.debug("%s already exists, skipping download" % expected_filename)
-                date += timedelta(days=1)
-                continue
-
-            self.download(variable, date, fpath, overwrite=False)
-            log.debug("Downloaded %s" % expected_filename)
+            else:
+                self.download(variable=variable, date=date, dst_file=fpath, overwrite=False)
             date += timedelta(days=1)

From d0a4af30a1def1560aadcdcc73d13d9f937da4fb Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Fri, 11 Oct 2024 19:18:37 +0200
Subject: [PATCH 11/17] feat(era5): download all CDS products for a given
 period

---
 openhexa/toolbox/era5/cds.py | 108 ++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py
index 46e1323..7747fba 100644
--- a/openhexa/toolbox/era5/cds.py
+++ b/openhexa/toolbox/era5/cds.py
@@ -6,11 +6,15 @@
 import shutil
 import tempfile
 from calendar import monthrange
-from datetime import datetime
+from datetime import datetime, timedelta
+from functools import cached_property
+from math import ceil
 from pathlib import Path
 
 import cads_api_client
 import cdsapi
+import geopandas as gpd
+from dateutil.relativedelta import relativedelta
 
 with importlib.resources.open_text("openhexa.toolbox.era5", "variables.json") as f:
     VARIABLES = json.load(f)
@@ -27,11 +31,36 @@ class ParameterError(ValueError):
     pass
 
 
+def bounds_from_file(fp: Path, buffer: float = 0.5) -> list[float]:
+    """Get buffered bounds of a geoparquet file, rounded outwards to integers.
+
+    Parameters
+    ----------
+    fp : Path
+        File path.
+    buffer : float, optional
+        Buffer to add to the bounds (default=0.5).
+
+    Returns
+    -------
+    list[float]
+        Bounds (north, west, south, east).
+    """
+    from math import floor
+
+    xmin, ymin, xmax, ymax = gpd.read_parquet(fp).total_bounds
+    # min bounds are rounded down and max bounds rounded up, so that the area can only grow
+    west, south = floor(xmin - buffer), floor(ymin - buffer)
+    east, north = ceil(xmax + buffer), ceil(ymax + buffer)
+    return [north, west, south, east]
+
+
 class Client:
     def __init__(self, key: str):
         self.client = cdsapi.Client(url=URL, key=key, wait_until_complete=True, quiet=True, progress=False)
         self.cads_api_client = cads_api_client.ApiClient(key=key, url=URL)
 
+    @cached_property
     def latest(self) -> datetime:
         """Get date of latest available product."""
         collection = self.cads_api_client.collection(DATASET)
@@ -140,3 +169,80 @@ def download(self, request: dict, dst_file: str | Path, overwrite: bool = False)
             shutil.copy(tmp.name, dst_file)
 
         log.debug("Downloaded Era5 product to %s", str(dst_file.absolute()))
+
+    @staticmethod
+    def _period_chunks(start: datetime, end: datetime) -> list[dict]:
+        """Generate list of period chunks to prepare CDS API requests.
+
+        If we can, prepare requests for full months to optimize wait times. If we can't, prepare
+        daily requests.
+
+        Parameters
+        ----------
+        start : datetime
+            Start date.
+        end : datetime
+            End date.
+
+        Returns
+        -------
+        list[dict]
+            List of period chunks as dicts with `year`, `month` and `days` keys.
+        """
+        chunks = []
+        date = start
+        while date <= end:
+            n_days = monthrange(date.year, date.month)[1]
+            last_day_in_month = datetime(date.year, date.month, n_days)
+            if last_day_in_month <= end:
+                # full month: all days from 1 to `n_days` (included), then jump to next month's 1st day
+                chunks.append({"year": date.year, "month": date.month, "days": list(range(1, n_days + 1))})
+                date = last_day_in_month + timedelta(days=1)
+            else:
+                chunks.append({"year": date.year, "month": date.month, "days": [date.day]})
+                date += timedelta(days=1)
+        return chunks
+
+    def download_between(
+        self,
+        variable: str,
+        start: datetime,
+        end: datetime,
+        dst_dir: str | Path,
+        area: list[float] = None,
+        overwrite: bool = False,
+    ):
+        """Download all ERA5 products between two dates, with one request per period chunk.
+
+        Parameters
+        ----------
+        variable : str
+            Climate data store variable name (ex: "2m_temperature").
+        start : datetime
+            Start date.
+        end : datetime
+            End date (replaced by the latest available date, `self.latest`, if it is later).
+        dst_dir : str | Path
+            Output directory.
+        area : list[float], optional
+            Area of interest (north, west, south, east). Defaults to None (world).
+        overwrite : bool, optional
+            Overwrite existing files (default=False).
+        """
+        if end > self.latest:
+            end = self.latest
+            log.debug("End date is after latest available product, setting end date to %s", end)
+
+        chunks = self._period_chunks(start, end)
+
+        for chunk in chunks:
+            request = self.build_request(
+                variable=variable, year=chunk["year"], month=chunk["month"], days=chunk["days"], area=area
+            )
+
+            if len(chunk["days"]) == 1:
+                dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}-{chunk['days']:02}.grib"
+            else:
+                dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}.nc"
+
+            self.download(request=request, dst_file=dst_file, overwrite=overwrite)

From 4da06c480f0d43c12b951a3ca5a504b95adb4257 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 14 Oct 2024 11:53:54 +0200
Subject: [PATCH 12/17] fix(era5): do not download daily data if monthly file
 is present

---
 openhexa/toolbox/era5/cds.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py
index 7747fba..a37558d 100644
--- a/openhexa/toolbox/era5/cds.py
+++ b/openhexa/toolbox/era5/cds.py
@@ -145,6 +145,15 @@ def build_request(
 
         return payload
 
+    @staticmethod
+    def _filename(variable: str, year: int, month: int, day: int = None, data_format: str = "grib") -> str:
+        """Get filename `<variable>_<year>-<month>[-<day>].<extension>` (month and day are zero-padded)."""
+        extensions = {"grib": "grib", "netcdf": "nc"}
+        stem = f"{variable}_{year}-{month:02}"
+        if day is not None:
+            stem += f"-{day:02}"
+        return f"{stem}.{extensions[data_format]}"
+
     def download(self, request: dict, dst_file: str | Path, overwrite: bool = False):
         """Download Era5 product.
 
@@ -164,6 +173,14 @@ def download(self, request: dict, dst_file: str | Path, overwrite: bool = False)
             log.debug("File %s already exists, skipping download", str(dst_file.absolute()))
             return
 
+        # if we request daily data while a monthly file is already present, also skip download
+        if len(request["day"]) == 1:
+            fname_monthly = self._filename(request["variable"], request["year"], request["month"])
+            dst_file_monthly = Path(dst_file.parent, fname_monthly)
+            if dst_file_monthly.exists() and not overwrite:
+                log.debug("Monthly file `{}` already exists, skipping download".format(dst_file_monthly.name))
+                return
+
         with tempfile.NamedTemporaryFile() as tmp:
             self.client.retrieve(name=DATASET, request=request, target=tmp.name)
             shutil.copy(tmp.name, dst_file)
@@ -241,8 +258,8 @@ def download_between(
             )
 
             if len(chunk["days"]) == 1:
-                dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}-{chunk['days']:02}.grib"
+                dst_file = Path(dst_dir, self._filename(variable, chunk["year"], chunk["month"], chunk["days"][0]))
             else:
-                dst_file = Path(dst_dir) / f"{variable}_{chunk['year']}-{chunk['month']:02}.nc"
+                dst_file = Path(dst_dir, self._filename(variable, chunk["year"], chunk["month"]))
 
             self.download(request=request, dst_file=dst_file, overwrite=overwrite)

From 7cbf347841be259365f96f2ac8b04e05f34f633c Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 14 Oct 2024 12:00:59 +0200
Subject: [PATCH 13/17] fix(era5): compatibility with cads_api_client>=1.4.5

---
 openhexa/toolbox/era5/cds.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py
index a37558d..ccc1cf1 100644
--- a/openhexa/toolbox/era5/cds.py
+++ b/openhexa/toolbox/era5/cds.py
@@ -63,10 +63,8 @@ def __init__(self, key: str):
     @cached_property
     def latest(self) -> datetime:
         """Get date of latest available product."""
-        collection = self.cads_api_client.collection(DATASET)
-        _, end = collection.json["extent"]["temporal"]["interval"][0]
-        end = datetime.strptime(end, "%Y-%m-%dT00:00:00Z")
-        return end
+        collection = self.cads_api_client.get_collection(DATASET)
+        return collection.end_datetime
 
     @staticmethod
     def build_request(

From 6c97e5b0c8e9faec0f155d8a976188d38910e757 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Mon, 14 Oct 2024 15:16:17 +0200
Subject: [PATCH 14/17] fix(era5): make datetime unaware of timezones for
 comparability

---
 openhexa/toolbox/era5/cds.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/openhexa/toolbox/era5/cds.py b/openhexa/toolbox/era5/cds.py
index ccc1cf1..4ce46b0 100644
--- a/openhexa/toolbox/era5/cds.py
+++ b/openhexa/toolbox/era5/cds.py
@@ -64,7 +64,10 @@ def __init__(self, key: str):
     def latest(self) -> datetime:
         """Get date of latest available product."""
         collection = self.cads_api_client.get_collection(DATASET)
-        return collection.end_datetime
+        dt = collection.end_datetime
+        # make datetime unaware of timezone for comparability with other datetimes
+        dt = datetime(dt.year, dt.month, dt.day)
+        return dt
 
     @staticmethod
     def build_request(

From aea133ff4bb201c7b2563289bf0b9b9a55f9050b Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Tue, 15 Oct 2024 17:45:21 +0200
Subject: [PATCH 15/17] feat(era5): add era5 aggregate module

---
 openhexa/toolbox/era5/aggregate.py | 213 +++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 openhexa/toolbox/era5/aggregate.py

diff --git a/openhexa/toolbox/era5/aggregate.py b/openhexa/toolbox/era5/aggregate.py
new file mode 100644
index 0000000..fb00fe2
--- /dev/null
+++ b/openhexa/toolbox/era5/aggregate.py
@@ -0,0 +1,213 @@
+"""Module for spatial and temporal aggregation of ERA5 data."""
+
+from datetime import datetime
+from pathlib import Path
+
+import geopandas as gpd
+import numpy as np
+import polars as pl
+import rasterio
+import rasterio.transform
+import xarray as xr
+
+
+def clip_dataset(ds: xr.Dataset, xmin: float, ymin: float, xmax: float, ymax: float) -> xr.Dataset:
+    """Keep only the part of an xarray dataset located inside a bounding box.
+
+    The lat & lon dimensions must be named "latitude" and "longitude". Source longitudes are
+    expected to be in the range [0, 360]: they are converted to [-180, 180] and sorted before
+    the bounding box is applied.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        Dataset to clip.
+    xmin : float
+        Western bound (minimum longitude).
+    ymin : float
+        Southern bound (minimum latitude).
+    xmax : float
+        Eastern bound (maximum longitude).
+    ymax : float
+        Northern bound (maximum latitude).
+
+    Returns
+    -------
+    xr.Dataset
+        Dataset restricted to the cells whose coordinates are within the bounds (included).
+    """
+    ds = ds.assign_coords(longitude=((ds.longitude + 180) % 360) - 180).sortby("longitude")
+    for coord, (vmin, vmax) in (("longitude", (xmin, xmax)), ("latitude", (ymin, ymax))):
+        ds = ds.where((ds[coord] >= vmin) & (ds[coord] <= vmax), drop=True)
+    return ds
+
+
+def get_transform(ds: xr.Dataset) -> rasterio.transform.Affine:
+    """Get rasterio affine transform from xarray dataset.
+
+    Coordinates are treated as pixel centers: the extent is padded by half a pixel on each side.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        Input xarray dataset.
+
+    Returns
+    -------
+    rasterio.transform.Affine
+        Rasterio affine transform.
+    """
+    lon, lat = ds.longitude.values, ds.latitude.values
+    # half pixel sizes (0 for single-cell axes, where the resolution cannot be inferred)
+    dx = (lon.max() - lon.min()) / (len(lon) - 1) / 2 if len(lon) > 1 else 0
+    dy = (lat.max() - lat.min()) / (len(lat) - 1) / 2 if len(lat) > 1 else 0
+    return rasterio.transform.from_bounds(
+        lon.min() - dx, lat.min() - dy, lon.max() + dx, lat.max() + dy, len(lon), len(lat)
+    )
+
+
+def build_masks(boundaries: gpd.GeoDataFrame, height: int, width: int, transform: rasterio.Affine) -> np.ndarray:
+    """Build binary masks for all geometries in a dataframe.
+
+    We build a raster of shape (n_boundaries, n_height, n_width) in order to store one binary mask
+    per boundary. Boundaries shapes cannot be stored in a single array as we want masks to overlap
+    if needed.
+
+    Parameters
+    ----------
+    boundaries : gpd.GeoDataFrame
+        Input GeoDataFrame containing the boundaries.
+    height : int
+        Height of the raster (number of pixels)
+    width : int
+        Width of the raster (number of pixels)
+    transform : rasterio.Affine
+        Raster affine transform
+
+    Returns
+    -------
+    np.ndarray
+        Binary masks as a numpy ndarray of shape (n_boundaries, height, width)
+    """
+    from rasterio.features import rasterize  # `import rasterio` alone may not load this submodule
+
+    masks = np.zeros(shape=(len(boundaries), height, width), dtype=np.bool_)
+    for i, geom in enumerate(boundaries.geometry):
+        mask = rasterize(
+            shapes=[geom.__geo_interface__],
+            out_shape=(height, width),
+            fill=0,
+            default_value=1,
+            all_touched=True,
+            transform=transform,
+        )
+        masks[i, :, :] = mask == 1
+    return masks
+
+
+def merge(data_dir: Path | str) -> xr.Dataset:
+    """Merge all .grib files in a directory into a single xarray dataset.
+
+    If multiple values are available for a given time, step, longitude & latitude dimensions, the
+    maximum value is kept.
+
+    Parameters
+    ----------
+    data_dir : Path | str
+        Directory containing the .grib files.
+
+    Returns
+    -------
+    xr.Dataset
+        Merged xarray dataset with time, step, longitude and latitude dimensions.
+
+    Raises
+    ------
+    ValueError
+        If the directory does not contain any .grib file.
+    """
+    fpaths = sorted(Path(data_dir).glob("*.grib"))
+    if not fpaths:
+        raise ValueError(f"No .grib file found in {data_dir}")
+    return xr.concat([xr.open_dataset(fp, engine="cfgrib") for fp in fpaths], dim="tmp_dim").max(dim="tmp_dim")
+
+
+def _np_to_datetime(dt64: np.datetime64) -> datetime:
+    """Convert a numpy datetime64 to a naive datetime, without any local timezone shift."""
+    # `datetime.fromtimestamp()` would apply the local timezone and could move the value to another
+    # day; numpy's own conversion (at second precision) keeps the original date and time fields.
+    return dt64.astype("datetime64[s]").item()
+
+
+def _has_missing_data(da: xr.DataArray) -> bool:
+    """Tell whether at least one step of the DataArray contains only null values.
+
+    Such a step means that not all hours have measurements.
+    """
+    # stops at the first step for which all values are null
+    return any(bool(da.sel(step=step).isnull().all()) for step in da.step)
+
+
+def aggregate(ds: xr.Dataset, var: str, masks: np.ndarray, boundaries_id: list[str]) -> pl.DataFrame:
+    """Compute daily statistics per boundary from hourly measurements.
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        Input xarray dataset with time, step, longitude and latitude dimensions
+    var : str
+        Variable to aggregate (ex: "t2m" or "tp")
+    masks : np.ndarray
+        Binary masks as a numpy ndarray of shape (n_boundaries, height, width)
+    boundaries_id : list[str]
+        List of boundary IDs (same order as n_boundaries dimension in masks)
+
+    Returns
+    -------
+    pl.DataFrame
+        One row per day and boundary, with "boundary_id", "date", "mean", "min" and "max" columns.
+
+    Notes
+    -----
+    Days for which `_has_missing_data()` is true are left out of the output.
+
+    Measurements are first aggregated in time: for each day, the mean, min and max along the
+    "step" dimension are computed.
+
+    They are then aggregated in space: for each boundary, each daily statistic is averaged over
+    the cells selected by the boundary mask. The averages of daily means, daily min and daily max
+    are stored in the "mean", "min" and "max" columns.
+    """
+    # columns of the output dataframe
+    schema = {
+        "boundary_id": pl.String,
+        "date": pl.Date,
+        "mean": pl.Float64,
+        "min": pl.Float64,
+        "max": pl.Float64,
+    }
+
+    rows = []
+    for day in ds.time.values:
+        da = ds[var].sel(time=day)
+        if _has_missing_data(da):
+            continue
+
+        # temporal aggregation: one array per daily statistic
+        daily = {
+            "mean": da.mean(dim="step").values,
+            "min": da.min(dim="step").values,
+            "max": da.max(dim="step").values,
+        }
+        date = _np_to_datetime(day).date()
+
+        # spatial aggregation: average each daily statistic over the cells of the boundary
+        for i, uid in enumerate(boundaries_id):
+            selected = masks[i]
+            row = {"boundary_id": uid, "date": date}
+            for stat, values in daily.items():
+                row[stat] = values[selected].mean()
+            rows.append(row)
+
+    df = pl.DataFrame(data=rows, schema=schema)
+    return df

From 7674307e1d0b3ffb1a1fddfb4dcbd9cddd4e7cb3 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Tue, 15 Oct 2024 17:45:37 +0200
Subject: [PATCH 16/17] docs(era5): add era5 README

---
 openhexa/toolbox/era5/README.md | 163 ++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 openhexa/toolbox/era5/README.md

diff --git a/openhexa/toolbox/era5/README.md b/openhexa/toolbox/era5/README.md
new file mode 100644
index 0000000..3a7b6f3
--- /dev/null
+++ b/openhexa/toolbox/era5/README.md
@@ -0,0 +1,163 @@
+# OpenHEXA Toolbox ERA5
+
+The package contains ETL classes and functions to acquire and process ERA5-Land data. ERA5-Land
+provides hourly information of surface variables from 1950 to 5 days before the current date, with
+a ~9 km spatial resolution. See [ERA5-Land: data
+documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation) for more
+information.
+
+## Usage
+
+The package contains 3 modules:
+* `openhexa.toolbox.era5.cds`: download ERA5-land products from the Copernicus [Climate Data Store](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=overview)
+* `openhexa.toolbox.era5.google`: download ERA5 products from Google Cloud [Public Datasets](https://cloud.google.com/storage/docs/public-datasets/era5)
+* `openhexa.toolbox.era5.aggregate`: aggregate ERA5 data in space and time
+
+### Download from CDS
+
+To download products from the Climate Data Store, you need to create an ECMWF account and generate an API key (see [CDS](https://cds.climate.copernicus.eu/)).
+
+```python
+from openhexa.toolbox.era5.cds import Client
+
+cds = Client(key="<cds_api_key>")
+
+request = cds.build_request(
+    variable="2m_temperature",
+    year=2024,
+    month=4
+)
+
+cds.download(
+    request=request,
+    dst_file="data/product.grib"
+)
+```
+
+The module also contains helper functions to use bounds from a geoparquet file as an area of interest.
+
+```python
+bounds = bounds_from_file(fp=Path("data/districts.parquet"), buffer=0.5)
+
+request = cds.build_request(
+    variable="total_precipitation",
+    year=2023,
+    month=10,
+    days=[1, 2, 3, 4, 5],
+    area=bounds
+)
+
+cds.download(
+    request=request,
+    dst_file="data/product.grib"
+)
+```
+
+To download multiple products for a given period, use `Client.download_between()`:
+
+```python
+cds.download_between(
+    variable="2m_temperature",
+    start=datetime(2020, 1, 1),
+    end=datetime(2021, 6, 1),
+    dst_dir="data/raw/2m_temperature",
+    area=bounds
+)
+```
+
+Checking latest available date in the ERA5-Land dataset:
+
+```python
+cds = Client("<api_key>")
+
+cds.latest
+```
+```
+>>> datetime(2024, 10, 8)
+```
+
+### Download from Google Cloud
+
+```python
+from openhexa.toolbox.era5.google import Client
+
+google = Client()
+
+google.download(
+    variable="2m_temperature",
+    date=datetime(2024, 6, 15),
+    dst_file="data/product.nc"
+)
+```
+
+Or to download all products for a given period:
+
+```python
+# products already present in dst_dir are skipped
+google.sync(
+    variable="2m_temperature",
+    start_date=datetime(2022, 1, 1),
+    end_date=datetime(2022, 6, 1),
+    dst_dir="data"
+)
+```
+
+### Aggregation
+
+```python
+from pathlib import Path
+
+import geopandas as gpd
+from openhexa.toolbox.era5.aggregate import build_masks, merge, aggregate, get_transform
+
+boundaries = gpd.read_parquet("districts.parquet")
+data_dir = Path("data/era5/total_precipitation")
+
+ds = merge(data_dir)
+
+ncols = len(ds.longitude)
+nrows = len(ds.latitude)
+transform = get_transform(ds)
+masks = build_masks(boundaries, nrows, ncols, transform)
+
+df = aggregate(
+    ds=ds,
+    var="tp",
+    masks=masks,
+    boundaries_id=[uid for uid in boundaries["district_id"]]
+)
+
+print(df)
+```
+```
+shape: (18_410, 5)
+┌─────────────┬────────────┬───────────┬──────────┬───────────┐
+│ boundary_id ┆ date       ┆ mean      ┆ min      ┆ max       │
+│ ---         ┆ ---        ┆ ---       ┆ ---      ┆ ---       │
+│ str         ┆ date       ┆ f64       ┆ f64      ┆ f64       │
+╞═════════════╪════════════╪═══════════╪══════════╪═══════════╡
+│ mPenE8ZIBFC ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ TPgpGxUBU9y ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ AhST5ZpuCDJ ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ Lp2BjBVT63s ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ EdfRX9b9vEb ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ yhs1ecKsLOc ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ iHSJypSwlo5 ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ CTtB0TPRvWc ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ eVFAuZOzogt ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ WVEJjdJ2S15 ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ rbYGKFgupK9 ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ Nml6rVDElLh ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ E0hd8TD1M0q ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ PCg4pLGmKSM ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ C6EBhE8OnfW ┆ 2024-01-01 ┆ 0.000462  ┆ 0.0      ┆ 0.00086   │
+│ …           ┆ …          ┆ …         ┆ …        ┆ …         │
+│ CkpfOFkMyrd ┆ 2024-10-07 ┆ 1.883121  ┆ 0.001785 ┆ 2.700447  │
+│ tMXsltjzzmR ┆ 2024-10-07 ┆ 3.579136  ┆ 0.105436 ┆ 4.702504  │
+│ F0ytkh0RExg ┆ 2024-10-07 ┆ 8.415455  ┆ 0.838535 ┆ 17.08884  │
+│ …           ┆ …          ┆ …         ┆ …        ┆ …         │
+│ TTSmaRnHa82 ┆ 2024-10-07 ┆ 1.724243  ┆ 0.007809 ┆ 5.692989  │
+│ jbmw2gdrrTV ┆ 2024-10-07 ┆ 1.176629  ┆ 0.110173 ┆ 1.582995  │
+│ eKYyXbBdvmB ┆ 2024-10-07 ┆ 0.599976  ┆ 0.037771 ┆ 1.189411  │
+└─────────────┴────────────┴───────────┴──────────┴───────────┘
+```
\ No newline at end of file

From 4799b01efd96efec88aa462927b87276723c7544 Mon Sep 17 00:00:00 2001
From: Yann Forget <yannforget@mailbox.org>
Date: Tue, 15 Oct 2024 17:56:47 +0200
Subject: [PATCH 17/17] docs(era5): update era5 README

---
 openhexa/toolbox/era5/README.md | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/openhexa/toolbox/era5/README.md b/openhexa/toolbox/era5/README.md
index 3a7b6f3..0b0e651 100644
--- a/openhexa/toolbox/era5/README.md
+++ b/openhexa/toolbox/era5/README.md
@@ -6,6 +6,19 @@ a ~9 km spatial resolution. See [ERA5-Land: data
 documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation) for more
 information.
 
+Available variables include:
+* 2 metre temperature
+* Wind components
+* Leaf area index
+* Volumetric soil water layer
+* Total precipitation
+
+See [ERA5-Land data
+documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation#ERA5Land:datadocumentation-parameterlistingParameterlistings)
+for a full list of available parameters.
+
+In addition to download clients for the Copernicus [Climate Data Store](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land?tab=overview) and [Google Public Datasets](https://cloud.google.com/storage/docs/public-datasets/era5), the package includes an `aggregate` module to aggregate ERA5 measurements in space (geographic boundaries) and time (hourly to daily).
+
 ## Usage
 
 The package contains 3 modules:
@@ -34,7 +47,9 @@ cds.download(
 )
 ```
 
-The module also contains helper functions to use bounds from a geoparquet file as an area of interest.
+The module also contains helper functions to use bounds from a geoparquet file as an area of
+interest. Source bounds are buffered and rounded by default to make sure the required data is
+downloaded.
 
 ```python
 bounds = bounds_from_file(fp=Path("data/districts.parquet"), buffer=0.5)
@@ -76,6 +91,8 @@ cds.latest
 >>> datetime(2024, 10, 8)
 ```
 
+NB: End dates in product requests are automatically replaced by the latest available date if they are later.
+
 ### Download from Google Cloud
 
 ```python