Load hazard raster data lazily #578

Merged 14 commits on Jan 17, 2023

208 changes: 110 additions & 98 deletions climada/hazard/base.py
@@ -38,6 +38,7 @@
import rasterio
from rasterio.features import rasterize
from rasterio.warp import reproject, Resampling, calculate_default_transform
import sparse as sp
from scipy import sparse
import xarray as xr
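
Side note (not part of the diff): ``sparse`` here is the pydata/sparse package, imported as ``sp``, which provides n-dimensional COO arrays that dask can use as chunks, while ``scipy.sparse`` continues to provide the 2-D CSR matrices that ``Hazard`` ultimately stores. A minimal sketch of how the two interoperate, assuming both packages are installed (toy data, hypothetical names):

import numpy as np
import sparse as sp
from scipy import sparse

dense = np.array([[0.0, 1.5, 0.0], [0.0, 0.0, 2.5]])
coo = sp.COO.from_numpy(dense)  # n-dimensional pydata/sparse array
csr = coo.tocsr()               # 2-D only: returns a scipy.sparse.csr_matrix
assert isinstance(csr, sparse.csr_matrix)
assert np.array_equal(csr.toarray(), dense)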

@@ -445,7 +446,7 @@ def from_raster_xarray(
Parameters
----------
data : xarray.Dataset or str
The data to read from. May be a opened dataset or a path to a raster data
The data to read from. May be an opened dataset or a path to a raster data
file, in which case the file is opened first. Works with any file format
supported by ``xarray``.
hazard_type : str
@@ -458,7 +459,9 @@
coordinate_vars : dict(str, str), optional
Mapping from default coordinate names to coordinate names used in the data
to read. The default is
``dict(event="time", longitude="longitude", latitude="latitude")``
``dict(event="time", longitude="longitude", latitude="latitude")``, as most
of the commonly used hazard data happens to have a "time" attribute but no
"event" attribute.
data_vars : dict(str, str), optional
Mapping from default variable names to variable names used in the data
to read. The default names are ``fraction``, ``hazard_type``, ``frequency``,
@@ -519,135 +522,135 @@ def from_raster_xarray(
expected names.

>>> dset = xr.Dataset(
>>> dict(
>>> intensity=(
>>> ["time", "latitude", "longitude"],
>>> [[[0, 1, 2], [3, 4, 5]]],
>>> )
>>> ),
>>> dict(
>>> time=[datetime.datetime(2000, 1, 1)],
>>> latitude=[0, 1],
>>> longitude=[0, 1, 2],
>>> ),
>>> )
... dict(
... intensity=(
... ["time", "latitude", "longitude"],
... [[[0, 1, 2], [3, 4, 5]]],
... )
... ),
... dict(
... time=[datetime.datetime(2000, 1, 1)],
... latitude=[0, 1],
... longitude=[0, 1, 2],
... ),
... )
>>> hazard = Hazard.from_raster_xarray(dset, "", "")

For non-default coordinate names, use the ``coordinate_vars`` argument.

>>> dset = xr.Dataset(
>>> dict(
>>> intensity=(
>>> ["day", "lat", "longitude"],
>>> [[[0, 1, 2], [3, 4, 5]]],
>>> )
>>> ),
>>> dict(
>>> day=[datetime.datetime(2000, 1, 1)],
>>> lat=[0, 1],
>>> longitude=[0, 1, 2],
>>> ),
>>> )
... dict(
... intensity=(
... ["day", "lat", "longitude"],
... [[[0, 1, 2], [3, 4, 5]]],
... )
... ),
... dict(
... day=[datetime.datetime(2000, 1, 1)],
... lat=[0, 1],
... longitude=[0, 1, 2],
... ),
... )
>>> hazard = Hazard.from_raster_xarray(
>>> dset, "", "", coordinate_vars=dict(event="day", latitude="lat")
>>> )
... dset, "", "", coordinate_vars=dict(event="day", latitude="lat")
... )

Coordinates can be different from the actual dataset dimensions. The following
loads the data with coordinates ``longitude`` and ``latitude`` (default names):

>>> dset = xr.Dataset(
>>> dict(intensity=(["time", "y", "x"], [[[0, 1, 2], [3, 4, 5]]])),
>>> dict(
>>> time=[datetime.datetime(2000, 1, 1)],
>>> y=[0, 1],
>>> x=[0, 1, 2],
>>> longitude=(["y", "x"], [[0.0, 0.1, 0.2], [0.0, 0.1, 0.2]]),
>>> latitude=(["y", "x"], [[0.0, 0.0, 0.0], [0.1, 0.1, 0.1]]),
>>> ),
>>> )
... dict(intensity=(["time", "y", "x"], [[[0, 1, 2], [3, 4, 5]]])),
... dict(
... time=[datetime.datetime(2000, 1, 1)],
... y=[0, 1],
... x=[0, 1, 2],
... longitude=(["y", "x"], [[0.0, 0.1, 0.2], [0.0, 0.1, 0.2]]),
... latitude=(["y", "x"], [[0.0, 0.0, 0.0], [0.1, 0.1, 0.1]]),
... ),
... )
>>> hazard = Hazard.from_raster_xarray(dset, "", "")

Optional data is read from the dataset if the default keys are found. Users can
specify custom variables in the data, or that the default keys should be ignored,
with the ``data_vars`` argument.

>>> dset = xr.Dataset(
>>> dict(
>>> intensity=(
>>> ["time", "latitude", "longitude"],
>>> [[[0, 1, 2], [3, 4, 5]]],
>>> ),
>>> fraction=(
>>> ["time", "latitude", "longitude"],
>>> [[[0.0, 0.1, 0.2], [0.3, 0.4, 0.5]]],
>>> ),
>>> freq=(["time"], [0.4]),
>>> event_id=(["time"], [4]),
>>> ),
>>> dict(
>>> time=[datetime.datetime(2000, 1, 1)],
>>> latitude=[0, 1],
>>> longitude=[0, 1, 2],
>>> ),
>>> )
... dict(
... intensity=(
... ["time", "latitude", "longitude"],
... [[[0, 1, 2], [3, 4, 5]]],
... ),
... fraction=(
... ["time", "latitude", "longitude"],
... [[[0.0, 0.1, 0.2], [0.3, 0.4, 0.5]]],
... ),
... freq=(["time"], [0.4]),
... event_id=(["time"], [4]),
... ),
... dict(
... time=[datetime.datetime(2000, 1, 1)],
... latitude=[0, 1],
... longitude=[0, 1, 2],
... ),
... )
>>> hazard = Hazard.from_raster_xarray(
>>> dset,
>>> "",
>>> "",
>>> data_vars=dict(
>>> # Load frequency from 'freq' array
>>> frequency="freq",
>>> # Ignore 'event_id' array and use default instead
>>> event_id="",
>>> # 'fraction' array is loaded because it has the default name
>>> ),
>>> )
... dset,
... "",
... "",
... data_vars=dict(
... # Load frequency from 'freq' array
... frequency="freq",
... # Ignore 'event_id' array and use default instead
... event_id="",
... # 'fraction' array is loaded because it has the default name
... ),
... )
>>> np.array_equal(hazard.frequency, [0.4]) and np.array_equal(
>>> hazard.event_id, [1]
>>> )
... hazard.event_id, [1]
... )
True

If you read single-event data, your dataset will probably not have a time
dimension. As long as a time *coordinate* exists, however, this method will
automatically promote it to a dataset dimension and load the data:

>>> dset = xr.Dataset(
>>> dict(
>>> intensity=(
>>> ["latitude", "longitude"],
>>> [[0, 1, 2], [3, 4, 5]],
>>> )
>>> ),
>>> dict(
>>> time=[datetime.datetime(2000, 1, 1)],
>>> latitude=[0, 1],
>>> longitude=[0, 1, 2],
>>> ),
>>> )
... dict(
... intensity=(
... ["latitude", "longitude"],
... [[0, 1, 2], [3, 4, 5]],
... )
... ),
... dict(
... time=[datetime.datetime(2000, 1, 1)],
... latitude=[0, 1],
... longitude=[0, 1, 2],
... ),
... )
>>> hazard = Hazard.from_raster_xarray(dset, "", "") # Same as first example

If one coordinate is missing altogether, you must add it or expand the dimensions
before loading the dataset:

>>> dset = xr.Dataset(
>>> dict(
>>> intensity=(
>>> ["latitude", "longitude"],
>>> [[0, 1, 2], [3, 4, 5]],
>>> )
>>> ),
>>> dict(
>>> latitude=[0, 1],
>>> longitude=[0, 1, 2],
>>> ),
>>> )
... dict(
... intensity=(
... ["latitude", "longitude"],
... [[0, 1, 2], [3, 4, 5]],
... )
... ),
... dict(
... latitude=[0, 1],
... longitude=[0, 1, 2],
... ),
... )
>>> dset = dset.expand_dims(time=[numpy.datetime64("2000-01-01")])
>>> hazard = Hazard.from_raster_xarray(dset, "", "")
"""
# If the data is a string, open the respective file
if not isinstance(data, xr.Dataset):
LOGGER.info("Loading Hazard from file: %s", data)
data: xr.Dataset = xr.open_dataset(data)
data: xr.Dataset = xr.open_dataset(data, chunks="auto")
else:
LOGGER.info("Loading Hazard from xarray Dataset")
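
A note on the ``chunks="auto"`` change above: opening the dataset with chunks makes xarray back each variable with a lazy dask array, so nothing is read from disk until the data is actually computed. A rough sketch of that behaviour, assuming dask is installed and using ``example.nc`` and ``intensity`` as placeholder names:

import xarray as xr

dset = xr.open_dataset("example.nc", chunks="auto")  # lazy, dask-backed
print(type(dset["intensity"].data))       # dask array wrapper, no values read yet
intensity = dset["intensity"].compute()   # only now are the chunks loaded into memory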

@@ -713,12 +716,20 @@ def from_raster_xarray(
data[coords["latitude"]].values, data[coords["longitude"]].values, crs=crs,
)

def to_csr_matrix(array: np.ndarray) -> sparse.csr_matrix:
def to_csr_matrix(array: xr.DataArray) -> sparse.csr_matrix:
"""Store a numpy array as sparse matrix, optimizing storage space

The CSR matrix stores NaNs explicitly, so we set them to zero.
"""
return sparse.csr_matrix(np.where(np.isnan(array), 0, array))
array = array.where(array.notnull(), 0)
array = xr.apply_ufunc(
sp.COO.from_numpy,
array,
dask="parallelized",
output_dtypes=[array.dtype]
)
sparse_coo = array.compute().data # Load into memory
return sparse_coo.tocsr() # Convert sparse.COO to scipy.sparse.csr_matrix

# Read the intensity data
LOGGER.debug("Loading Hazard intensity from DataArray '%s'", intensity)
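
The pattern in the new ``to_csr_matrix`` above can be illustrated in isolation: each dask chunk is turned into a pydata/sparse ``COO`` block through ``xr.apply_ufunc``, and only the final ``compute()`` materialises the result, which is then converted to a SciPy CSR matrix. A self-contained sketch with a tiny chunked array (toy data and dimension names, not from the PR):

import numpy as np
import sparse as sp
import xarray as xr
from scipy import sparse

arr = xr.DataArray(
    [[0.0, np.nan, 2.0], [0.0, 0.0, 3.0]],
    dims=("event", "cell"),
).chunk({"event": 1})              # dask-backed, one chunk per event
arr = arr.where(arr.notnull(), 0)  # CSR stores NaNs explicitly, so zero them first
arr = xr.apply_ufunc(
    sp.COO.from_numpy,
    arr,
    dask="parallelized",
    output_dtypes=[arr.dtype],
)                                  # each chunk becomes a sparse.COO block
csr = arr.compute().data.tocsr()   # materialise, then convert to scipy CSR
assert isinstance(csr, sparse.csr_matrix)
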
@@ -788,9 +799,9 @@ def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray:
],
# The accessor for the data in the Dataset
accessor=[
lambda x: to_csr_matrix(default_accessor(x)),
to_csr_matrix,
lambda x: maybe_repeat(default_accessor(x), num_events),
lambda x: strict_positive_int_accessor(x),
strict_positive_int_accessor,
lambda x: list(maybe_repeat(default_accessor(x), num_events).flat),
lambda x: maybe_repeat(date_to_ordinal_accessor(x), num_events),
],
@@ -827,7 +838,7 @@ def load_from_xarray_or_return_default(
Does the following based on the ``user_key``:
* If the key is an empty string, return the default value
* If the key is a non-empty string, load the data for that key and return it.
* If the key is ``None``, look for the``default_key`` in the data. If it
* If the key is ``None``, look for the ``default_key`` in the data. If it
exists, return that data. If not, return the default value.

Parameters
@@ -2415,7 +2426,8 @@ def centr_exp_col(self):
in an exposures gdf. E.g. "centr_TC"

"""
from climada.entity.exposures import INDICATOR_CENTR
from climada.entity.exposures import INDICATOR_CENTR # pylint: disable=import-outside-toplevel
# importing outside the toplevel is necessary to avoid a circular import
return INDICATOR_CENTR + self.tag.haz_type

@property
16 changes: 10 additions & 6 deletions climada/hazard/test/test_base_xarray.py
@@ -20,8 +20,10 @@
"""

import unittest
from unittest.mock import patch, MagicMock
from unittest.mock import patch
import datetime as dt
from pathlib import Path

import numpy as np
from scipy.sparse import csr_matrix

@@ -31,8 +33,6 @@
from climada.hazard.base import Hazard
from climada.util.constants import DEF_CRS

from pathlib import Path


class TestReadDefaultNetCDF(unittest.TestCase):
"""Test reading a NetCDF file where the coordinates to read match the dimensions"""
@@ -120,9 +120,13 @@ def test_load_path(self):

def test_load_dataset(self):
"""Load the data from an opened dataset as argument"""
dataset = xr.open_dataset(self.netcdf_path)
hazard = Hazard.from_raster_xarray(dataset, "", "")
self._assert_default(hazard)
def _load_and_assert(chunks):
dataset = xr.open_dataset(self.netcdf_path, chunks=chunks)
hazard = Hazard.from_raster_xarray(dataset, "", "")
self._assert_default(hazard)

_load_and_assert(chunks=None)
_load_and_assert(chunks=dict(latitude=1, longitude=1, time=1))

def test_type_and_unit(self):
"""Test passing a custom type and unit"""
2 changes: 1 addition & 1 deletion requirements/env_climada.yml
@@ -31,6 +31,7 @@ dependencies:
- salib>=1.3.0
- scikit-learn>=1.0
- scipy>=1.6
- sparse>=0.13
- statsmodels>=0.11
- tabulate>=0.8
- tqdm>=4.48
@@ -44,4 +45,3 @@ dependencies:
- peewee>=3.14
- pybufrkit>=0.2
- SALib==1.3.12
- sparse
1 change: 1 addition & 0 deletions setup.py
@@ -68,6 +68,7 @@
'salib',
'scikit-learn',
'statsmodels',
'sparse',
'tables',
'tabulate',
'tqdm',