Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure csr matrices are in "canonical format" before impact calculation #893

Merged
merged 15 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ CLIMADA tutorials. [#872](https://github.com/CLIMADA-project/climada_python/pull
- Centroids complete overhaul. Most functions should be backward compatible. Internal data is stored in a geodataframe attribute. Rasters are now stored as points, and the meta attribute is removed. Several methods were deprecated or removed. [#787](https://github.com/CLIMADA-project/climada_python/pull/787)
- Improved error messages produced by `ImpactCalc.impact()` in case impact function in the exposures is not found in impf_set [#863](https://github.com/CLIMADA-project/climada_python/pull/863)
- Changed module structure: `climada.hazard.Hazard` has been split into the modules `base`, `io` and `plot` [#871](https://github.com/CLIMADA-project/climada_python/pull/871)
- Ensure `csr_matrix` matrices stored in `climada.hazard.Hazard` have a consistent data format and store no explicit zeros [#893](https://github.com/CLIMADA-project/climada_python/pull/893)

peanutfun marked this conversation as resolved.
Show resolved Hide resolved
### Fixed

Expand Down
30 changes: 28 additions & 2 deletions climada/hazard/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
LOGGER = logging.getLogger(__name__)


class Hazard(HazardIO, HazardPlot):

Check warning on line 47 in climada/hazard/base.py

View check run for this annotation

Jenkins - WCR / Pylint

too-many-public-methods

LOW: Too many public methods (22/20)
Raw output
Used when class has too many public methods, try to reduce this to get a simpler (and so easier to use) class.
"""
Contains events of some hazard type defined at centroids. Loads from
files with format defined in FILE_EXT.
Expand Down Expand Up @@ -90,8 +90,8 @@
'centroids',
'event_id',
'frequency',
'intensity',
'fraction'
'_intensity',
'_fraction'
}
"""Name of the variables needed to compute the impact. Types: scalar, str,
list, 1dim np.array of size num_events, scipy.sparse matrix of shape
Expand Down Expand Up @@ -191,6 +191,32 @@
if self.pool:
LOGGER.info('Using %s CPUs.', self.pool.ncpus)

@property
def intensity(self) -> sparse.csr_matrix:
"""Hazard intensity matrix"""
return self._intensity

@intensity.setter
def intensity(self, value: sparse.csr_matrix):
"""Set intensity matrix to new value"""
self._intensity = value
self._intensity.check_format()
self._intensity.eliminate_zeros()
self._intensity.sum_duplicates()
peanutfun marked this conversation as resolved.
Show resolved Hide resolved

@property
def fraction(self) -> sparse.csr_matrix:
"""Hazard fraction matrix"""
return self._fraction

@fraction.setter
def fraction(self, value: sparse.csr_matrix):
"""Set fraction matrix to new value"""
self._fraction = value
self._fraction.check_format()
self._fraction.eliminate_zeros()
self._fraction.sum_duplicates()

@classmethod
def get_default(cls, attribute):
"""Get the Hazard type default for a given attribute.
Expand Down
21 changes: 17 additions & 4 deletions climada/hazard/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,12 +1004,18 @@ def write_hdf5(self, file_name, todense=False):
LOGGER.info('Writing %s', file_name)
with h5py.File(file_name, 'w') as hf_data:
str_dt = h5py.special_dtype(vlen=str)
for (var_name, var_val) in self.__dict__.items():
for var_name in self.__dict__:
if var_name == 'centroids':
# Centroids have their own write_hdf5 method,
# which is invoked at the end of this method (s.b.)
pass
elif isinstance(var_val, sparse.csr_matrix):
continue
# Prune private attributes
if var_name in self.vars_oblig:
var_name = var_name.lstrip("_")

var_val = getattr(self, var_name) # Also works with properties

if isinstance(var_val, sparse.csr_matrix):
if todense:
hf_data.create_dataset(var_name, data=var_val.toarray())
else:
Expand Down Expand Up @@ -1065,11 +1071,18 @@ def from_hdf5(cls, file_name):
haz = cls()
hazard_kwargs = dict()
with h5py.File(file_name, 'r') as hf_data:
for (var_name, var_val) in haz.__dict__.items():
for var_name in haz.__dict__:
# Prune private attributes
if var_name in haz.vars_oblig:
var_name = var_name.lstrip("_")

if var_name not in hf_data.keys():
continue
if var_name == 'centroids':
continue

var_val = getattr(haz, var_name) # Also works with properties

if isinstance(var_val, np.ndarray) and var_val.ndim == 1:
hazard_kwargs[var_name] = np.array(hf_data.get(var_name))
elif isinstance(var_val, sparse.csr_matrix):
Expand Down
42 changes: 36 additions & 6 deletions climada/hazard/test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,18 @@ def test_check_wrongFreq_fail(self):
def test_check_wrongInten_fail(self):
"""Wrong hazard definition"""
self.hazard.intensity = sparse.csr_matrix([[1, 2], [1, 2]])

with self.assertRaises(ValueError) as cm:
with self.assertRaisesRegex(
ValueError, "Invalid Hazard._intensity row size: 3 != 2."
):
self.hazard.check()
self.assertIn('Invalid Hazard.intensity row size: 3 != 2.', str(cm.exception))

def test_check_wrongFrac_fail(self):
"""Wrong hazard definition"""
self.hazard.fraction = sparse.csr_matrix([[1], [1], [1]])

with self.assertRaises(ValueError) as cm:
with self.assertRaisesRegex(
ValueError, "Invalid Hazard._fraction column size: 2 != 1."
):
self.hazard.check()
self.assertIn('Invalid Hazard.fraction column size: 2 != 1.', str(cm.exception))

def test_check_wrongEvName_fail(self):
"""Wrong hazard definition"""
Expand Down Expand Up @@ -212,6 +212,36 @@ def test_get_date_strings_pass(self):
self.assertEqual(haz.get_event_date()[560],
u_dt.date_to_str(haz.date[560]))

def test_matrix_consistency(self):
"""Check that the csr_matrix is brought in canonical format"""
# Non-canonical: First three data points will be summed onto the first matrix
# entry, fourth will be an explicit zero entry
data = [0, 1, 2, 0]
indices = [0, 0, 0, 1]
indptr = [0, 4, 4, 4]
matrix = sparse.csr_matrix((data, indices, indptr), shape=(3, 2))
np.testing.assert_array_equal(matrix.data, data)
np.testing.assert_array_equal(matrix[0, [0, 1]].toarray(), [[3, 0]])
self.assertEqual(matrix.nnz, 4)
self.assertFalse(matrix.has_canonical_format)
peanutfun marked this conversation as resolved.
Show resolved Hide resolved

def check_canonical_matrix(mat):
self.assertTrue(mat.has_canonical_format)
self.assertEqual(mat[0, 0], 3)
np.testing.assert_array_equal(mat.data, [3])
self.assertEqual(mat.nnz, 1)

# Check canonical format when initializing
hazard_new = Hazard("TC", intensity=matrix.copy(), fraction=matrix.copy())
matrix_attrs = ("intensity", "fraction")
for attr in matrix_attrs:
check_canonical_matrix(getattr(hazard_new, attr))

# Check conversion to canonical format when assigning
for attr in ("intensity", "fraction"):
setattr(self.hazard, attr, matrix.copy())
check_canonical_matrix(getattr(self.hazard, attr))

class TestRemoveDupl(unittest.TestCase):
"""Test remove_duplicates method."""

Expand Down
Loading