diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e4265df0..2335237c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ CLIMADA tutorials. [#872](https://github.com/CLIMADA-project/climada_python/pull - Improved error messages produced by `ImpactCalc.impact()` in case impact function in the exposures is not found in impf_set [#863](https://github.com/CLIMADA-project/climada_python/pull/863) - Update the Holland et al. 2010 TC windfield model and introduce `model_kwargs` parameter to adjust model parameters [#846](https://github.com/CLIMADA-project/climada_python/pull/846) - Changed module structure: `climada.hazard.Hazard` has been split into the modules `base`, `io` and `plot` [#871](https://github.com/CLIMADA-project/climada_python/pull/871) +- Ensure `csr_matrix` stored in `climada.hazard.Hazard` have consistent data format and store no explicit zeros when initializing `ImpactCalc` [#893](https://github.com/CLIMADA-project/climada_python/pull/893) - `Impact.from_hdf5` now calls `str` on `event_name` data that is not strings, and issue a warning then [#894](https://github.com/CLIMADA-project/climada_python/pull/894) - `Impact.write_hdf5` now throws an error if `event_name` is does not contain strings exclusively [#894](https://github.com/CLIMADA-project/climada_python/pull/894) - Split `climada.hazard.trop_cyclone` module into smaller submodules without affecting module usage [#911](https://github.com/CLIMADA-project/climada_python/pull/911) @@ -39,6 +40,7 @@ CLIMADA tutorials. [#872](https://github.com/CLIMADA-project/climada_python/pull ### Added +- Method `Hazard.check_matrices` for bringing the stored CSR matrices into "canonical format" [#893](https://github.com/CLIMADA-project/climada_python/pull/893) - Generic s-shaped impact function via `ImpactFunc.from_poly_s_shape` [#878](https://github.com/CLIMADA-project/climada_python/pull/878) - climada.hazard.centroids.centr.Centroids.get_area_pixel - climada.hazard.centroids.centr.Centroids.get_dist_coast diff --git a/climada/engine/impact_calc.py b/climada/engine/impact_calc.py index 315a02c8a..f050607c5 100644 --- a/climada/engine/impact_calc.py +++ b/climada/engine/impact_calc.py @@ -47,6 +47,8 @@ def __init__(self, The dimension of the imp_mat variable must be compatible with the exposures and hazard objects. + This will call :py:meth:`climada.hazard.base.Hazard.check_matrices`. + Parameters ---------- exposures : climada.entity.Exposures @@ -61,6 +63,8 @@ def __init__(self, self.exposures = exposures self.impfset = impfset self.hazard = hazard + self.hazard.check_matrices() + # exposures index to use for matrix reconstruction self._orig_exp_idx = np.arange(self.exposures.gdf.shape[0]) diff --git a/climada/engine/test/test_impact_calc.py b/climada/engine/test/test_impact_calc.py index 471706988..c4ee5e26b 100644 --- a/climada/engine/test/test_impact_calc.py +++ b/climada/engine/test/test_impact_calc.py @@ -70,6 +70,16 @@ def test_init(self): np.testing.assert_array_equal(HAZ.event_id, icalc.hazard.event_id) np.testing.assert_array_equal(HAZ.event_name, icalc.hazard.event_name) + # Test check matrices + hazard = deepcopy(HAZ) + hazard.intensity[0, hazard.intensity.indices[0]] = 0 + hazard.fraction = sparse.csr_matrix(np.ones((1, 1))) + with self.assertRaisesRegex( + ValueError, "Intensity and fraction matrices must have the same shape" + ): + ImpactCalc(ENT.exposures, ENT.impact_funcs, hazard) + self.assertEqual(hazard.intensity.nnz, HAZ.intensity.nnz - 1) # was pruned + def test_metrics(self): """Test methods to get impact metrics""" mat = sparse.csr_matrix(np.array( diff --git a/climada/hazard/base.py b/climada/hazard/base.py index e69d80c4e..f0d1f4744 100644 --- a/climada/hazard/base.py +++ b/climada/hazard/base.py @@ -50,6 +50,21 @@ class Hazard(HazardIO, HazardPlot): Contains events of some hazard type defined at centroids. Loads from files with format defined in FILE_EXT. + Attention + --------- + This class uses instances of + `scipy.sparse.csr_matrix + `_ + to store :py:attr:`intensity` and :py:attr:`fraction`. This data types comes with + its particular pitfalls. Depending on how the objects are instantiated and modified, + a matrix might end up in a "non-canonical" state. In this state, its ``.data`` + attribute does not necessarily represent the values apparent in the final matrix. + In particular, a "non-canonical" matrix may store "duplicates", i.e. multiple values + that map to the same matrix position. This is supported, and the default behavior is + to sum up these values. To avoid any inconsistencies, call :py:meth:`check_matrices` + before accessing the ``data`` attribute of either matrix. This will explicitly sum + all values at the same matrix position and eliminate explicit zeros. + Attributes ---------- haz_type : str @@ -192,6 +207,33 @@ def __init__(self, if self.pool: LOGGER.info('Using %s CPUs.', self.pool.ncpus) + def check_matrices(self): + """Ensure that matrices are consistently shaped and stored + + It is good practice to call this method before accessing the ``data`` attribute + of either :py:attr:`intensity` or :py:attr:`fraction`. + + See Also + -------- + :py:func:`climada.util.checker.prune_csr_matrix` + + Todo + ----- + * Check consistency with centroids + + Raises + ------ + ValueError + If matrices are ill-formed or ill-shaped in relation to each other + """ + u_check.prune_csr_matrix(self.intensity) + u_check.prune_csr_matrix(self.fraction) + if self.fraction.nnz > 0: + if self.intensity.shape != self.fraction.shape: + raise ValueError( + "Intensity and fraction matrices must have the same shape" + ) + @classmethod def get_default(cls, attribute): """Get the Hazard type default for a given attribute. diff --git a/climada/hazard/io.py b/climada/hazard/io.py index d50dcfc1d..5248d4579 100644 --- a/climada/hazard/io.py +++ b/climada/hazard/io.py @@ -1008,7 +1008,7 @@ def write_hdf5(self, file_name, todense=False): if var_name == 'centroids': # Centroids have their own write_hdf5 method, # which is invoked at the end of this method (s.b.) - pass + continue elif isinstance(var_val, sparse.csr_matrix): if todense: hf_data.create_dataset(var_name, data=var_val.toarray()) diff --git a/climada/hazard/test/test_base.py b/climada/hazard/test/test_base.py index f9c53d425..3dc7d8db9 100644 --- a/climada/hazard/test/test_base.py +++ b/climada/hazard/test/test_base.py @@ -124,18 +124,18 @@ def test_check_wrongFreq_fail(self): def test_check_wrongInten_fail(self): """Wrong hazard definition""" self.hazard.intensity = sparse.csr_matrix([[1, 2], [1, 2]]) - - with self.assertRaises(ValueError) as cm: + with self.assertRaisesRegex( + ValueError, "Invalid Hazard.intensity row size: 3 != 2." + ): self.hazard.check() - self.assertIn('Invalid Hazard.intensity row size: 3 != 2.', str(cm.exception)) def test_check_wrongFrac_fail(self): """Wrong hazard definition""" self.hazard.fraction = sparse.csr_matrix([[1], [1], [1]]) - - with self.assertRaises(ValueError) as cm: + with self.assertRaisesRegex( + ValueError, "Invalid Hazard.fraction column size: 2 != 1." + ): self.hazard.check() - self.assertIn('Invalid Hazard.fraction column size: 2 != 1.', str(cm.exception)) def test_check_wrongEvName_fail(self): """Wrong hazard definition""" @@ -212,6 +212,32 @@ def test_get_date_strings_pass(self): self.assertEqual(haz.get_event_date()[560], u_dt.date_to_str(haz.date[560])) + def test_check_matrices(self): + """Test the check_matrices method""" + hazard = Hazard("TC") + hazard.fraction = sparse.csr_matrix(np.zeros((2, 2))) + hazard.check_matrices() # No error, fraction.nnz = 0 + hazard.fraction = sparse.csr_matrix(np.ones((2, 2))) + with self.assertRaisesRegex( + ValueError, "Intensity and fraction matrices must have the same shape" + ): + hazard.check_matrices() + hazard.intensity = sparse.csr_matrix(np.ones((2, 3))) + with self.assertRaisesRegex( + ValueError, "Intensity and fraction matrices must have the same shape" + ): + hazard.check_matrices() + + # Check that matrices are pruned + hazard.intensity[:] = 0 + hazard.fraction = sparse.csr_matrix(([0], [0], [0, 1, 1]), shape=(2, 3)) + hazard.check_matrices() + for attr in ("intensity", "fraction"): + with self.subTest(matrix=attr): + matrix = getattr(hazard, attr) + self.assertEqual(matrix.nnz, 0) + self.assertTrue(matrix.has_canonical_format) + class TestRemoveDupl(unittest.TestCase): """Test remove_duplicates method.""" diff --git a/climada/util/checker.py b/climada/util/checker.py index c48ea3be9..17e9fa76d 100644 --- a/climada/util/checker.py +++ b/climada/util/checker.py @@ -23,7 +23,8 @@ 'size', 'shape', 'array_optional', - 'array_default' + 'array_default', + 'prune_csr_matrix', ] import logging @@ -180,3 +181,36 @@ def array_default(exp_len, var, var_name, def_val): else: size(exp_len, var, var_name) return res + +def prune_csr_matrix(matrix: sparse.csr_matrix): + """Ensure that the matrix is in the "canonical format". + + Depending on how the matrix was instantiated or modified, it might be in a + "non-canonical" state. This only relates to its internal storage. In this state, + multiple values might be stored for a single "apparent" value in the matrix. + Also, the matrix might store zeros explicitly, which could be removed. + Calling this function makes sure that the matrix is in the "canonical state", and + brings it into this state, if possible. + + See Also + -------- + `csr_matrix.has_canonical_format + `_ + + Parameters + ---------- + matrix : csr_matrix + The matrix to check. It will be modified *inplace*. Its ``.data`` attribute + might change, but apparent matrix values will stay the same. + + Raises + ------ + ValueError + If + `csr_matrix.check_format + `_ + fails + """ + matrix.check_format() + matrix.eliminate_zeros() + matrix.sum_duplicates() diff --git a/climada/util/test/test_checker.py b/climada/util/test/test_checker.py index f75a8d78c..391191a7e 100644 --- a/climada/util/test/test_checker.py +++ b/climada/util/test/test_checker.py @@ -87,6 +87,28 @@ def test_check_optionals_fail(self): u_check.check_optionals(dummy.__dict__, dummy.vars_opt, "DummyClass.", dummy.id.size) self.assertIn('Invalid DummyClass.list size: 25 != 3.', str(cm.exception)) + def test_prune_csr_matrix(self): + """Check that csr matrices are brought into canonical format""" + # Non-canonical: First three data points will be summed onto the first matrix + # entry, fourth will be an explicit zero entry + data = [0, 1, 2, 0] + indices = [0, 0, 0, 1] + indptr = [0, 4, 4, 4] + matrix = sparse.csr_matrix((data, indices, indptr), shape=(3, 2)) + + # These checks just make sure that we understand how csr_matrix works + np.testing.assert_array_equal(matrix.data, data) + np.testing.assert_array_equal(matrix[0, [0, 1]].toarray(), [[3, 0]]) + self.assertEqual(matrix.nnz, 4) + self.assertFalse(matrix.has_canonical_format) + + # Now test our function + u_check.prune_csr_matrix(matrix) + self.assertTrue(matrix.has_canonical_format) + self.assertEqual(matrix[0, 0], 3) + np.testing.assert_array_equal(matrix.data, [3]) + self.assertEqual(matrix.nnz, 1) + # Execute Tests if __name__ == "__main__": TESTS = unittest.TestLoader().loadTestsFromTestCase(TestChecks)