CLIMADA-project · peanutfun · Jul 17, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ CLIMADA tutorials. [#872](https://github.com/CLIMADA-project/climada_python/pull
 - Centroids complete overhaul. Most function should be backward compatible. Internal data is stored in a geodataframe attribute. Raster are now stored as points, and the meta attribute is removed. Several methds were deprecated or removed. [#787](https://github.com/CLIMADA-project/climada_python/pull/787)
 - Improved error messages produced by `ImpactCalc.impact()` in case impact function in the exposures is not found in impf_set [#863](https://github.com/CLIMADA-project/climada_python/pull/863)
 - Changed module structure: `climada.hazard.Hazard` has been split into the modules `base`, `io` and `plot` [#871](https://github.com/CLIMADA-project/climada_python/pull/871)
+- Ensure that `csr_matrix` instances stored in `climada.hazard.Hazard` are in canonical format and contain no explicit zeros [#893](https://github.com/CLIMADA-project/climada_python/pull/893)
 
 ### Fixed
 

diff --git a/climada/hazard/base.py b/climada/hazard/base.py
@@ -44,7 +44,7 @@
 LOGGER = logging.getLogger(__name__)


 class Hazard(HazardIO, HazardPlot):
    """
    Contains events of some hazard type defined at centroids. Loads from
    files with format defined in FILE_EXT.
@@ -90,8 +90,8 @@
                   'centroids',
                   'event_id',
                   'frequency',
-                  'intensity',
-                  'fraction'
+                  '_intensity',
+                  '_fraction'
                   }
     """Name of the variables needed to compute the impact. Types: scalar, str,
     list, 1dim np.array of size num_events, scipy.sparse matrix of shape
@@ -191,6 +191,32 @@
         if self.pool:
             LOGGER.info('Using %s CPUs.', self.pool.ncpus)
 
+    @property
+    def intensity(self) -> sparse.csr_matrix:
+        """Hazard intensity matrix, as stored by the setter"""
+        return self._intensity
+
+    @intensity.setter
+    def intensity(self, value: sparse.csr_matrix):
+        """Set intensity matrix in place: check format, sum duplicates, drop zeros"""
+        self._intensity = value
+        self._intensity.check_format()
+        self._intensity.sum_duplicates()
+        self._intensity.eliminate_zeros()  # last: summed duplicates may cancel to 0
+
+    @property
+    def fraction(self) -> sparse.csr_matrix:
+        """Hazard fraction matrix, as stored by the setter"""
+        return self._fraction
+
+    @fraction.setter
+    def fraction(self, value: sparse.csr_matrix):
+        """Set fraction matrix in place: check format, sum duplicates, drop zeros"""
+        self._fraction = value
+        self._fraction.check_format()
+        self._fraction.sum_duplicates()
+        self._fraction.eliminate_zeros()  # last: summed duplicates may cancel to 0
+
     @classmethod
     def get_default(cls, attribute):
         """Get the Hazard type default for a given attribute.

diff --git a/climada/hazard/io.py b/climada/hazard/io.py
@@ -1004,12 +1004,18 @@ def write_hdf5(self, file_name, todense=False):
         LOGGER.info('Writing %s', file_name)
         with h5py.File(file_name, 'w') as hf_data:
             str_dt = h5py.special_dtype(vlen=str)
-            for (var_name, var_val) in self.__dict__.items():
+            for var_name in self.__dict__:
                 if var_name == 'centroids':
                     # Centroids have their own write_hdf5 method,
                     # which is invoked at the end of this method (s.b.)
-                    pass
-                elif isinstance(var_val, sparse.csr_matrix):
+                    continue
+                # Prune private attributes
+                if var_name in self.vars_oblig:
+                    var_name = var_name.lstrip("_")
+
+                var_val = getattr(self, var_name)  # Also works with properties
+
+                if isinstance(var_val, sparse.csr_matrix):
                     if todense:
                         hf_data.create_dataset(var_name, data=var_val.toarray())
                     else:
@@ -1065,11 +1071,18 @@ def from_hdf5(cls, file_name):
         haz = cls()
         hazard_kwargs = dict()
         with h5py.File(file_name, 'r') as hf_data:
-            for (var_name, var_val) in haz.__dict__.items():
+            for var_name in haz.__dict__:
+                # Prune private attributes
+                if var_name in haz.vars_oblig:
+                    var_name = var_name.lstrip("_")
+
                 if var_name not in hf_data.keys():
                     continue
                 if var_name == 'centroids':
                     continue
+
+                var_val = getattr(haz, var_name)  # Also works with properties
+
                 if isinstance(var_val, np.ndarray) and var_val.ndim == 1:
                     hazard_kwargs[var_name] = np.array(hf_data.get(var_name))
                 elif isinstance(var_val, sparse.csr_matrix):

diff --git a/climada/hazard/test/test_base.py b/climada/hazard/test/test_base.py
@@ -124,18 +124,18 @@ def test_check_wrongFreq_fail(self):
     def test_check_wrongInten_fail(self):
         """Wrong hazard definition"""
         self.hazard.intensity = sparse.csr_matrix([[1, 2], [1, 2]])
-
-        with self.assertRaises(ValueError) as cm:
+        with self.assertRaisesRegex(
+            ValueError, "Invalid Hazard._intensity row size: 3 != 2."
+        ):
             self.hazard.check()
-        self.assertIn('Invalid Hazard.intensity row size: 3 != 2.', str(cm.exception))
 
     def test_check_wrongFrac_fail(self):
         """Wrong hazard definition"""
         self.hazard.fraction = sparse.csr_matrix([[1], [1], [1]])
-
-        with self.assertRaises(ValueError) as cm:
+        with self.assertRaisesRegex(
+            ValueError, "Invalid Hazard._fraction column size: 2 != 1."
+        ):
             self.hazard.check()
-        self.assertIn('Invalid Hazard.fraction column size: 2 != 1.', str(cm.exception))
 
     def test_check_wrongEvName_fail(self):
         """Wrong hazard definition"""
@@ -212,6 +212,36 @@ def test_get_date_strings_pass(self):
         self.assertEqual(haz.get_event_date()[560],
                          u_dt.date_to_str(haz.date[560]))
 
+    def test_matrix_consistency(self):
+        """Check that the csr_matrix is brought in canonical format"""
+        # Non-canonical: First three data points will be summed onto the first matrix
+        # entry, fourth will be an explicit zero entry
+        data = [0, 1, 2, 0]
+        indices = [0, 0, 0, 1]
+        indptr = [0, 4, 4, 4]
+        matrix = sparse.csr_matrix((data, indices, indptr), shape=(3, 2))
+        np.testing.assert_array_equal(matrix.data, data)
+        np.testing.assert_array_equal(matrix[0, [0, 1]].toarray(), [[3, 0]])
+        self.assertEqual(matrix.nnz, 4)
+        self.assertFalse(matrix.has_canonical_format)
+
+        def check_canonical_matrix(mat):
+            self.assertTrue(mat.has_canonical_format)
+            self.assertEqual(mat[0, 0], 3)
+            np.testing.assert_array_equal(mat.data, [3])
+            self.assertEqual(mat.nnz, 1)
+
+        # Check canonical format when initializing
+        hazard_new = Hazard("TC", intensity=matrix.copy(), fraction=matrix.copy())
+        matrix_attrs = ("intensity", "fraction")
+        for attr in matrix_attrs:
+            check_canonical_matrix(getattr(hazard_new, attr))
+
+        # Check conversion to canonical format when assigning
+        for attr in ("intensity", "fraction"):
+            setattr(self.hazard, attr, matrix.copy())
+            check_canonical_matrix(getattr(self.hazard, attr))
+
 class TestRemoveDupl(unittest.TestCase):
     """Test remove_duplicates method."""