
Commit

update normalize function
YQ-Wang committed Dec 3, 2023
1 parent 69ec97f commit f04126a
Showing 2 changed files with 58 additions and 55 deletions.
50 changes: 20 additions & 30 deletions bsp2/bsp2.py
@@ -13,40 +13,30 @@
     isspmatrix_csr)
 from scipy.spatial import KDTree  # type: ignore
 from scipy.stats import gmean, lognorm  # type: ignore
-from sklearn.preprocessing import minmax_scale  # type: ignore


-def _scale_sparse_minmax(input_exp_mat: csr_matrix) -> csr_matrix:
+def _scale_sparse_matrix(input_exp_mat: csr_matrix) -> csr_matrix:
     if input_exp_mat.shape[0] == 0 or input_exp_mat.shape[1] == 0:
         return input_exp_mat

-    input_exp_mat_cdx = input_exp_mat.data
-
-    if len(input_exp_mat_cdx) / input_exp_mat.shape[0] / input_exp_mat.shape[1] > 0.1:
-        input_exp_mat_den = input_exp_mat.todense()
-        input_exp_mat_den_array = np.asarray(input_exp_mat_den)
-        norm_exp = minmax_scale(input_exp_mat_den_array, axis=1)
-        created_sparse_mat = csr_matrix(norm_exp)
-    else:
-        input_exp_mat_row = input_exp_mat.getnnz(axis=0)
-        input_exp_mat_idx = np.r_[0, input_exp_mat_row[:-1].cumsum()]
-        input_exp_mat_max = np.maximum.reduceat(input_exp_mat_cdx, input_exp_mat_idx)
-        input_exp_mat_min = (
-            np.minimum.reduceat(input_exp_mat_cdx, input_exp_mat_idx) - 1
-        )
-        input_exp_mat_diff = input_exp_mat_max - input_exp_mat_min
-        input_exp_mat_diff[input_exp_mat_diff == 0] = 1  # Prevent division by zero
-        input_exp_mat_diffs = 1 / input_exp_mat_diff
-        input_exp_mat_diffs = np.repeat(input_exp_mat_diffs, input_exp_mat_row)
-        input_exp_mat_mins = np.repeat(input_exp_mat_min, input_exp_mat_row)
-        input_exp_mat_vals = (
-            input_exp_mat_cdx - input_exp_mat_mins
-        ) * input_exp_mat_diffs
-        rows, cols = input_exp_mat.nonzero()
-        created_sparse_mat = csr_matrix(
-            (input_exp_mat_vals, (rows, cols)), shape=input_exp_mat.shape
-        )
-    return created_sparse_mat
+    data = input_exp_mat.data
+    rows, cols = input_exp_mat.nonzero()
+
+    row_indices = np.diff(input_exp_mat.indptr)
+    row_idx = np.r_[0, np.cumsum(row_indices)]
+
+    row_max = np.array(
+        [
+            data[start:end].max() if end > start else 1
+            for start, end in zip(row_idx[:-1], row_idx[1:])
+        ]
+    )
+
+    # Scale the data based on the row max
+    data_scaled = data / np.repeat(row_max, row_indices)
+    scaled_matrix = csr_matrix((data_scaled, (rows, cols)), shape=input_exp_mat.shape)
+
+    return scaled_matrix


 def _binary_distance_matrix_threshold(
@@ -73,7 +63,7 @@ def _spvars(input_csr_mat: csr_matrix, axis: int) -> List[float]:
 def _test_scores(
     input_sp_mat: np.ndarray, input_exp_mat_raw: csr_matrix, d1: float, d2: float
 ) -> List[float]:
-    input_exp_mat_norm = _scale_sparse_minmax(input_exp_mat_raw).transpose()
+    input_exp_mat_norm = _scale_sparse_matrix(input_exp_mat_raw).transpose()
     input_exp_mat_raw = input_exp_mat_raw.transpose()
     inverted_diag_matrix_cache: Dict[Tuple, csr_matrix] = {}

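In effect, the commit swaps the normalization strategy: the old _scale_sparse_minmax min-max scaled each row (via sklearn.preprocessing.minmax_scale on a densified copy when more than 10% of entries were non-zero, and via reduceat arithmetic otherwise), while the new _scale_sparse_matrix simply divides each row's stored values by that row's maximum, using a divisor of 1 for rows with no stored entries so empty rows stay all-zero. A minimal NumPy sketch of the two behaviours on a single row, for illustration only (not code from the repository):

    import numpy as np

    row = np.array([1.0, 4.0, 2.0])

    # Old behaviour: min-max scale the row to [0, 1]
    old_scaled = (row - row.min()) / (row.max() - row.min())
    print(old_scaled)  # [0.         1.         0.33333333]

    # New behaviour: divide by the row maximum (in _scale_sparse_matrix this is
    # applied to the stored CSR values only, so zeros stay zero)
    new_scaled = row / row.max()
    print(new_scaled)  # [0.25 1.   0.5 ]
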
63 changes: 38 additions & 25 deletions test/test_bsp2.py
@@ -3,49 +3,62 @@
 import numpy as np
 import pandas as pd
 from scipy.sparse import csr_matrix
+from scipy.sparse import random as sparse_random

 from bsp2.bsp2 import (
     _binary_distance_matrix_threshold,
-    _scale_sparse_minmax,
+    _scale_sparse_matrix,
     _spvars,
     _test_scores,
     granp,
 )


 class TestScaleSparseMinmax(unittest.TestCase):
-    def test_dense_conversion_scaling(self):
-        # Sparse matrix with more than 10% non-zero entries
-        data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-        rows = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4])
-        cols = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
-        matrix = csr_matrix((data, (rows, cols)), shape=(5, 2))
-
-        scaled_matrix = _scale_sparse_minmax(matrix)
-        scaled_matrix_dense = np.asarray(scaled_matrix.todense())
-
-        self.assertEqual(scaled_matrix.shape, matrix.shape)
-        # Ensure scaled_matrix_dense is ndarray and not np.matrix
-        self.assertIsInstance(scaled_matrix_dense, np.ndarray)
+    def test_scale_sparse_matrix(self):
+        # Creating a small sparse matrix with known values
+        rows, cols = 3, 3
+        data = [4, 2, 1, 4, 5]
+        row_indices = [0, 0, 1, 1, 2]
+        col_indices = [0, 2, 1, 2, 2]
+        test_matrix = csr_matrix((data, (row_indices, col_indices)), shape=(rows, cols))
+
+        # Expected scaled matrix
+        # For row 0: max is 4, so 4/4=1 and 2/4=0.5
+        # For row 1: max is 4, so 1/4=0.25 and 4/4=1
+        # For row 2: only one value which is 5, so it remains 1
+        expected_scaled_data = [1, 0.5, 0.25, 1, 1]
+
+        scaled_matrix = _scale_sparse_matrix(test_matrix)
+
+        scaled_data = scaled_matrix.data
+        self.assertTrue(
+            np.allclose(scaled_data, expected_scaled_data),
+            "Scaled data does not match expected values",
+        )

-    def test_sparse_scaling(self):
-        # Sparse matrix with less than 10% non-zero entries
-        data = np.array([1, 2, 3])
-        rows = np.array([0, 1, 2])
-        cols = np.array([0, 0, 0])
-        matrix = csr_matrix((data, (rows, cols)), shape=(5, 5))
+    def test_sparse_matrix_scaling(self):
+        # Create a sparse matrix with less than 10% non-zero entries
+        rows, cols = 10, 10
+        density = 0.1
+        sparse_matrix = sparse_random(
+            rows, cols, density=density, format="csr", dtype=float
+        )

-        scaled_matrix = _scale_sparse_minmax(matrix)
-        scaled_matrix_dense = np.asarray(scaled_matrix.todense())
+        scaled_matrix = _scale_sparse_matrix(sparse_matrix)

-        self.assertEqual(scaled_matrix.shape, matrix.shape)
-        self.assertIsInstance(scaled_matrix_dense, np.ndarray)
+        for row in range(scaled_matrix.shape[0]):
+            row_data = scaled_matrix[row, :].toarray().flatten()
+            if np.any(row_data != 0):
+                self.assertEqual(
+                    row_data.max(), 1, "Max value in a row should be 1 after scaling"
+                )

     def test_empty_matrix(self):
         # Empty matrix
         matrix = csr_matrix((0, 0))

-        scaled_matrix = _scale_sparse_minmax(matrix)
+        scaled_matrix = _scale_sparse_matrix(matrix)
         scaled_matrix_dense = np.asarray(scaled_matrix.todense())

         self.assertEqual(scaled_matrix.shape, matrix.shape)
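Assuming the layout shown above (a bsp2 package beside the test directory), the updated suite should run from the repository root with standard unittest discovery, for example python -m unittest discover -s test -v; the exact invocation may differ if the project normally uses another runner. Note that test_sparse_matrix_scaling only asserts on rows containing at least one stored value, since at 10% density sparse_random can produce entirely empty rows. The sketch below mirrors the known-values test as a standalone check; it is illustrative only and assumes bsp2 is importable in the current environment.

    from scipy.sparse import csr_matrix

    from bsp2.bsp2 import _scale_sparse_matrix

    # Same 3x3 matrix as in test_scale_sparse_matrix: rows hold (4, 2), (1, 4) and (5,)
    m = csr_matrix(([4, 2, 1, 4, 5], ([0, 0, 1, 1, 2], [0, 2, 1, 2, 2])), shape=(3, 3))

    # Stored values divided by their row maxima: 1, 0.5, 0.25, 1, 1
    print(_scale_sparse_matrix(m).data)
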
