
Commit

update normalize function
YQ-Wang committed Dec 3, 2023
1 parent 69ec97f commit f04126a
Showing 2 changed files with 58 additions and 55 deletions.
50 changes: 20 additions & 30 deletions bsp2/bsp2.py
@@ -13,40 +13,30 @@
     isspmatrix_csr)
 from scipy.spatial import KDTree  # type: ignore
 from scipy.stats import gmean, lognorm  # type: ignore
-from sklearn.preprocessing import minmax_scale  # type: ignore


-def _scale_sparse_minmax(input_exp_mat: csr_matrix) -> csr_matrix:
+def _scale_sparse_matrix(input_exp_mat: csr_matrix) -> csr_matrix:
     if input_exp_mat.shape[0] == 0 or input_exp_mat.shape[1] == 0:
         return input_exp_mat

-    input_exp_mat_cdx = input_exp_mat.data
-
-    if len(input_exp_mat_cdx) / input_exp_mat.shape[0] / input_exp_mat.shape[1] > 0.1:
-        input_exp_mat_den = input_exp_mat.todense()
-        input_exp_mat_den_array = np.asarray(input_exp_mat_den)
-        norm_exp = minmax_scale(input_exp_mat_den_array, axis=1)
-        created_sparse_mat = csr_matrix(norm_exp)
-    else:
-        input_exp_mat_row = input_exp_mat.getnnz(axis=0)
-        input_exp_mat_idx = np.r_[0, input_exp_mat_row[:-1].cumsum()]
-        input_exp_mat_max = np.maximum.reduceat(input_exp_mat_cdx, input_exp_mat_idx)
-        input_exp_mat_min = (
-            np.minimum.reduceat(input_exp_mat_cdx, input_exp_mat_idx) - 1
-        )
-        input_exp_mat_diff = input_exp_mat_max - input_exp_mat_min
-        input_exp_mat_diff[input_exp_mat_diff == 0] = 1  # Prevent division by zero
-        input_exp_mat_diffs = 1 / input_exp_mat_diff
-        input_exp_mat_diffs = np.repeat(input_exp_mat_diffs, input_exp_mat_row)
-        input_exp_mat_mins = np.repeat(input_exp_mat_min, input_exp_mat_row)
-        input_exp_mat_vals = (
-            input_exp_mat_cdx - input_exp_mat_mins
-        ) * input_exp_mat_diffs
-        rows, cols = input_exp_mat.nonzero()
-        created_sparse_mat = csr_matrix(
-            (input_exp_mat_vals, (rows, cols)), shape=input_exp_mat.shape
-        )
-    return created_sparse_mat
+    data = input_exp_mat.data
+    rows, cols = input_exp_mat.nonzero()
+
+    row_indices = np.diff(input_exp_mat.indptr)
+    row_idx = np.r_[0, np.cumsum(row_indices)]
+
+    row_max = np.array(
+        [
+            data[start:end].max() if end > start else 1
+            for start, end in zip(row_idx[:-1], row_idx[1:])
+        ]
+    )
+
+    # Scale the data based on the row max
+    data_scaled = data / np.repeat(row_max, row_indices)
+    scaled_matrix = csr_matrix((data_scaled, (rows, cols)), shape=input_exp_mat.shape)
+
+    return scaled_matrix


 def _binary_distance_matrix_threshold(
@@ -73,7 +63,7 @@ def _spvars(input_csr_mat: csr_matrix, axis: int) -> List[float]:
 def _test_scores(
     input_sp_mat: np.ndarray, input_exp_mat_raw: csr_matrix, d1: float, d2: float
 ) -> List[float]:
-    input_exp_mat_norm = _scale_sparse_minmax(input_exp_mat_raw).transpose()
+    input_exp_mat_norm = _scale_sparse_matrix(input_exp_mat_raw).transpose()
     input_exp_mat_raw = input_exp_mat_raw.transpose()
     inverted_diag_matrix_cache: Dict[Tuple, csr_matrix] = {}

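In effect, the commit swaps the normalization strategy: the old _scale_sparse_minmax min-max scaled each row (via sklearn.preprocessing.minmax_scale on a densified copy when more than 10% of entries were non-zero, and via reduceat arithmetic otherwise), while the new _scale_sparse_matrix simply divides each row's stored values by that row's maximum, using a divisor of 1 for rows with no stored entries so empty rows stay all-zero. A minimal NumPy sketch of the two behaviours on a single row, for illustration only (not code from the repository):

    import numpy as np

    row = np.array([1.0, 4.0, 2.0])

    # Old behaviour: min-max scale the row to [0, 1]
    old_scaled = (row - row.min()) / (row.max() - row.min())
    print(old_scaled)  # [0.         1.         0.33333333]

    # New behaviour: divide by the row maximum (in _scale_sparse_matrix this is
    # applied to the stored CSR values only, so zeros stay zero)
    new_scaled = row / row.max()
    print(new_scaled)  # [0.25 1.   0.5 ]
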
63 changes: 38 additions & 25 deletions test/test_bsp2.py
@@ -3,49 +3,62 @@
 import numpy as np
 import pandas as pd
 from scipy.sparse import csr_matrix
+from scipy.sparse import random as sparse_random

 from bsp2.bsp2 import (
     _binary_distance_matrix_threshold,
-    _scale_sparse_minmax,
+    _scale_sparse_matrix,
     _spvars,
     _test_scores,
     granp,
 )


 class TestScaleSparseMinmax(unittest.TestCase):
-    def test_dense_conversion_scaling(self):
-        # Sparse matrix with more than 10% non-zero entries
-        data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
-        rows = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4])
-        cols = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
-        matrix = csr_matrix((data, (rows, cols)), shape=(5, 2))
-
-        scaled_matrix = _scale_sparse_minmax(matrix)
-        scaled_matrix_dense = np.asarray(scaled_matrix.todense())
-
-        self.assertEqual(scaled_matrix.shape, matrix.shape)
-        # Ensure scaled_matrix_dense is ndarray and not np.matrix
-        self.assertIsInstance(scaled_matrix_dense, np.ndarray)
+    def test_scale_sparse_matrix(self):
+        # Creating a small sparse matrix with known values
+        rows, cols = 3, 3
+        data = [4, 2, 1, 4, 5]
+        row_indices = [0, 0, 1, 1, 2]
+        col_indices = [0, 2, 1, 2, 2]
+        test_matrix = csr_matrix((data, (row_indices, col_indices)), shape=(rows, cols))
+
+        # Expected scaled matrix
+        # For row 0: max is 4, so 4/4=1 and 2/4=0.5
+        # For row 1: max is 4, so 1/4=0.25 and 4/4=1
+        # For row 2: only one value which is 5, so it remains 1
+        expected_scaled_data = [1, 0.5, 0.25, 1, 1]
+
+        scaled_matrix = _scale_sparse_matrix(test_matrix)
+
+        scaled_data = scaled_matrix.data
+        self.assertTrue(
+            np.allclose(scaled_data, expected_scaled_data),
+            "Scaled data does not match expected values",
+        )

-    def test_sparse_scaling(self):
-        # Sparse matrix with less than 10% non-zero entries
-        data = np.array([1, 2, 3])
-        rows = np.array([0, 1, 2])
-        cols = np.array([0, 0, 0])
-        matrix = csr_matrix((data, (rows, cols)), shape=(5, 5))
+    def test_sparse_matrix_scaling(self):
+        # Create a sparse matrix with less than 10% non-zero entries
+        rows, cols = 10, 10
+        density = 0.1
+        sparse_matrix = sparse_random(
+            rows, cols, density=density, format="csr", dtype=float
+        )

-        scaled_matrix = _scale_sparse_minmax(matrix)
-        scaled_matrix_dense = np.asarray(scaled_matrix.todense())
+        scaled_matrix = _scale_sparse_matrix(sparse_matrix)

-        self.assertEqual(scaled_matrix.shape, matrix.shape)
-        self.assertIsInstance(scaled_matrix_dense, np.ndarray)
+        for row in range(scaled_matrix.shape[0]):
+            row_data = scaled_matrix[row, :].toarray().flatten()
+            if np.any(row_data != 0):
+                self.assertEqual(
+                    row_data.max(), 1, "Max value in a row should be 1 after scaling"
+                )

     def test_empty_matrix(self):
         # Empty matrix
         matrix = csr_matrix((0, 0))

-        scaled_matrix = _scale_sparse_minmax(matrix)
+        scaled_matrix = _scale_sparse_matrix(matrix)
         scaled_matrix_dense = np.asarray(scaled_matrix.todense())

         self.assertEqual(scaled_matrix.shape, matrix.shape)
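Assuming the layout shown above (a bsp2 package beside the test directory), the updated suite should run from the repository root with standard unittest discovery, for example python -m unittest discover -s test -v; the exact invocation may differ if the project normally uses another runner. Note that test_sparse_matrix_scaling only asserts on rows containing at least one stored value, since at 10% density sparse_random can produce entirely empty rows. The sketch below mirrors the known-values test as a standalone check; it is illustrative only and assumes bsp2 is importable in the current environment.

    from scipy.sparse import csr_matrix

    from bsp2.bsp2 import _scale_sparse_matrix

    # Same 3x3 matrix as in test_scale_sparse_matrix: rows hold (4, 2), (1, 4) and (5,)
    m = csr_matrix(([4, 2, 1, 4, 5], ([0, 0, 1, 1, 2], [0, 2, 1, 2, 2])), shape=(3, 3))

    # Stored values divided by their row maxima: 1, 0.5, 0.25, 1, 1
    print(_scale_sparse_matrix(m).data)
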
