matrix normalization utility (#58)

* implement matrix normalization utility * test matrix norm
OmicsML · Nov 20, 2022 · 8113a2c · 8113a2c
1 parent 9bce772
commit 8113a2c
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 1 deletion.
diff --git a/dance/utils/matrix.py b/dance/utils/matrix.py
@@ -0,0 +1,68 @@
+import numpy as np
+import torch
+
+from dance.typing import Literal
+
NormMode = Literal["normalize", "standardize", "minmax", "l2"]


def normalize(mat, *, mode: NormMode = "normalize", axis: int = 0, eps: float = -1.0):
    """Normalize a matrix.

    Parameters
    ----------
    mat
        Input matrix to be normalized, can be torch tensor or numpy array.
    mode
        Normalization mode. **normalize** means divide the values by the sum. **standardize** means center then rescale
        by the (population) standard deviation. **minmax** means rescale the values along the axis of choice between
        zero and one. **l2** means divide the values by the Euclidean (L2) norm.
    axis
        Axis along which the normalization will take place.
    eps
        Denominator correction factor to prevent divide by zero error. If set to -1, then replace the zero entries
        of the denominator with ones. Otherwise it must be positive and is added to the denominator.

    Returns
    -------
    The normalized matrix, of the same kind (torch tensor or numpy array) as the input. The input is not modified.

    Raises
    ------
    TypeError
        If ``mat`` is neither a torch tensor nor a numpy array.
    ValueError
        If ``mode`` is not one of the supported modes, or if ``eps`` is neither positive nor -1.

    """
    # Function-scope import so that the module level import block does not need to change.
    from typing import get_args

    if isinstance(mat, torch.Tensor):
        is_torch = True
    elif isinstance(mat, np.ndarray):
        is_torch = False
    else:
        raise TypeError(f"Invalid type for input matrix: {type(mat)}")

    # Fail loudly on a misspelled mode instead of silently returning the input unnormalized.
    valid_modes = get_args(NormMode)
    if mode not in valid_modes:
        raise ValueError(f"Unknown mode {mode!r}, available options are {valid_modes}")
    if eps != -1 and not eps > 0:
        raise ValueError(f"Invalid {eps=!r}. Must be positive or -1, the later set zero entries to one.")

    # keepdims=True keeps the reduced axis so that the statistics broadcast against ``mat``.
    opts = {"axis": axis, "keepdims": True}

    # Compute the shift and the rescaling factor for the selected mode
    if mode == "normalize":
        shift = 0
        denom = mat.sum(**opts)
    elif mode == "standardize":
        shift = -mat.mean(**opts)
        # torch defaults to the unbiased estimator; request the population std to match numpy's default.
        denom = mat.std(**opts, unbiased=False) if is_torch else mat.std(**opts)
    elif mode == "minmax":
        # torch min/max along an axis return (values, indices); only the values are needed.
        min_vals = mat.min(**opts)[0] if is_torch else mat.min(**opts)
        max_vals = mat.max(**opts)[0] if is_torch else mat.max(**opts)
        shift = -min_vals
        denom = max_vals - min_vals
    else:  # mode == "l2"
        shift = 0
        denom = (mat**2).sum(**opts)**0.5

    # Correct denominator to prevent divide by zero error
    if eps == -1:
        denom[denom == 0] = 1
    else:
        # Out-of-place add: an in-place ``+=`` cannot store a float into an integer denominator.
        denom = denom + eps

    return (mat + shift) / denom
diff --git a/setup.cfg b/setup.cfg
@@ -63,6 +63,7 @@ packages = find:
 [options.extras_require]
 dev =
  pre-commit==2.20.0
- pytest==7.2.0
+ pytest-subtests==0.9.0
  pytest-xdist==3.0.2
+ pytest==7.2.0
  tox==3.27.1
diff --git a/tests/utils/test_matrix.py b/tests/utils/test_matrix.py
@@ -0,0 +1,26 @@
+import numpy as np
+
+from dance.utils import matrix
+
+
def test_normalize(subtests):
    """Check every normalization mode of ``matrix.normalize`` along both axes of a small 2x2 matrix."""
    data = np.array([[1, 1], [4, 4]])

    # Reference values for the l2 mode, computed directly with numpy.
    l2_axis0 = (data / np.sqrt((data**2).sum(0))).tolist()
    l2_axis1 = (data / np.sqrt((data**2).sum(1, keepdims=True))).tolist()

    # mode -> (expected along axis 0, expected along axis 1)
    expected = {
        "normalize": ([[0.2, 0.2], [0.8, 0.8]], [[0.5, 0.5], [0.5, 0.5]]),
        "standardize": ([[-1, -1], [1, 1]], [[0, 0], [0, 0]]),
        "minmax": ([[0, 0], [1, 1]], [[0, 0], [0, 0]]),
        "l2": (l2_axis0, l2_axis1),
    }

    for mode, per_axis in expected.items():
        with subtests.test(mode):
            for axis, want in enumerate(per_axis):
                assert matrix.normalize(data, mode=mode, axis=axis).tolist() == want