fix casting to float and test (#837)

theislab · Dec 6, 2024 · 27852fa · 27852fa
1 parent 2e8240f
commit 27852fa
Show file tree

Hide file tree

Showing 4 changed files with 120 additions and 18 deletions.
diff --git a/ehrapy/anndata/anndata_ext.py b/ehrapy/anndata/anndata_ext.py
@@ -388,6 +388,9 @@ def set_numeric_vars(
 
     vars_idx = get_column_indices(adata, vars)
 
+    # if e.g. adata.X is of dtype int64 and values is of dtype float64, the floats would be cast to int on assignment, so cast X to the dtype of values first
+    adata.X = adata.X.astype(values.dtype)
+
     adata.X[:, vars_idx] = values
 
     return adata

diff --git a/tests/data/dataset1.csv b/tests/data/dataset1.csv
@@ -1,13 +1,13 @@
-idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station
-1,138,78,80,77,A,ICU
-2,139,79,90,76,A,ICU
-3,140,80,120,60,A,MICU
-4,141,81,130,90,A,MICU
-5,148,77,80,110,B,ICU
-6,149,78,135,78,B,ICU
-7,150,79,125,56,B,MICU
-8,151,80,95,76,B,MICU
-9,158,55,70,67,C,ICU
-10,159,56,85,82,C,ICU
-11,160,57,125,59,C,MICU
-12,161,58,125,81,C,MICU
+idx,sys_bp_entry,dia_bp_entry,glucose,weight,in_days,disease,station
+1,138,78,80,77,1,A,ICU
+2,139,79,90,76,2,A,ICU
+3,140,80,120,60,0,A,MICU
+4,141,81,130,90,1,A,MICU
+5,148,77,80,110,0,B,ICU
+6,149,78,135,78,1,B,ICU
+7,150,79,125,56,2,B,MICU
+8,151,80,95,76,3,B,MICU
+9,158,55,70,67,4,C,ICU
+10,159,56,85,82,1,C,ICU
+11,160,57,125,59,2,C,MICU
+12,161,58,125,81,1,C,MICU
diff --git a/tests/preprocessing/test_normalization.py b/tests/preprocessing/test_normalization.py
@@ -23,6 +23,19 @@ def adata_mini():
     )[:8]
 
 
+@pytest.fixture
+def adata_mini_integers_in_X():
+    adata = read_csv(
+        f"{TEST_DATA_PATH}/dataset1.csv",
+        columns_obs_only=["idx", "sys_bp_entry", "dia_bp_entry", "glucose", "weight", "disease", "station"],
+    )
+    # cast the data in X to integers; the CSV reader generously produces floats, but we want to test normalization of integer data
+    adata.X = adata.X.astype(np.int32)
+    ep.ad.infer_feature_types(adata)
+    ep.ad.replace_feature_types(adata, ["in_days"], "numeric")
+    return adata
+
+
 @pytest.fixture
 def adata_to_norm():
     obs_data = {"ID": ["Patient1", "Patient2", "Patient3"], "Age": [31, 94, 62]}
@@ -94,6 +107,27 @@ def test_norm_scale(array_type, adata_to_norm):
     assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True)
 
 
+def test_norm_scale_integers(adata_mini_integers_in_X):
+    adata_norm = ep.pp.scale_norm(adata_mini_integers_in_X, copy=True)
+    in_days_norm = np.array(
+        [
+            [-0.4472136],
+            [0.4472136],
+            [-1.34164079],
+            [-0.4472136],
+            [-1.34164079],
+            [-0.4472136],
+            [0.4472136],
+            [1.34164079],
+            [2.23606798],
+            [-0.4472136],
+            [0.4472136],
+            [-0.4472136],
+        ]
+    )
+    assert np.allclose(adata_norm.X, in_days_norm)
+
+
 @pytest.mark.parametrize("array_type", ARRAY_TYPES)
 def test_norm_scale_kwargs(array_type, adata_to_norm):
     adata_to_norm_casted = adata_to_norm.copy()
@@ -159,6 +193,12 @@ def test_norm_minmax(array_type, adata_to_norm):
     assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True)
 
 
+def test_norm_minmax_integers(adata_mini_integers_in_X):
+    adata_norm = ep.pp.minmax_norm(adata_mini_integers_in_X, copy=True)
+    in_days_norm = np.array([[0.25], [0.5], [0.0], [0.25], [0.0], [0.25], [0.5], [0.75], [1.0], [0.25], [0.5], [0.25]])
+    assert np.allclose(adata_norm.X, in_days_norm)
+
+
 @pytest.mark.parametrize("array_type", ARRAY_TYPES)
 def test_norm_minmax_kwargs(array_type, adata_to_norm):
     adata_to_norm_casted = adata_to_norm.copy()
@@ -218,6 +258,12 @@ def test_norm_maxabs(array_type, adata_to_norm):
         assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True)
 
 
+def test_norm_maxabs_integers(adata_mini_integers_in_X):
+    adata_norm = ep.pp.maxabs_norm(adata_mini_integers_in_X, copy=True)
+    in_days_norm = np.array([[0.25], [0.5], [0.0], [0.25], [0.0], [0.25], [0.5], [0.75], [1.0], [0.25], [0.5], [0.25]])
+    assert np.allclose(adata_norm.X, in_days_norm)
+
+
 @pytest.mark.parametrize("array_type", ARRAY_TYPES)
 def test_norm_maxabs_group(array_type, adata_mini):
     adata_mini_casted = adata_mini.copy()
@@ -273,6 +319,12 @@ def test_norm_robust_scale(array_type, adata_to_norm):
     assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True)
 
 
+def test_norm_robust_scale_integers(adata_mini_integers_in_X):
+    adata_norm = ep.pp.robust_scale_norm(adata_mini_integers_in_X, copy=True)
+    in_days_norm = np.array([[0.0], [1.0], [-1.0], [0.0], [-1.0], [0.0], [1.0], [2.0], [3.0], [0.0], [1.0], [0.0]])
+    assert np.allclose(adata_norm.X, in_days_norm)
+
+
 @pytest.mark.parametrize("array_type", ARRAY_TYPES)
 def test_norm_robust_scale_kwargs(array_type, adata_to_norm):
     adata_to_norm_casted = adata_to_norm.copy()
@@ -331,6 +383,27 @@ def test_norm_quantile_uniform(array_type, adata_to_norm):
     assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True)
 
 
+def test_norm_quantile_integers(adata_mini_integers_in_X):
+    adata_norm = ep.pp.quantile_norm(adata_mini_integers_in_X, copy=True)
+    in_days_norm = np.array(
+        [
+            [0.36363636],
+            [0.72727273],
+            [0.0],
+            [0.36363636],
+            [0.0],
+            [0.36363636],
+            [0.72727273],
+            [0.90909091],
+            [1.0],
+            [0.36363636],
+            [0.72727273],
+            [0.36363636],
+        ]
+    )
+    assert np.allclose(adata_norm.X, in_days_norm)
+
+
 @pytest.mark.parametrize("array_type", ARRAY_TYPES)
 def test_norm_quantile_uniform_kwargs(array_type, adata_to_norm):
     adata_to_norm_casted = adata_to_norm.copy()
@@ -392,6 +465,27 @@ def test_norm_power(array_type, adata_to_norm):
         assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True)
 
 
+def test_norm_power_integers(adata_mini_integers_in_X):
+    adata_norm = ep.pp.power_norm(adata_mini_integers_in_X, copy=True)
+    in_days_norm = np.array(
+        [
+            [-0.31234142],
+            [0.58319338],
+            [-1.65324303],
+            [-0.31234142],
+            [-1.65324303],
+            [-0.31234142],
+            [0.58319338],
+            [1.27419965],
+            [1.8444134],
+            [-0.31234142],
+            [0.58319338],
+            [-0.31234142],
+        ]
+    )
+    assert np.allclose(adata_norm.X, in_days_norm)
+
+
 @pytest.mark.parametrize("array_type", ARRAY_TYPES)
 def test_norm_power_kwargs(array_type, adata_to_norm):
     adata_to_norm_casted = adata_to_norm.copy()

diff --git a/tests/tools/feature_ranking/test_rank_features_groups.py b/tests/tools/feature_ranking/test_rank_features_groups.py
@@ -323,15 +323,20 @@ def test_rank_features_groups_generates_outputs(field_to_rank):
     assert "log2foldchanges" not in adata.uns["rank_features_groups"]
     assert "pts" not in adata.uns["rank_features_groups"]
 
-    if field_to_rank == "layer" or field_to_rank == "obs":
+    if field_to_rank == "layer":
+        assert len(adata.uns["rank_features_groups"]["names"]) == 4
+        assert len(adata.uns["rank_features_groups"]["pvals"]) == 4
+        assert len(adata.uns["rank_features_groups"]["scores"]) == 4
+
+    elif field_to_rank == "obs":
         assert len(adata.uns["rank_features_groups"]["names"]) == 3  # It only captures the length of each group
         assert len(adata.uns["rank_features_groups"]["pvals"]) == 3
         assert len(adata.uns["rank_features_groups"]["scores"]) == 3
 
     elif field_to_rank == "layer_and_obs":
-        assert len(adata.uns["rank_features_groups"]["names"]) == 6  # It only captures the length of each group
-        assert len(adata.uns["rank_features_groups"]["pvals"]) == 6
-        assert len(adata.uns["rank_features_groups"]["scores"]) == 6
+        assert len(adata.uns["rank_features_groups"]["names"]) == 7  # It only captures the length of each group
+        assert len(adata.uns["rank_features_groups"]["pvals"]) == 7
+        assert len(adata.uns["rank_features_groups"]["scores"]) == 7
 
 
 def test_rank_features_groups_consistent_results():
@@ -396,7 +401,7 @@ def test_rank_features_group_column_to_rank():
     adata_copy = adata.copy()
 
     ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank="all")
-    assert len(adata.uns["rank_features_groups"]["names"]) == 2
+    assert len(adata.uns["rank_features_groups"]["names"]) == 3
 
     # want to check a "complete selection" works
     adata = adata_copy.copy()