From afd898653479a9fd2b3ae2f41bbd76682eb520f5 Mon Sep 17 00:00:00 2001 From: eroell Date: Fri, 6 Dec 2024 12:30:31 +0100 Subject: [PATCH] fix casting to float and test --- ehrapy/anndata/anndata_ext.py | 3 + tests/data/dataset1.csv | 26 ++--- tests/preprocessing/test_normalization.py | 94 +++++++++++++++++++ .../test_rank_features_groups.py | 15 ++- 4 files changed, 120 insertions(+), 18 deletions(-) diff --git a/ehrapy/anndata/anndata_ext.py b/ehrapy/anndata/anndata_ext.py index 38d60224..85e53f6c 100644 --- a/ehrapy/anndata/anndata_ext.py +++ b/ehrapy/anndata/anndata_ext.py @@ -388,6 +388,9 @@ def set_numeric_vars( vars_idx = get_column_indices(adata, vars) + # if e.g. adata.X is of type int64, and values of dtype float64, the floats will be casted to int + adata.X = adata.X.astype(values.dtype) + adata.X[:, vars_idx] = values return adata diff --git a/tests/data/dataset1.csv b/tests/data/dataset1.csv index 1569f780..194641f4 100644 --- a/tests/data/dataset1.csv +++ b/tests/data/dataset1.csv @@ -1,13 +1,13 @@ -idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station -1,138,78,80,77,A,ICU -2,139,79,90,76,A,ICU -3,140,80,120,60,A,MICU -4,141,81,130,90,A,MICU -5,148,77,80,110,B,ICU -6,149,78,135,78,B,ICU -7,150,79,125,56,B,MICU -8,151,80,95,76,B,MICU -9,158,55,70,67,C,ICU -10,159,56,85,82,C,ICU -11,160,57,125,59,C,MICU -12,161,58,125,81,C,MICU +idx,sys_bp_entry,dia_bp_entry,glucose,weight,in_days,disease,station +1,138,78,80,77,1,A,ICU +2,139,79,90,76,2,A,ICU +3,140,80,120,60,0,A,MICU +4,141,81,130,90,1,A,MICU +5,148,77,80,110,0,B,ICU +6,149,78,135,78,1,B,ICU +7,150,79,125,56,2,B,MICU +8,151,80,95,76,3,B,MICU +9,158,55,70,67,4,C,ICU +10,159,56,85,82,1,C,ICU +11,160,57,125,59,2,C,MICU +12,161,58,125,81,1,C,MICU diff --git a/tests/preprocessing/test_normalization.py b/tests/preprocessing/test_normalization.py index 75df11ae..5067b237 100644 --- a/tests/preprocessing/test_normalization.py +++ b/tests/preprocessing/test_normalization.py @@ -23,6 +23,19 @@ def adata_mini(): )[:8] +@pytest.fixture +def adata_mini_integers_in_X(): + adata = read_csv( + f"{TEST_DATA_PATH}/dataset1.csv", + columns_obs_only=["idx", "sys_bp_entry", "dia_bp_entry", "glucose", "weight", "disease", "station"], + ) + # cast data in X to integers; pd.read generates floats generously, but want to test integer normalization + adata.X = adata.X.astype(np.int32) + ep.ad.infer_feature_types(adata) + ep.ad.replace_feature_types(adata, ["in_days"], "numeric") + return adata + + @pytest.fixture def adata_to_norm(): obs_data = {"ID": ["Patient1", "Patient2", "Patient3"], "Age": [31, 94, 62]} @@ -94,6 +107,27 @@ def test_norm_scale(array_type, adata_to_norm): assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True) +def test_norm_scale_integers(adata_mini_integers_in_X): + adata_norm = ep.pp.scale_norm(adata_mini_integers_in_X, copy=True) + in_days_norm = np.array( + [ + [-0.4472136], + [0.4472136], + [-1.34164079], + [-0.4472136], + [-1.34164079], + [-0.4472136], + [0.4472136], + [1.34164079], + [2.23606798], + [-0.4472136], + [0.4472136], + [-0.4472136], + ] + ) + assert np.allclose(adata_norm.X, in_days_norm) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_norm_scale_kwargs(array_type, adata_to_norm): adata_to_norm_casted = adata_to_norm.copy() @@ -159,6 +193,12 @@ def test_norm_minmax(array_type, adata_to_norm): assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True) +def test_norm_minmax_integers(adata_mini_integers_in_X): + adata_norm = ep.pp.minmax_norm(adata_mini_integers_in_X, copy=True) + in_days_norm = np.array([[0.25], [0.5], [0.0], [0.25], [0.0], [0.25], [0.5], [0.75], [1.0], [0.25], [0.5], [0.25]]) + assert np.allclose(adata_norm.X, in_days_norm) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_norm_minmax_kwargs(array_type, adata_to_norm): adata_to_norm_casted = adata_to_norm.copy() @@ -218,6 +258,12 @@ def test_norm_maxabs(array_type, adata_to_norm): assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True) +def test_norm_maxabs_integers(adata_mini_integers_in_X): + adata_norm = ep.pp.maxabs_norm(adata_mini_integers_in_X, copy=True) + in_days_norm = np.array([[0.25], [0.5], [0.0], [0.25], [0.0], [0.25], [0.5], [0.75], [1.0], [0.25], [0.5], [0.25]]) + assert np.allclose(adata_norm.X, in_days_norm) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_norm_maxabs_group(array_type, adata_mini): adata_mini_casted = adata_mini.copy() @@ -273,6 +319,12 @@ def test_norm_robust_scale(array_type, adata_to_norm): assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True) +def test_norm_robust_scale_integers(adata_mini_integers_in_X): + adata_norm = ep.pp.robust_scale_norm(adata_mini_integers_in_X, copy=True) + in_days_norm = np.array([[0.0], [1.0], [-1.0], [0.0], [-1.0], [0.0], [1.0], [2.0], [3.0], [0.0], [1.0], [0.0]]) + assert np.allclose(adata_norm.X, in_days_norm) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_norm_robust_scale_kwargs(array_type, adata_to_norm): adata_to_norm_casted = adata_to_norm.copy() @@ -331,6 +383,27 @@ def test_norm_quantile_uniform(array_type, adata_to_norm): assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True) +def test_norm_quantile_integers(adata_mini_integers_in_X): + adata_norm = ep.pp.quantile_norm(adata_mini_integers_in_X, copy=True) + in_days_norm = np.array( + [ + [0.36363636], + [0.72727273], + [0.0], + [0.36363636], + [0.0], + [0.36363636], + [0.72727273], + [0.90909091], + [1.0], + [0.36363636], + [0.72727273], + [0.36363636], + ] + ) + assert np.allclose(adata_norm.X, in_days_norm) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_norm_quantile_uniform_kwargs(array_type, adata_to_norm): adata_to_norm_casted = adata_to_norm.copy() @@ -392,6 +465,27 @@ def test_norm_power(array_type, adata_to_norm): assert np.allclose(adata_norm.X[:, 5], adata_to_norm_casted.X[:, 5], equal_nan=True) +def test_norm_power_integers(adata_mini_integers_in_X): + adata_norm = ep.pp.power_norm(adata_mini_integers_in_X, copy=True) + in_days_norm = np.array( + [ + [-0.31234142], + [0.58319338], + [-1.65324303], + [-0.31234142], + [-1.65324303], + [-0.31234142], + [0.58319338], + [1.27419965], + [1.8444134], + [-0.31234142], + [0.58319338], + [-0.31234142], + ] + ) + assert np.allclose(adata_norm.X, in_days_norm) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_norm_power_kwargs(array_type, adata_to_norm): adata_to_norm_casted = adata_to_norm.copy() diff --git a/tests/tools/feature_ranking/test_rank_features_groups.py b/tests/tools/feature_ranking/test_rank_features_groups.py index a6b27e74..4a367a3d 100644 --- a/tests/tools/feature_ranking/test_rank_features_groups.py +++ b/tests/tools/feature_ranking/test_rank_features_groups.py @@ -323,15 +323,20 @@ def test_rank_features_groups_generates_outputs(field_to_rank): assert "log2foldchanges" not in adata.uns["rank_features_groups"] assert "pts" not in adata.uns["rank_features_groups"] - if field_to_rank == "layer" or field_to_rank == "obs": + if field_to_rank == "layer": + assert len(adata.uns["rank_features_groups"]["names"]) == 4 + assert len(adata.uns["rank_features_groups"]["pvals"]) == 4 + assert len(adata.uns["rank_features_groups"]["scores"]) == 4 + + elif field_to_rank == "obs": assert len(adata.uns["rank_features_groups"]["names"]) == 3 # It only captures the length of each group assert len(adata.uns["rank_features_groups"]["pvals"]) == 3 assert len(adata.uns["rank_features_groups"]["scores"]) == 3 elif field_to_rank == "layer_and_obs": - assert len(adata.uns["rank_features_groups"]["names"]) == 6 # It only captures the length of each group - assert len(adata.uns["rank_features_groups"]["pvals"]) == 6 - assert len(adata.uns["rank_features_groups"]["scores"]) == 6 + assert len(adata.uns["rank_features_groups"]["names"]) == 7 # It only captures the length of each group + assert len(adata.uns["rank_features_groups"]["pvals"]) == 7 + assert len(adata.uns["rank_features_groups"]["scores"]) == 7 def test_rank_features_groups_consistent_results(): @@ -396,7 +401,7 @@ def test_rank_features_group_column_to_rank(): adata_copy = adata.copy() ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank="all") - assert len(adata.uns["rank_features_groups"]["names"]) == 2 + assert len(adata.uns["rank_features_groups"]["names"]) == 3 # want to check a "complete selection" works adata = adata_copy.copy()