From 441a4b194a6bab821949c181dcafe02490b30610 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:05:37 -0400 Subject: [PATCH 1/7] move feature select functions to subfolder --- pycytominer/{ => operations}/correlation_threshold.py | 0 pycytominer/{ => operations}/get_na_columns.py | 0 pycytominer/{ => operations}/variance_threshold.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename pycytominer/{ => operations}/correlation_threshold.py (100%) rename pycytominer/{ => operations}/get_na_columns.py (100%) rename pycytominer/{ => operations}/variance_threshold.py (100%) diff --git a/pycytominer/correlation_threshold.py b/pycytominer/operations/correlation_threshold.py similarity index 100% rename from pycytominer/correlation_threshold.py rename to pycytominer/operations/correlation_threshold.py diff --git a/pycytominer/get_na_columns.py b/pycytominer/operations/get_na_columns.py similarity index 100% rename from pycytominer/get_na_columns.py rename to pycytominer/operations/get_na_columns.py diff --git a/pycytominer/variance_threshold.py b/pycytominer/operations/variance_threshold.py similarity index 100% rename from pycytominer/variance_threshold.py rename to pycytominer/operations/variance_threshold.py From 00cc3f6bbc124b4075512229289ea578c3d7fa1f Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:06:12 -0400 Subject: [PATCH 2/7] closes #71 --- pycytominer/feature_select.py | 12 +++++++----- pycytominer/operations/__init__.py | 3 +++ pycytominer/operations/correlation_threshold.py | 8 ++++---- pycytominer/operations/get_na_columns.py | 6 +++--- pycytominer/operations/variance_threshold.py | 8 ++++---- 5 files changed, 21 insertions(+), 16 deletions(-) create mode 100644 pycytominer/operations/__init__.py diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py index e59d9fe7..a90b000a 100644 --- a/pycytominer/feature_select.py +++ b/pycytominer/feature_select.py @@ -5,9 +5,11 @@ import os import pandas as pd -from pycytominer.correlation_threshold import correlation_threshold -from pycytominer.variance_threshold import variance_threshold -from pycytominer.get_na_columns import get_na_columns +from pycytominer.operations import ( + correlation_threshold, + variance_threshold, + get_na_columns, +) from pycytominer.cyto_utils import ( load_profiles, output, @@ -20,7 +22,7 @@ def feature_select( profiles, features="infer", - samples="none", + samples="all", operation="variance_threshold", output_file="none", na_cutoff=0.05, @@ -42,7 +44,7 @@ def feature_select( if "infer", then assume cell painting features are those that start with "Cells", "Nuclei", or "Cytoplasm" samples - if provided, a list of samples to provide operation on - [default: "none"] - if "none", use all samples to calculate + [default: "all"] - if "all", use all samples to calculate operation - str or list of given operations to perform on input profiles output_file - [default: "none"] if provided, will write annotated profiles to file if not specified, will return the annotated profiles. We recommend diff --git a/pycytominer/operations/__init__.py b/pycytominer/operations/__init__.py new file mode 100644 index 00000000..b80b79d9 --- /dev/null +++ b/pycytominer/operations/__init__.py @@ -0,0 +1,3 @@ +from .correlation_threshold import correlation_threshold +from .variance_threshold import variance_threshold +from .get_na_columns import get_na_columns diff --git a/pycytominer/operations/correlation_threshold.py b/pycytominer/operations/correlation_threshold.py index 1ab70203..0f2f436f 100644 --- a/pycytominer/operations/correlation_threshold.py +++ b/pycytominer/operations/correlation_threshold.py @@ -5,15 +5,15 @@ import numpy as np import pandas as pd -from pycytominer.cyto_utils.features import infer_cp_features -from pycytominer.cyto_utils.util import ( +from pycytominer.cyto_utils import ( + infer_cp_features, get_pairwise_correlation, check_correlation_method, ) def correlation_threshold( - population_df, features="infer", samples="none", threshold=0.9, method="pearson" + population_df, features="infer", samples="all", threshold=0.9, method="pearson" ): """ Exclude features that have correlations above a certain threshold @@ -38,7 +38,7 @@ def correlation_threshold( assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)" # Subset dataframe and calculate correlation matrix across subset features - if samples != "none": + if samples != "all": population_df = population_df.loc[samples, :] if features == "infer": diff --git a/pycytominer/operations/get_na_columns.py b/pycytominer/operations/get_na_columns.py index 17655b8b..10d7192a 100644 --- a/pycytominer/operations/get_na_columns.py +++ b/pycytominer/operations/get_na_columns.py @@ -7,7 +7,7 @@ from pycytominer.cyto_utils.features import infer_cp_features -def get_na_columns(population_df, features="infer", samples="none", cutoff=0.05): +def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05): """ Get features that have more NA values than cutoff defined @@ -17,14 +17,14 @@ def get_na_columns(population_df, features="infer", samples="none", cutoff=0.05) if "infer", then assume cell painting features are those that do not start with "Cells", "Nuclei", or "Cytoplasm" samples - if provided, a list of samples to provide operation on - [default: "none"] - if "none", use all samples to calculate + [default: "all"] - if "all", use all samples to calculate cutoff - float to exclude features that have a higher proportion of missingness Output: A list of the features to exclude """ - if samples != "none": + if samples != "all": population_df = population_df.loc[samples, :] if features == "infer": diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index 6ae6178d..df2bedbc 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -5,11 +5,11 @@ import numpy as np import pandas as pd -from pycytominer.cyto_utils.features import infer_cp_features +from pycytominer.cyto_utils import infer_cp_features def variance_threshold( - population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01 + population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01 ): """ Exclude features that have low variance (low information content) @@ -20,7 +20,7 @@ def variance_threshold( if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_" samples - list samples to perform operation on - [default: "none"] - if "none", use all samples to calculate + [default: "all"] - if "all", use all samples to calculate freq_cut - float of ratio (second most common feature value / most common) [default: 0.1] unique_cut - float of ratio (num unique features / num samples) [default: 0.1] @@ -32,7 +32,7 @@ def variance_threshold( assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)" # Subset dataframe - if samples != "none": + if samples != "all": population_df = population_df.loc[samples, :] if features == "infer": From 85950b7b54b233c3a0c8711fb10c9cf7613e7701 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:10:16 -0400 Subject: [PATCH 3/7] move feature select operation tests to test_operations folder --- pycytominer/tests/test_operations/__init__.py | 0 .../tests/{ => test_operations}/test_correlation_threshold.py | 0 pycytominer/tests/{ => test_operations}/test_get_na_columns.py | 0 .../tests/{ => test_operations}/test_variance_threshold.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 pycytominer/tests/test_operations/__init__.py rename pycytominer/tests/{ => test_operations}/test_correlation_threshold.py (100%) rename pycytominer/tests/{ => test_operations}/test_get_na_columns.py (100%) rename pycytominer/tests/{ => test_operations}/test_variance_threshold.py (100%) diff --git a/pycytominer/tests/test_operations/__init__.py b/pycytominer/tests/test_operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pycytominer/tests/test_correlation_threshold.py b/pycytominer/tests/test_operations/test_correlation_threshold.py similarity index 100% rename from pycytominer/tests/test_correlation_threshold.py rename to pycytominer/tests/test_operations/test_correlation_threshold.py diff --git a/pycytominer/tests/test_get_na_columns.py b/pycytominer/tests/test_operations/test_get_na_columns.py similarity index 100% rename from pycytominer/tests/test_get_na_columns.py rename to pycytominer/tests/test_operations/test_get_na_columns.py diff --git a/pycytominer/tests/test_variance_threshold.py b/pycytominer/tests/test_operations/test_variance_threshold.py similarity index 100% rename from pycytominer/tests/test_variance_threshold.py rename to pycytominer/tests/test_operations/test_variance_threshold.py From ac2a16cb9ce5582c3753b24db6e98ab979d08f12 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:10:38 -0400 Subject: [PATCH 4/7] fix import and address #71 --- .../test_operations/test_correlation_threshold.py | 14 +++++++------- .../tests/test_operations/test_get_na_columns.py | 4 ++-- .../test_operations/test_variance_threshold.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pycytominer/tests/test_operations/test_correlation_threshold.py b/pycytominer/tests/test_operations/test_correlation_threshold.py index 08410cd4..cf39a14a 100644 --- a/pycytominer/tests/test_operations/test_correlation_threshold.py +++ b/pycytominer/tests/test_operations/test_correlation_threshold.py @@ -1,6 +1,6 @@ -import pandas as pd +samples="all"import pandas as pd import pytest -from pycytominer.correlation_threshold import correlation_threshold +from pycytominer.operations import correlation_threshold # Build data to use in tests data_df = pd.DataFrame( @@ -27,7 +27,7 @@ def test_correlation_threshold(): correlation_threshold_result = correlation_threshold( population_df=data_df, features=data_df.columns.tolist(), - samples="none", + samples="all", threshold=0.9, method="pearson", ) @@ -39,7 +39,7 @@ def test_correlation_threshold(): correlation_threshold_result = correlation_threshold( population_df=data_df, features=data_df.columns.tolist(), - samples="none", + samples="all", threshold=0.2, method="pearson", ) @@ -53,7 +53,7 @@ def test_correlation_threshold_uncorrelated(): correlation_threshold_result = correlation_threshold( population_df=data_uncorrelated_df, features=data_uncorrelated_df.columns.tolist(), - samples="none", + samples="all", threshold=0.9, method="pearson", ) @@ -80,7 +80,7 @@ def test_correlation_threshold_featureinfer(): correlation_threshold_result = correlation_threshold( population_df=data_df, features="infer", - samples="none", + samples="all", threshold=0.9, method="pearson", ) @@ -93,7 +93,7 @@ def test_correlation_threshold_featureinfer(): correlation_threshold_result = correlation_threshold( population_df=data_cp_df, features="infer", - samples="none", + samples="all", threshold=0.9, method="pearson", ) diff --git a/pycytominer/tests/test_operations/test_get_na_columns.py b/pycytominer/tests/test_operations/test_get_na_columns.py index 304e7b17..9280e81e 100644 --- a/pycytominer/tests/test_operations/test_get_na_columns.py +++ b/pycytominer/tests/test_operations/test_get_na_columns.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd import pytest -from pycytominer.get_na_columns import get_na_columns +from pycytominer.operations import get_na_columns data_df = pd.DataFrame( { @@ -67,7 +67,7 @@ def test_get_na_columns_featureinfer(): with pytest.raises(AssertionError) as nocp: na_result = get_na_columns( population_df=data_df, - samples="none", + samples="all", features="infer", cutoff=0.1 ) diff --git a/pycytominer/tests/test_operations/test_variance_threshold.py b/pycytominer/tests/test_operations/test_variance_threshold.py index b5fd4bbe..b5164a3b 100644 --- a/pycytominer/tests/test_operations/test_variance_threshold.py +++ b/pycytominer/tests/test_operations/test_variance_threshold.py @@ -2,7 +2,7 @@ import pytest import numpy as np import pandas as pd -from pycytominer.variance_threshold import variance_threshold, calculate_frequency +from pycytominer.operations import variance_threshold, calculate_frequency random.seed(123) From 0f6c631d98ced6296ed94effc6f6edd88f6e9ce6 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:13:58 -0400 Subject: [PATCH 5/7] fix init imports --- pycytominer/__init__.py | 3 --- pycytominer/operations/__init__.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pycytominer/__init__.py b/pycytominer/__init__.py index da02cee2..4761b55e 100644 --- a/pycytominer/__init__.py +++ b/pycytominer/__init__.py @@ -1,11 +1,8 @@ from .aggregate import aggregate from .annotate import annotate from .audit import audit -from .correlation_threshold import correlation_threshold from .count_na_features import count_na_features from .covariance import covariance from .feature_select import feature_select -from .get_na_columns import get_na_columns from .normalize import normalize from .sparse_random_projection import sparse_random_projection -from .variance_threshold import variance_threshold diff --git a/pycytominer/operations/__init__.py b/pycytominer/operations/__init__.py index b80b79d9..8d87581e 100644 --- a/pycytominer/operations/__init__.py +++ b/pycytominer/operations/__init__.py @@ -1,3 +1,3 @@ from .correlation_threshold import correlation_threshold -from .variance_threshold import variance_threshold +from .variance_threshold import variance_threshold, calculate_frequency from .get_na_columns import get_na_columns From ff0e64c0c1d695efba0dd3c1aad4bd60fe961536 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:15:57 -0400 Subject: [PATCH 6/7] fix typo --- pycytominer/tests/test_operations/test_correlation_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/tests/test_operations/test_correlation_threshold.py b/pycytominer/tests/test_operations/test_correlation_threshold.py index cf39a14a..8499c87e 100644 --- a/pycytominer/tests/test_operations/test_correlation_threshold.py +++ b/pycytominer/tests/test_operations/test_correlation_threshold.py @@ -1,4 +1,4 @@ -samples="all"import pandas as pd +import pandas as pd import pytest from pycytominer.operations import correlation_threshold From f60cde9f452d25bd7373e8e5ee437d52c2f849e0 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Sun, 26 Jul 2020 14:21:48 -0400 Subject: [PATCH 7/7] update drop_outlier_features --- pycytominer/cyto_utils/features.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py index 11cb9467..41ef8a67 100644 --- a/pycytominer/cyto_utils/features.py +++ b/pycytominer/cyto_utils/features.py @@ -93,7 +93,7 @@ def infer_cp_features(population_df, metadata=False): def drop_outlier_features( - population_df, features="infer", samples="none", outlier_cutoff=15 + population_df, features="infer", samples="all", outlier_cutoff=15 ): """ Exclude a feature if its min or max absolute value is greater than the threshold @@ -104,14 +104,14 @@ def drop_outlier_features( if "infer", then assume cell painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_" samples - list samples to perform operation on - [default: "none"] - if "none", use all samples to calculate + [default: "all"] - if "all", use all samples to calculate outlier_cutoff - threshold to remove feature if absolute value is greater Return: list of features to exclude from the population_df """ # Subset dataframe - if samples != "none": + if samples != "all": population_df = population_df.loc[samples, :] if features == "infer":