diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..bda36cdf
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,6 @@
+include MANIFEST.in
+include LICENSE.md
+include README.md
+include setup.py
+include pycytominer/data/*
+
diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py
new file mode 100644
index 00000000..d9015fc1
--- /dev/null
+++ b/pycytominer/cyto_utils/features.py
@@ -0,0 +1,44 @@
+"""
+Utility function to manipulate cell profiler features
+"""
+
+import os
+import pandas as pd
+
+blacklist_file = os.path.join(
+    os.path.dirname(__file__), "..", "data", "blacklist_features.txt"
+)
+
+
+def get_blacklist_features(blacklist_file=blacklist_file, population_df=None):
+    """
+    Get a list of blacklist features
+
+    Arguments:
+    blacklist_file - file location of dataframe with features to exclude; it is
+                     read with pandas.read_csv and must have a "blacklist" column
+                     [default: the module-level blacklist_file path]
+    population_df - profile dataframe used to subset blacklist features [default: None]
+
+    Return:
+    list of features to exclude from downstream analysis. If population_df is a
+    pandas DataFrame, only blacklist features that are also columns of
+    population_df are returned, in blacklist file order
+
+    Raises:
+    AssertionError if the file has no column named "blacklist"
+    """
+
+    blacklist = pd.read_csv(blacklist_file)
+
+    assert "blacklist" in blacklist.columns, "one column must be named 'blacklist'"
+
+    # Select the column by label and use tolist(), which is available on every
+    # pandas release (Series.to_list only exists from pandas 0.24 onwards)
+    blacklist_features = blacklist["blacklist"].tolist()
+    if isinstance(population_df, pd.DataFrame):
+        # Build the lookup once so each membership test below is constant time
+        population_features = set(population_df.columns)
+        blacklist_features = [x for x in blacklist_features if x in population_features]
+
+    return blacklist_features
diff --git a/pycytominer/data/blacklist_features.txt b/pycytominer/data/blacklist_features.txt
new file mode 100644
index 00000000..60b1a82d
--- /dev/null
+++ b/pycytominer/data/blacklist_features.txt
@@ -0,0 +1,56 @@
+blacklist
+Nuclei_Correlation_Manders_AGP_DNA
+Nuclei_Correlation_Manders_AGP_ER
+Nuclei_Correlation_Manders_AGP_Mito
+Nuclei_Correlation_Manders_AGP_RNA
+Nuclei_Correlation_Manders_DNA_AGP
+Nuclei_Correlation_Manders_DNA_ER
+Nuclei_Correlation_Manders_DNA_Mito
+Nuclei_Correlation_Manders_DNA_RNA
+Nuclei_Correlation_Manders_ER_AGP
+Nuclei_Correlation_Manders_ER_DNA +Nuclei_Correlation_Manders_ER_Mito +Nuclei_Correlation_Manders_ER_RNA +Nuclei_Correlation_Manders_Mito_AGP +Nuclei_Correlation_Manders_Mito_DNA +Nuclei_Correlation_Manders_Mito_ER +Nuclei_Correlation_Manders_Mito_RNA +Nuclei_Correlation_Manders_RNA_AGP +Nuclei_Correlation_Manders_RNA_DNA +Nuclei_Correlation_Manders_RNA_ER +Nuclei_Correlation_Manders_RNA_Mito +Nuclei_Correlation_RWC_AGP_DNA +Nuclei_Correlation_RWC_AGP_ER +Nuclei_Correlation_RWC_AGP_Mito +Nuclei_Correlation_RWC_AGP_RNA +Nuclei_Correlation_RWC_DNA_AGP +Nuclei_Correlation_RWC_DNA_ER +Nuclei_Correlation_RWC_DNA_Mito +Nuclei_Correlation_RWC_DNA_RNA +Nuclei_Correlation_RWC_ER_AGP +Nuclei_Correlation_RWC_ER_DNA +Nuclei_Correlation_RWC_ER_Mito +Nuclei_Correlation_RWC_ER_RNA +Nuclei_Correlation_RWC_Mito_AGP +Nuclei_Correlation_RWC_Mito_DNA +Nuclei_Correlation_RWC_Mito_ER +Nuclei_Correlation_RWC_Mito_RNA +Nuclei_Correlation_RWC_RNA_AGP +Nuclei_Correlation_RWC_RNA_DNA +Nuclei_Correlation_RWC_RNA_ER +Nuclei_Correlation_RWC_RNA_Mito +Nuclei_Granularity_14_AGP +Nuclei_Granularity_14_DNA +Nuclei_Granularity_14_ER +Nuclei_Granularity_14_Mito +Nuclei_Granularity_14_RNA +Nuclei_Granularity_15_AGP +Nuclei_Granularity_15_DNA +Nuclei_Granularity_15_ER +Nuclei_Granularity_15_Mito +Nuclei_Granularity_15_RNA +Nuclei_Granularity_16_AGP +Nuclei_Granularity_16_DNA +Nuclei_Granularity_16_ER +Nuclei_Granularity_16_Mito +Nuclei_Granularity_16_RNA diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py index 6af6fe98..c6fa6292 100644 --- a/pycytominer/feature_select.py +++ b/pycytominer/feature_select.py @@ -2,12 +2,14 @@ Select features to use in downstream analysis based on specified selection method """ +import os import pandas as pd from pycytominer.correlation_threshold import correlation_threshold from pycytominer.variance_threshold import variance_threshold from pycytominer.get_na_columns import get_na_columns from pycytominer.cyto_utils.compress import compress +from 
pycytominer.cyto_utils.features import get_blacklist_features def feature_select( @@ -41,8 +43,14 @@ def feature_select( unique_cut = kwargs.pop("unique_cut", 0.1) how = kwargs.pop("how", None) float_format = kwargs.pop("float_format", None) + blacklist_file = kwargs.pop("blacklist_file", None) - all_ops = ["variance_threshold", "correlation_threshold", "drop_na_columns"] + all_ops = [ + "variance_threshold", + "correlation_threshold", + "drop_na_columns", + "blacklist", + ] # Make sure the user provides a supported operation if isinstance(operation, list): @@ -94,6 +102,12 @@ def feature_select( threshold=corr_threshold, method=corr_method, ) + elif op == "blacklist": + if blacklist_file: + exclude = get_blacklist_features(population_df=profiles, blacklist_file=blacklist_file) + else: + exclude = get_blacklist_features(population_df=profiles) + excluded_features += exclude excluded_features = list(set(excluded_features)) diff --git a/pycytominer/tests/test_feature_blacklist.py b/pycytominer/tests/test_feature_blacklist.py new file mode 100644 index 00000000..cf08207a --- /dev/null +++ b/pycytominer/tests/test_feature_blacklist.py @@ -0,0 +1,30 @@ +import os +import random +import pytest +import tempfile +import warnings +import pandas as pd +from pycytominer.cyto_utils.features import get_blacklist_features + +blacklist_file = os.path.join( + os.path.dirname(__file__), "..", "data", "blacklist_features.txt" +) + +blacklist = pd.read_csv(blacklist_file).blacklist.tolist() + +data_blacklist_df = pd.DataFrame( + { + "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2], + "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9], + } +).reset_index(drop=True) + + +def test_blacklist(): # default file: expect the full packaged blacklist + blacklist_from_func = get_blacklist_features() + assert blacklist == blacklist_from_func + + +def test_blacklist_df(): # population_df given: expect only the blacklisted columns it contains + blacklist_from_func = get_blacklist_features(population_df=data_blacklist_df) + assert data_blacklist_df.columns.tolist() == blacklist_from_func diff --git 
a/pycytominer/tests/test_feature_select.py b/pycytominer/tests/test_feature_select.py index 6dcccc3d..840201ee 100644 --- a/pycytominer/tests/test_feature_select.py +++ b/pycytominer/tests/test_feature_select.py @@ -165,3 +165,22 @@ def test_feature_select_compress(): result = pd.read_csv(compress_file) pd.testing.assert_frame_equal(result, expected_result) + + +def test_feature_select_blacklist(): + """ + Testing feature_select with the blacklist operation: blacklisted feature columns are dropped + """ + + data_blacklist_df = pd.DataFrame( + { + "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2], + "y": [1, 2, 8, 5, 2, 1], + "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9], + "zz": [0, -3, 8, 9, 6, 9], + } + ).reset_index(drop=True) + + result = feature_select(data_blacklist_df, operation="blacklist") + expected_result = pd.DataFrame({"y": [1, 2, 8, 5, 2, 1], "zz": [0, -3, 8, 9, 6, 9]}) + pd.testing.assert_frame_equal(result, expected_result) diff --git a/setup.py b/setup.py index 8611f57f..d93bbee3 100644 --- a/setup.py +++ b/setup.py @@ -19,4 +19,5 @@ license="BSD 3-Clause License", install_requires=["numpy", "pandas", "scikit-learn", "sqlalchemy"], python_requires=">=3.4", + include_package_data=True, )