diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index fd9940e..e2a1b16 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -23,7 +23,7 @@ jobs: run_tests: strategy: matrix: - python_version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python_version: ["3.9", "3.10", "3.11", "3.12"] os: [ubuntu-22.04, macos-13] runs-on: ${{ matrix.os }} env: diff --git a/pyproject.toml b/pyproject.toml index 6a73b0b..64a5088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,12 +47,16 @@ select = [ # flake8-comprehensions "C4", # flake8-simplify - "SIM" + "SIM", + # flake8-annotations + "ANN" ] [tool.ruff.lint.per-file-ignores] # Ignore `E402` and `F401` (unused imports) in all `__init__.py` files "__init__.py" = ["E402", "F401"] +# ignore typing rules for tests +"tests/*" = ["ANN201"] # set dynamic versioning capabilities for project [tool.poetry-dynamic-versioning] diff --git a/src/cosmicqc/analyze.py b/src/cosmicqc/analyze.py index 0c41cdd..9f33c4b 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -16,12 +16,12 @@ ) -def find_outliers( +def identify_outliers( df: pd.DataFrame, - metadata_columns: List[str], feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, -) -> pd.DataFrame: + include_threshold_scores: bool = False, +) -> Union[pd.Series, pd.DataFrame]: """ This function uses z-scoring to format the data for detecting outlier nuclei or cells using specific CellProfiler features. Thresholds are @@ -47,10 +47,21 @@ def find_outliers( defined within a file. Returns: - pd.DataFrame: - Outlier data frame for the given conditions. + Union[pd.Series, pd.DataFrame]: + Outlier series with booleans based on whether outliers were detected + or not for use within other functions. """ + # create a copy of the dataframe to ensure + # we don't modify the supplied dataframe inplace. 
+ outlier_df = df.copy() + + thresholds_name = ( + f"outlier_{feature_thresholds}" + if isinstance(feature_thresholds, str) + else "outlier_custom" + ) + if isinstance(feature_thresholds, str): feature_thresholds = read_thresholds_set_from_file( feature_thresholds=feature_thresholds, @@ -62,7 +73,7 @@ def find_outliers( for feature in feature_thresholds: if feature not in df.columns: raise ValueError(f"Feature '{feature}' does not exist in the DataFrame.") - df[f"Z_Score_{feature}"] = scipy_zscore(df[feature]) + outlier_df[f"Z_Score_{feature}"] = scipy_zscore(df[feature]) zscore_columns[feature] = f"Z_Score_{feature}" # Create outlier detection conditions for each feature @@ -71,15 +82,77 @@ def find_outliers( # For positive thresholds, look for outliers that are # that number of std "above" the mean if threshold > 0: - condition = df[zscore_columns[feature]] > threshold + condition = outlier_df[zscore_columns[feature]] > threshold # For negative thresholds, look for outliers that are # that number of std "below" the mean else: - condition = df[zscore_columns[feature]] < threshold + condition = outlier_df[zscore_columns[feature]] < threshold conditions.append(condition) + return ( + # create a boolean pd.series identifier for dataframe + # based on all conditions for use within other functions. 
+ reduce(operator.and_, conditions) + if not include_threshold_scores + # otherwise, provide the threshold zscore col and the above column + else pd.concat( + [ + # grab only the outlier zscore columns from the outlier_df + outlier_df[zscore_columns.values()], + pd.DataFrame({thresholds_name: reduce(operator.and_, conditions)}), + ], + axis=1, + ) + ) + + +def find_outliers( + df: pd.DataFrame, + metadata_columns: List[str], + feature_thresholds: Union[Dict[str, float], str], + feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, +) -> pd.DataFrame: + """ + This function uses identify_outliers to return a dataframe + with only the outliers and provided metadata columns. + + Args: + df: pd.DataFrame + Data frame with converted output from CytoTable. + metadata_columns: List[str] + List of metadata columns that should be outputted with the outlier data. + feature_thresholds: Dict[str, float] + One of two options: + A dictionary with the feature name(s) as the key(s) and their assigned + threshold for identifying outliers. Positive int for the threshold + will detect outliers "above" the mean, negative int will detect + outliers "below" the mean. + Or a string which is a named key reference found within + the feature_thresholds_file yaml file. + feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, + An optional feature thresholds file where thresholds may be + defined within a file. + + Returns: + pd.DataFrame: + Outlier data frame for the given conditions. 
+ """ + + if isinstance(feature_thresholds, str): + feature_thresholds = read_thresholds_set_from_file( + feature_thresholds=feature_thresholds, + feature_thresholds_file=feature_thresholds_file, + ) + # Filter DataFrame for outliers using all conditions - outliers_df = df[reduce(operator.and_, conditions)] + outliers_df = df[ + # use identify outliers as a mask on the full dataframe + identify_outliers( + df=df, + feature_thresholds=feature_thresholds, + feature_thresholds_file=feature_thresholds_file, + ) + ] # Print outliers count and range for each feature print("Number of outliers:", outliers_df.shape[0]) @@ -95,9 +168,95 @@ def find_outliers( return outliers_df[columns_to_include] +def label_outliers( + df: pd.DataFrame, + feature_thresholds: Optional[Union[Dict[str, float], str]] = None, + feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, + include_threshold_scores: bool = False, +) -> pd.DataFrame: + """ + Use identify_outliers to label each cell in the original dataset + as having passed or failed the quality control condition(s). + + Args: + df: pd.DataFrame + Data frame with converted output from CytoTable. + feature_thresholds: Dict[str, float] + One of two options: + A dictionary with the feature name(s) as the key(s) and their assigned + threshold for identifying outliers. Positive int for the threshold + will detect outliers "above" the mean, negative int will detect + outliers "below" the mean. + Or a string which is a named key reference found within + the feature_thresholds_file yaml file. + feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, + An optional feature thresholds file where thresholds may be + defined within a file. + include_threshold_scores: bool = False + Whether to include the scores in addition to whether an outlier + was detected or not. + + Returns: + pd.DataFrame: + Full dataframe with optional scores and outlier boolean column. 
+ """ + + # for single outlier processing + if isinstance(feature_thresholds, (str, dict)): + # return the outlier dataframe for one threshold rule + identified_outliers = identify_outliers( + df=df, + feature_thresholds=feature_thresholds, + feature_thresholds_file=feature_thresholds_file, + include_threshold_scores=include_threshold_scores, + ) + return pd.concat( + [ + df, + ( + identified_outliers + if isinstance(identified_outliers, pd.DataFrame) + else pd.DataFrame( + { + ( + f"outlier_{feature_thresholds}" + if isinstance(feature_thresholds, str) + else "outlier_custom" + ): identified_outliers + } + ) + ), + ], + axis=1, + ) + + # for multiple outlier processing + elif feature_thresholds is None: + # return the outlier dataframe for all threshold rules + labeled_df = pd.concat( + [df] + + [ + # identify outliers for each threshold rule + identify_outliers( + df=df, + feature_thresholds=thresholds, + feature_thresholds_file=feature_thresholds_file, + include_threshold_scores=include_threshold_scores, + ) + # loop through each threshold rule + for thresholds in read_thresholds_set_from_file( + feature_thresholds_file=feature_thresholds_file, + ) + ], + axis=1, + ) + # return a dataframe with columns deduplicated by name + return labeled_df.loc[:, ~labeled_df.columns.duplicated()] + + def read_thresholds_set_from_file( - feature_thresholds: str, feature_thresholds_file: str -): + feature_thresholds_file: str, feature_thresholds: Optional[str] = None +) -> Union[Dict[str, int], Dict[str, Dict[str, int]]]: """ Reads a set of feature thresholds from a specified file. @@ -106,10 +265,11 @@ def read_thresholds_set_from_file( the thresholds set from the file. Args: - feature_thresholds (str): - A string specifying the feature thresholds. feature_thresholds_file (str): The path to the file containing feature thresholds. + feature_thresholds (Optional str, default None): + A string specifying the feature thresholds. + If None, return all thresholds. 
Returns: dict: A dictionary containing the processed feature thresholds. @@ -118,9 +278,14 @@ def read_thresholds_set_from_file( LookupError: If the file does not contain the specified feature_thresholds key. """ + # open the yaml file with open(feature_thresholds_file, "r") as file: thresholds = yaml.safe_load(file) + # if no feature thresholds name is specified, return all thresholds + if feature_thresholds is None: + return thresholds["thresholds"] + if feature_thresholds not in thresholds["thresholds"]: raise LookupError( ( diff --git a/tests/conftest.py b/tests/conftest.py index 3adc367..a8f796a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,3 +16,11 @@ def fixture_cytotable_CFReT_df(): return pd.read_parquet( "tests/data/cytotable/CFRet_data/test_localhost231120090001_converted.parquet" ) + + +@pytest.fixture(name="basic_outlier_dataframe") +def fixture_basic_outlier_dataframe(): + """ + Creates basic example data for use in tests + """ + return pd.DataFrame({"example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) diff --git a/tests/data/coSMicQC/test_identifier_outliers_output.parquet b/tests/data/coSMicQC/test_identifier_outliers_output.parquet new file mode 100644 index 0000000..3412884 Binary files /dev/null and b/tests/data/coSMicQC/test_identifier_outliers_output.parquet differ diff --git a/tests/data/coSMicQC/test_label_outliers_output.parquet b/tests/data/coSMicQC/test_label_outliers_output.parquet new file mode 100644 index 0000000..00dbdd1 Binary files /dev/null and b/tests/data/coSMicQC/test_label_outliers_output.parquet differ diff --git a/tests/test_analyze.py b/tests/test_analyze.py index 49f949b..6c7a879 100644 --- a/tests/test_analyze.py +++ b/tests/test_analyze.py @@ -7,6 +7,26 @@ from cosmicqc import analyze +def test_find_outliers_basic(basic_outlier_dataframe: pd.DataFrame): + """ + Testing find_outliers with basic/simulated data. 
+ """ + + # add metadata to basic data + metadata_column_name = "Image_Metadata_Plate" + basic_outlier_dataframe[metadata_column_name] = "A" + + # assert that we have the output we expect + assert analyze.find_outliers( + df=basic_outlier_dataframe, + feature_thresholds={"example_feature": 1}, + metadata_columns=[metadata_column_name], + ).to_dict(orient="dict") == { + "example_feature": {8: 9, 9: 10}, + "Image_Metadata_Plate": {8: "A", 9: "A"}, + } + + def test_find_outliers_cfret(cytotable_CFReT_data_df: pd.DataFrame): """ Testing find_outliers with CytoTable CFReT data. @@ -215,6 +235,17 @@ def test_read_thresholds_set_from_file(): feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE, ) == {"Nuclei_AreaShape_Area": 2, "Nuclei_AreaShape_FormFactor": -2} + assert analyze.read_thresholds_set_from_file( + feature_thresholds_file=analyze.DEFAULT_QC_THRESHOLD_FILE, + ) == { + "small_and_low_formfactor_nuclei": { + "Nuclei_AreaShape_Area": -1, + "Nuclei_AreaShape_FormFactor": -1, + }, + "elongated_nuclei": {"Nuclei_AreaShape_Eccentricity": 2}, + "large_nuclei": {"Nuclei_AreaShape_Area": 2, "Nuclei_AreaShape_FormFactor": -2}, + } + def test_find_outliers_dict_and_default_config_cfret( cytotable_CFReT_data_df: pd.DataFrame, @@ -280,3 +311,172 @@ def test_find_outliers_dict_and_default_config_cfret( metadata_columns=metadata_columns, ), ) + + +def test_label_outliers( + basic_outlier_dataframe: pd.DataFrame, + cytotable_CFReT_data_df: pd.DataFrame, +): + """ + Tests label_outliers + """ + + # test basic single-column result with zscores + assert analyze.label_outliers( + df=basic_outlier_dataframe, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=True, + ).to_dict(orient="dict") == { + "example_feature": { + 0: 1, + 1: 2, + 2: 3, + 3: 4, + 4: 5, + 5: 6, + 6: 7, + 7: 8, + 8: 9, + 9: 10, + }, + "Z_Score_example_feature": { + 0: -1.5666989036012806, + 1: -1.2185435916898848, + 2: -0.8703882797784892, + 3: -0.5222329678670935, + 4: 
-0.17407765595569785, + 5: 0.17407765595569785, + 6: 0.5222329678670935, + 7: 0.8703882797784892, + 8: 1.2185435916898848, + 9: 1.5666989036012806, + }, + "outlier_custom": { + 0: False, + 1: False, + 2: False, + 3: False, + 4: False, + 5: False, + 6: False, + 7: False, + 8: True, + 9: True, + }, + } + + # test for case when zscores are excluded + assert analyze.label_outliers( + df=basic_outlier_dataframe, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=False, + ).to_dict(orient="dict") == { + "example_feature": { + 0: 1, + 1: 2, + 2: 3, + 3: 4, + 4: 5, + 5: 6, + 6: 7, + 7: 8, + 8: 9, + 9: 10, + }, + "outlier_custom": { + 0: False, + 1: False, + 2: False, + 3: False, + 4: False, + 5: False, + 6: False, + 7: False, + 8: True, + 9: True, + }, + } + + # test single-column result + test_df = analyze.label_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds="large_nuclei", + include_threshold_scores=True, + ) + pd.testing.assert_frame_equal( + test_df, + pd.read_parquet( + path="tests/data/coSMicQC/test_label_outliers_output.parquet", + columns=test_df.columns.tolist(), + ), + ) + + # test full dataset + pd.testing.assert_frame_equal( + analyze.label_outliers( + df=cytotable_CFReT_data_df, include_threshold_scores=True + ), + pd.read_parquet(path="tests/data/coSMicQC/test_label_outliers_output.parquet"), + ) + + +def test_identify_outliers( + basic_outlier_dataframe: pd.DataFrame, + cytotable_CFReT_data_df: pd.DataFrame, +): + """ + Tests identify_outliers + """ + + assert analyze.identify_outliers( + df=basic_outlier_dataframe, + feature_thresholds={"example_feature": 1}, + include_threshold_scores=True, + ).to_dict(orient="dict") == { + "Z_Score_example_feature": { + 0: -1.5666989036012806, + 1: -1.2185435916898848, + 2: -0.8703882797784892, + 3: -0.5222329678670935, + 4: -0.17407765595569785, + 5: 0.17407765595569785, + 6: 0.5222329678670935, + 7: 0.8703882797784892, + 8: 1.2185435916898848, + 9: 1.5666989036012806, + }, + 
"outlier_custom": { + 0: False, + 1: False, + 2: False, + 3: False, + 4: False, + 5: False, + 6: False, + 7: False, + 8: True, + 9: True, + }, + } + + pd.testing.assert_frame_equal( + analyze.identify_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds="large_nuclei", + include_threshold_scores=True, + ), + pd.read_parquet("tests/data/coSMicQC/test_identifier_outliers_output.parquet"), + ) + + identified_df = analyze.identify_outliers( + df=cytotable_CFReT_data_df, + feature_thresholds="large_nuclei", + ) + pd.testing.assert_series_equal( + identified_df, + pd.read_parquet( + "tests/data/coSMicQC/test_identifier_outliers_output.parquet", + columns=["outlier_large_nuclei"], + )["outlier_large_nuclei"], + check_names=False, + )