Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to "samples='all'" as default and move operations to subfolder #88

Merged
merged 7 commits into from
Jul 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions pycytominer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from .aggregate import aggregate
from .annotate import annotate
from .audit import audit
from .correlation_threshold import correlation_threshold
from .count_na_features import count_na_features
from .covariance import covariance
from .feature_select import feature_select
from .get_na_columns import get_na_columns
from .normalize import normalize
from .sparse_random_projection import sparse_random_projection
from .variance_threshold import variance_threshold
6 changes: 3 additions & 3 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def infer_cp_features(population_df, metadata=False):


def drop_outlier_features(
population_df, features="infer", samples="none", outlier_cutoff=15
population_df, features="infer", samples="all", outlier_cutoff=15
):
"""
Exclude a feature if its min or max absolute value is greater than the threshold
Expand All @@ -104,14 +104,14 @@ def drop_outlier_features(
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_"
samples - list samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
[default: "all"] - if "all", use all samples to calculate
outlier_cutoff - threshold to remove feature if absolute value is greater

Return:
list of features to exclude from the population_df
"""
# Subset dataframe
if samples != "none":
if samples != "all":
population_df = population_df.loc[samples, :]

if features == "infer":
Expand Down
12 changes: 7 additions & 5 deletions pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import os
import pandas as pd

from pycytominer.correlation_threshold import correlation_threshold
from pycytominer.variance_threshold import variance_threshold
from pycytominer.get_na_columns import get_na_columns
from pycytominer.operations import (
correlation_threshold,
variance_threshold,
get_na_columns,
)
from pycytominer.cyto_utils import (
load_profiles,
output,
Expand All @@ -20,7 +22,7 @@
def feature_select(
profiles,
features="infer",
samples="none",
samples="all",
operation="variance_threshold",
output_file="none",
na_cutoff=0.05,
Expand All @@ -42,7 +44,7 @@ def feature_select(
if "infer", then assume cell painting features are those that start with
"Cells", "Nuclei", or "Cytoplasm"
samples - if provided, a list of samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
[default: "all"] - if "all", use all samples to calculate
operation - str or list of given operations to perform on input profiles
output_file - [default: "none"] if provided, will write feature-selected profiles to file
if not specified, will return the feature-selected profiles. We recommend
Expand Down
3 changes: 3 additions & 0 deletions pycytominer/operations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .correlation_threshold import correlation_threshold
from .variance_threshold import variance_threshold, calculate_frequency
from .get_na_columns import get_na_columns
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

import numpy as np
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features
from pycytominer.cyto_utils.util import (
from pycytominer.cyto_utils import (
infer_cp_features,
get_pairwise_correlation,
check_correlation_method,
)


def correlation_threshold(
population_df, features="infer", samples="none", threshold=0.9, method="pearson"
population_df, features="infer", samples="all", threshold=0.9, method="pearson"
):
"""
Exclude features that have correlations above a certain threshold
Expand All @@ -38,7 +38,7 @@ def correlation_threshold(
assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"

# Subset dataframe and calculate correlation matrix across subset features
if samples != "none":
if samples != "all":
population_df = population_df.loc[samples, :]

if features == "infer":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pycytominer.cyto_utils.features import infer_cp_features


def get_na_columns(population_df, features="infer", samples="none", cutoff=0.05):
def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
"""
Get features that have more NA values than cutoff defined

Expand All @@ -17,14 +17,14 @@ def get_na_columns(population_df, features="infer", samples="none", cutoff=0.05)
if "infer", then assume cell painting features are those that
start with "Cells", "Nuclei", or "Cytoplasm"
samples - if provided, a list of samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
[default: "all"] - if "all", use all samples to calculate
cutoff - float to exclude features that have a higher proportion of missingness

Output:
A list of the features to exclude
"""

if samples != "none":
if samples != "all":
population_df = population_df.loc[samples, :]

if features == "infer":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

import numpy as np
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features
from pycytominer.cyto_utils import infer_cp_features


def variance_threshold(
population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01
population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01
):
"""
Exclude features that have low variance (low information content)
Expand All @@ -20,7 +20,7 @@ def variance_threshold(
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_"
samples - list samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
[default: "all"] - if "all", use all samples to calculate
freq_cut - float of ratio (second most common feature value / most common) [default: 0.05]
unique_cut - float of ratio (num unique features / num samples) [default: 0.01]

Expand All @@ -32,7 +32,7 @@ def variance_threshold(
assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)"

# Subset dataframe
if samples != "none":
if samples != "all":
population_df = population_df.loc[samples, :]

if features == "infer":
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
import pytest
from pycytominer.correlation_threshold import correlation_threshold
from pycytominer.operations import correlation_threshold

# Build data to use in tests
data_df = pd.DataFrame(
Expand All @@ -27,7 +27,7 @@ def test_correlation_threshold():
correlation_threshold_result = correlation_threshold(
population_df=data_df,
features=data_df.columns.tolist(),
samples="none",
samples="all",
threshold=0.9,
method="pearson",
)
Expand All @@ -39,7 +39,7 @@ def test_correlation_threshold():
correlation_threshold_result = correlation_threshold(
population_df=data_df,
features=data_df.columns.tolist(),
samples="none",
samples="all",
threshold=0.2,
method="pearson",
)
Expand All @@ -53,7 +53,7 @@ def test_correlation_threshold_uncorrelated():
correlation_threshold_result = correlation_threshold(
population_df=data_uncorrelated_df,
features=data_uncorrelated_df.columns.tolist(),
samples="none",
samples="all",
threshold=0.9,
method="pearson",
)
Expand All @@ -80,7 +80,7 @@ def test_correlation_threshold_featureinfer():
correlation_threshold_result = correlation_threshold(
population_df=data_df,
features="infer",
samples="none",
samples="all",
threshold=0.9,
method="pearson",
)
Expand All @@ -93,7 +93,7 @@ def test_correlation_threshold_featureinfer():
correlation_threshold_result = correlation_threshold(
population_df=data_cp_df,
features="infer",
samples="none",
samples="all",
threshold=0.9,
method="pearson",
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from pycytominer.get_na_columns import get_na_columns
from pycytominer.operations import get_na_columns

data_df = pd.DataFrame(
{
Expand Down Expand Up @@ -67,7 +67,7 @@ def test_get_na_columns_featureinfer():
with pytest.raises(AssertionError) as nocp:
na_result = get_na_columns(
population_df=data_df,
samples="none",
samples="all",
features="infer",
cutoff=0.1
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest
import numpy as np
import pandas as pd
from pycytominer.variance_threshold import variance_threshold, calculate_frequency
from pycytominer.operations import variance_threshold, calculate_frequency

random.seed(123)

Expand Down