cytomining · gwaybio · Jul 26, 2020 · Jul 26, 2020 · Jul 26, 2020 · Jul 26, 2020
diff --git a/pycytominer/__init__.py b/pycytominer/__init__.py
@@ -1,11 +1,8 @@
 from .aggregate import aggregate
 from .annotate import annotate
 from .audit import audit
-from .correlation_threshold import correlation_threshold
 from .count_na_features import count_na_features
 from .covariance import covariance
 from .feature_select import feature_select
-from .get_na_columns import get_na_columns
 from .normalize import normalize
 from .sparse_random_projection import sparse_random_projection
-from .variance_threshold import variance_threshold
diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py
@@ -93,7 +93,7 @@ def infer_cp_features(population_df, metadata=False):
 
 
 def drop_outlier_features(
-    population_df, features="infer", samples="none", outlier_cutoff=15
+    population_df, features="infer", samples="all", outlier_cutoff=15
 ):
     """
     Exclude a feature if its min or max absolute value is greater than the threshold
@@ -104,14 +104,14 @@ def drop_outlier_features(
                if "infer", then assume cell painting features are those that start with
                "Cells_", "Nuclei_", or "Cytoplasm_"
     samples - list of samples to perform operation on
-              [default: "none"] - if "none", use all samples to calculate
+              [default: "all"] - if "all", use all samples to calculate
     outlier_cutoff - threshold to remove feature if absolute value is greater
 
     Return:
     list of features to exclude from the population_df
     """
     # Subset dataframe
-    if samples != "none":
+    if samples != "all":
         population_df = population_df.loc[samples, :]
 
     if features == "infer":

diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
@@ -5,9 +5,11 @@
 import os
 import pandas as pd
 
-from pycytominer.correlation_threshold import correlation_threshold
-from pycytominer.variance_threshold import variance_threshold
-from pycytominer.get_na_columns import get_na_columns
+from pycytominer.operations import (
+    correlation_threshold,
+    variance_threshold,
+    get_na_columns,
+)
 from pycytominer.cyto_utils import (
     load_profiles,
     output,
@@ -20,7 +22,7 @@
 def feature_select(
     profiles,
     features="infer",
-    samples="none",
+    samples="all",
     operation="variance_threshold",
     output_file="none",
     na_cutoff=0.05,
@@ -42,7 +44,7 @@ def feature_select(
                if "infer", then assume cell painting features are those that start with
                "Cells", "Nuclei", or "Cytoplasm"
     samples - if provided, a list of samples to perform operation on
-              [default: "none"] - if "none", use all samples to calculate
+              [default: "all"] - if "all", use all samples to calculate
     operation - str or list of given operations to perform on input profiles
     output_file - [default: "none"] if provided, will write annotated profiles to file
                   if not specified, will return the annotated profiles. We recommend

diff --git a/pycytominer/operations/__init__.py b/pycytominer/operations/__init__.py
@@ -0,0 +1,3 @@
+from .correlation_threshold import correlation_threshold
+from .variance_threshold import variance_threshold, calculate_frequency
+from .get_na_columns import get_na_columns
diff --git a/pycytominer/correlation_threshold.py → ...miner/operations/correlation_threshold.py b/pycytominer/correlation_threshold.py → ...miner/operations/correlation_threshold.py
@@ -5,15 +5,15 @@
 
 import numpy as np
 import pandas as pd
-from pycytominer.cyto_utils.features import infer_cp_features
-from pycytominer.cyto_utils.util import (
+from pycytominer.cyto_utils import (
+    infer_cp_features,
     get_pairwise_correlation,
     check_correlation_method,
 )
 
 
 def correlation_threshold(
-    population_df, features="infer", samples="none", threshold=0.9, method="pearson"
+    population_df, features="infer", samples="all", threshold=0.9, method="pearson"
 ):
     """
     Exclude features that have correlations above a certain threshold
@@ -38,7 +38,7 @@ def correlation_threshold(
     assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"
 
     # Subset dataframe and calculate correlation matrix across subset features
-    if samples != "none":
+    if samples != "all":
         population_df = population_df.loc[samples, :]
 
     if features == "infer":

diff --git a/pycytominer/get_na_columns.py → pycytominer/operations/get_na_columns.py b/pycytominer/get_na_columns.py → pycytominer/operations/get_na_columns.py
@@ -7,7 +7,7 @@
 from pycytominer.cyto_utils.features import infer_cp_features
 
 
-def get_na_columns(population_df, features="infer", samples="none", cutoff=0.05):
+def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
     """
     Get features that have more NA values than cutoff defined
 
@@ -17,14 +17,14 @@ def get_na_columns(population_df, features="infer", samples="none", cutoff=0.05)
                if "infer", then assume cell painting features are those that
                start with "Cells", "Nuclei", or "Cytoplasm"
     samples - if provided, a list of samples to perform operation on
-              [default: "none"] - if "none", use all samples to calculate
+              [default: "all"] - if "all", use all samples to calculate
     cutoff - float to exclude features that have a higher proportion of missingness
 
     Output:
     A list of the features to exclude
     """
 
-    if samples != "none":
+    if samples != "all":
         population_df = population_df.loc[samples, :]
 
     if features == "infer":

diff --git a/pycytominer/variance_threshold.py → pycytominer/operations/variance_threshold.py b/pycytominer/variance_threshold.py → pycytominer/operations/variance_threshold.py
@@ -5,11 +5,11 @@
 
 import numpy as np
 import pandas as pd
-from pycytominer.cyto_utils.features import infer_cp_features
+from pycytominer.cyto_utils import infer_cp_features
 
 
 def variance_threshold(
-    population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01
+    population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01
 ):
     """
     Exclude features that have low variance (low information content)
@@ -20,7 +20,7 @@ def variance_threshold(
                if "infer", then assume cell painting features are those that start with
                "Cells_", "Nuclei_", or "Cytoplasm_"
     samples - list of samples to perform operation on
-              [default: "none"] - if "none", use all samples to calculate
+              [default: "all"] - if "all", use all samples to calculate
     freq_cut - float of ratio (second most common feature value / most common) [default: 0.05]
     unique_cut - float of ratio (num unique features / num samples) [default: 0.01]
 
@@ -32,7 +32,7 @@ def variance_threshold(
     assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)"
 
     # Subset dataframe
-    if samples != "none":
+    if samples != "all":
         population_df = population_df.loc[samples, :]
 
     if features == "infer":

diff --git a/pycytominer/tests/test_operations/__init__.py b/pycytominer/tests/test_operations/__init__.py
diff --git a/...miner/tests/test_correlation_threshold.py → ..._operations/test_correlation_threshold.py b/...miner/tests/test_correlation_threshold.py → ..._operations/test_correlation_threshold.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import pytest
-from pycytominer.correlation_threshold import correlation_threshold
+from pycytominer.operations import correlation_threshold
 
 # Build data to use in tests
 data_df = pd.DataFrame(
@@ -27,7 +27,7 @@ def test_correlation_threshold():
     correlation_threshold_result = correlation_threshold(
         population_df=data_df,
         features=data_df.columns.tolist(),
-        samples="none",
+        samples="all",
         threshold=0.9,
         method="pearson",
     )
@@ -39,7 +39,7 @@ def test_correlation_threshold():
     correlation_threshold_result = correlation_threshold(
         population_df=data_df,
         features=data_df.columns.tolist(),
-        samples="none",
+        samples="all",
         threshold=0.2,
         method="pearson",
     )
@@ -53,7 +53,7 @@ def test_correlation_threshold_uncorrelated():
     correlation_threshold_result = correlation_threshold(
         population_df=data_uncorrelated_df,
         features=data_uncorrelated_df.columns.tolist(),
-        samples="none",
+        samples="all",
         threshold=0.9,
         method="pearson",
     )
@@ -80,7 +80,7 @@ def test_correlation_threshold_featureinfer():
         correlation_threshold_result = correlation_threshold(
             population_df=data_df,
             features="infer",
-            samples="none",
+            samples="all",
             threshold=0.9,
             method="pearson",
         )
@@ -93,7 +93,7 @@ def test_correlation_threshold_featureinfer():
     correlation_threshold_result = correlation_threshold(
         population_df=data_cp_df,
         features="infer",
-        samples="none",
+        samples="all",
         threshold=0.9,
         method="pearson",
     )

diff --git a/pycytominer/tests/test_get_na_columns.py → ...ts/test_operations/test_get_na_columns.py b/pycytominer/tests/test_get_na_columns.py → ...ts/test_operations/test_get_na_columns.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-from pycytominer.get_na_columns import get_na_columns
+from pycytominer.operations import get_na_columns
 
 data_df = pd.DataFrame(
     {
@@ -67,7 +67,7 @@ def test_get_na_columns_featureinfer():
     with pytest.raises(AssertionError) as nocp:
         na_result = get_na_columns(
             population_df=data_df,
-            samples="none",
+            samples="all",
             features="infer",
             cutoff=0.1
         )

diff --git a/pycytominer/tests/test_variance_threshold.py → ...est_operations/test_variance_threshold.py b/pycytominer/tests/test_variance_threshold.py → ...est_operations/test_variance_threshold.py
@@ -2,7 +2,7 @@
 import pytest
 import numpy as np
 import pandas as pd
-from pycytominer.variance_threshold import variance_threshold, calculate_frequency
+from pycytominer.operations import variance_threshold, calculate_frequency
 
 random.seed(123)