From 271265885b3887c4bf0e34ecba0255d442ab94ff Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Thu, 12 Aug 2021 00:44:48 +0200
Subject: [PATCH] subsetBy: always match whole comma-separated entries and drop the exact_match option (related to #244)

---
 drop/config/SampleAnnotation.py               | 25 +++++++++----------
 .../submodules/MonoallelicExpression.py       | 10 ++++----
 drop/utils.py                                 | 18 ++++++-------
 3 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 63ccc280..57fa7221 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -153,13 +153,12 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
 
     ### Subsetting
 
-    def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
+    def subsetSampleAnnotation(self, column, values, subset=None):
         """
         subset by one or more values of different columns from sample file mapping
             :param column: valid column in sample annotation
             :param values: values of column to subset
             :param subset: subset sample annotation
-            :param exact_match: whether to match substrings in the sample annotation, false allows substring matching
         """
         sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS)
         if subset is None:
@@ -174,7 +173,7 @@ def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
         # check if column is valid
         if column not in sa_cols:
             raise KeyError(f"Column '{column}' not present in sample annotation.")
-        return utils.subsetBy(subset, column, values, exact_match=exact_match)
+        return utils.subsetBy(subset, column, values)
 
     def subsetFileMapping(self, file_type=None, sample_id=None):
         """
@@ -233,10 +232,9 @@ def getFilePaths(self, file_type, group=None):
         return self.getFilePath(sampleIDs, file_type, single_file=False)
 
     # build a dictionary from the drop group and column. like getImportCounts with skipping options and dict output
-    def getGenomes(self, value, group, file_type="RNA_ID",
-                            column="GENOME", group_key="DROP_GROUP",exact_match = True,skip = False):
+    def getGenomes(self, value, group, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", skip = False):
         """
-        :param value: values to match in the column. Must be an exact match, passed to subsetting sample annotation 
+        :param value: values to match in the column.
         :param group: a group of the group_key (DROP_GROUP) column. 
         :return: dict file_type to column
         """
@@ -245,24 +243,25 @@ def getGenomes(self, value, group, file_type="RNA_ID",
         if skip:
             subset = None
         else:
-            subset = self.subsetSampleAnnotation(column, value,exact_match=True)
+            subset = self.subsetSampleAnnotation(column, value)
 
         # additionally subset for the group_key and the group
-        subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match)
+        subset = self.subsetSampleAnnotation(group_key, group, subset)
 
         return {sample_id: value for sample_id in subset[file_type].tolist()}
 
     def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE",
-                            annotation_key: str = "ANNOTATION", group_key: str = "DROP_GROUP", 
+                            annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP", 
                             asSet: bool = True):
         """
-        :param annotation: annotation name as specified in config and ANNOTATION column. Can be None
-        :param group: a group of the DROP_GROUP column
+        :param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None
+        :param group: a group of the DROP_GROUP column.
         :return: set of unique external count file names
         """
+        
         #subset for the annotation_key in the annotation group and the group_key in the group
-        subset = self.subsetSampleAnnotation(annotation_key, annotation, exact_match=exact_match)
-        subset = self.subsetSampleAnnotation(group_key, group, subset, exact_match=exact_match)
+        subset = self.subsetSampleAnnotation(annotation_key, annotation)
+        subset = self.subsetSampleAnnotation(group_key, group, subset)
             
         ans = subset[file_type].tolist()
         if asSet:
diff --git a/drop/config/submodules/MonoallelicExpression.py b/drop/config/submodules/MonoallelicExpression.py
index 5d87fb96..2a9d35cc 100644
--- a/drop/config/submodules/MonoallelicExpression.py
+++ b/drop/config/submodules/MonoallelicExpression.py
@@ -34,7 +34,7 @@ def __init__(
         self.checkConfigSampleannotation()
 
     def checkConfigSampleannotation(self):
-        subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups, exact_match=False)
+        subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups)
 
         if len(self.genomeFiles.keys()) > 1:  # more than 1 value in config defined genome dictionary
             if "GENOME" not in subset.columns.values:  # GENOME column not defined
@@ -147,16 +147,16 @@ def setGenomeDict(self, genomeFiles):
         if len(genomeFiles) == 1:  # globally defined in the config
             globalGenome = list(genomeFiles.values())[0]
 
-            # subset SA by the drop group (not exact match) and skip the filtering by SA-GENOME column
+            # subset SA by the drop group and skip the filtering by SA-GENOME column
             genomeDict = self.sampleAnnotation.getGenomes(
                 globalGenome,
                 self.groups,
                 file_type="RNA_ID",
                 column="DROP_GROUP", group_key="DROP_GROUP",
-                exact_match=False, skip=True
+                skip=True
             )
         else:
-            # subset SA by the drop group (not exact match) and filter by SA-GENOME column. Must exactly match config key
+            # subset SA by the drop group and filter by SA-GENOME column. Must exactly match config key
             for gf in genomeFiles.keys():
                 genomeDict.update(
                     self.sampleAnnotation.getGenomes(
@@ -164,7 +164,7 @@ def setGenomeDict(self, genomeFiles):
                         self.groups,
                         file_type="RNA_ID",
                         column="GENOME", group_key="DROP_GROUP",
-                        exact_match=False, skip=False
+                        skip=False
                     )
                 )
 
diff --git a/drop/utils.py b/drop/utils.py
index b325f38f..5659804b 100644
--- a/drop/utils.py
+++ b/drop/utils.py
@@ -65,22 +65,20 @@ def getWBuildSnakefile(str_=True):
     return returnPath(wb_path / "wBuild.snakefile", str_=str_)
 
 
-def subsetBy(df, column, values, exact_match=True):
+def subsetBy(df, column, values):
     """
     Subset by one or more values of different columns from data frame
     :param df: data frame
     :param column: column to subset by
     :param values: values to subset by
-    :param exact_match: default True. when False match substrings. Important for subsetting drop groups
     :return: df subset by values and column
     """
     if values is None:
         return df
-    elif isinstance(values, str) and exact_match :
-        return df[df[column] == values]
-    elif not isinstance(values,str) and exact_match:
-        return df[df[column].isin(values)]
-    elif isinstance(values,str) and not exact_match:
-        return df[df[column].str.contains(values)]
-    else:
-        return df[df[column].str.contains("|".join(values))]
+    
+    import re  # function-local so the file's import block stays untouched
+    # Whole comma-separated entries only: escape metacharacters, skip NaN cells.
+    wanted = [values] if isinstance(values, str) else values
+    inner_regex = "|".join(re.escape(v) for v in wanted)
+    return df[df[column].str.contains(f"(?:^|,)(?:{inner_regex})(?:,|$)", na=False)]
+