From 271265885b3887c4bf0e34ecba0255d442ab94ff Mon Sep 17 00:00:00 2001 From: Christian Mertes Date: Thu, 12 Aug 2021 00:44:48 +0200 Subject: [PATCH] use only exact matching in subsetBy related to #244 --- drop/config/SampleAnnotation.py | 25 +++++++++---------- .../submodules/MonoallelicExpression.py | 10 ++++---- drop/utils.py | 18 ++++++------- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py index 63ccc280..57fa7221 100644 --- a/drop/config/SampleAnnotation.py +++ b/drop/config/SampleAnnotation.py @@ -153,13 +153,12 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','): ### Subsetting - def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True): + def subsetSampleAnnotation(self, column, values, subset=None): """ subset by one or more values of different columns from sample file mapping :param column: valid column in sample annotation :param values: values of column to subset :param subset: subset sample annotation - :param exact_match: whether to match substrings in the sample annotation, false allows substring matching """ sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS) if subset is None: @@ -174,7 +173,7 @@ def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True): # check if column is valid if column not in sa_cols: raise KeyError(f"Column '{column}' not present in sample annotation.") - return utils.subsetBy(subset, column, values, exact_match=exact_match) + return utils.subsetBy(subset, column, values) def subsetFileMapping(self, file_type=None, sample_id=None): """ @@ -233,10 +232,9 @@ def getFilePaths(self, file_type, group=None): return self.getFilePath(sampleIDs, file_type, single_file=False) # build a dictionary from the drop group and column. like getImportCounts with skipping options and dict output - def getGenomes(self, value, group, file_type="RNA_ID", - column="GENOME", group_key="DROP_GROUP",exact_match = True,skip = False): + def getGenomes(self, value, group, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", skip = False): """ - :param value: values to match in the column. Must be an exact match, passed to subsetting sample annotation + :param value: values to match in the column. :param group: a group of the group_key (DROP_GROUP) column. :return: dict file_type to column """ @@ -245,24 +243,25 @@ def getGenomes(self, value, group, file_type="RNA_ID", if skip: subset = None else: - subset = self.subsetSampleAnnotation(column, value,exact_match=True) + subset = self.subsetSampleAnnotation(column, value) # additionally subset for the group_key and the group - subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match) + subset = self.subsetSampleAnnotation(group_key, group, subset) return {sample_id: value for sample_id in subset[file_type].tolist()} def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE", - annotation_key: str = "ANNOTATION", group_key: str = "DROP_GROUP", + annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP", asSet: bool = True): """ - :param annotation: annotation name as specified in config and ANNOTATION column. Can be None - :param group: a group of the DROP_GROUP column + :param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None + :param group: a group of the DROP_GROUP column. :return: set of unique external count file names """ + #subset for the annotation_key in the annotation group and the group_key in the group - subset = self.subsetSampleAnnotation(annotation_key, annotation, exact_match=exact_match) - subset = self.subsetSampleAnnotation(group_key, group, subset, exact_match=exact_match) + subset = self.subsetSampleAnnotation(annotation_key, annotation) + subset = self.subsetSampleAnnotation(group_key, group, subset) ans = subset[file_type].tolist() if asSet: diff --git a/drop/config/submodules/MonoallelicExpression.py b/drop/config/submodules/MonoallelicExpression.py index 5d87fb96..2a9d35cc 100644 --- a/drop/config/submodules/MonoallelicExpression.py +++ b/drop/config/submodules/MonoallelicExpression.py @@ -34,7 +34,7 @@ def __init__( self.checkConfigSampleannotation() def checkConfigSampleannotation(self): - subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups, exact_match=False) + subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups) if len(self.genomeFiles.keys()) > 1: # more than 1 value in config defined genome dictionary if "GENOME" not in subset.columns.values: # GENOME column not defined @@ -147,16 +147,16 @@ def setGenomeDict(self, genomeFiles): if len(genomeFiles) == 1: # globally defined in the config globalGenome = list(genomeFiles.values())[0] - # subset SA by the drop group (not exact match) and skip the filtering by SA-GENOME column + # subset SA by the drop group and skip the filtering by SA-GENOME column genomeDict = self.sampleAnnotation.getGenomes( globalGenome, self.groups, file_type="RNA_ID", column="DROP_GROUP", group_key="DROP_GROUP", - exact_match=False, skip=True + skip=True ) else: - # subset SA by the drop group (not exact match) and filter by SA-GENOME column. Must exactly match config key + # subset SA by the drop group and filter by SA-GENOME column. Must exactly match config key for gf in genomeFiles.keys(): genomeDict.update( self.sampleAnnotation.getGenomes( @@ -164,7 +164,7 @@ def setGenomeDict(self, genomeFiles): self.groups, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", - exact_match=False, skip=False + skip=False ) ) diff --git a/drop/utils.py b/drop/utils.py index b325f38f..5659804b 100644 --- a/drop/utils.py +++ b/drop/utils.py @@ -65,22 +65,20 @@ def getWBuildSnakefile(str_=True): return returnPath(wb_path / "wBuild.snakefile", str_=str_) -def subsetBy(df, column, values, exact_match=True): +def subsetBy(df, column, values): """ Subset by one or more values of different columns from data frame :param df: data frame :param column: column to subset by :param values: values to subset by - :param exact_match: default True. when False match substrings. Important for subsetting drop groups :return: df subset by values and column """ if values is None: return df - elif isinstance(values, str) and exact_match : - return df[df[column] == values] - elif not isinstance(values,str) and exact_match: - return df[df[column].isin(values)] - elif isinstance(values,str) and not exact_match: - return df[df[column].str.contains(values)] - else: - return df[df[column].str.contains("|".join(values))] + + inner_regex = values + if not isinstance(values, str) : + inner_regex = "(" + "|".join(values) + ")" + + return df[df[column].str.contains("(^|,)" + inner_regex + "(,|$)")] +