Skip to content

Commit

Permalink
use only exact matching in subsetBy related to #244
Browse files Browse the repository at this point in the history
  • Loading branch information
c-mertes committed Aug 11, 2021
1 parent d353a56 commit 2712658
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 28 deletions.
25 changes: 12 additions & 13 deletions drop/config/SampleAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,12 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):

### Subsetting

def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
def subsetSampleAnnotation(self, column, values, subset=None):
"""
subset by one or more values of different columns from sample file mapping
:param column: valid column in sample annotation
:param values: values of column to subset
:param subset: subset sample annotation
:param exact_match: whether to match substrings in the sample annotation, false allows substring matching
"""
sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS)
if subset is None:
Expand All @@ -174,7 +173,7 @@ def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
# check if column is valid
if column not in sa_cols:
raise KeyError(f"Column '{column}' not present in sample annotation.")
return utils.subsetBy(subset, column, values, exact_match=exact_match)
return utils.subsetBy(subset, column, values)

def subsetFileMapping(self, file_type=None, sample_id=None):
"""
Expand Down Expand Up @@ -233,10 +232,9 @@ def getFilePaths(self, file_type, group=None):
return self.getFilePath(sampleIDs, file_type, single_file=False)

# Build a dictionary keyed by the file_type IDs of a drop group, each mapped to the given value. Similar to getImportCountFiles, but with an option to skip the column filter and with dict output.
def getGenomes(self, value, group, file_type="RNA_ID",
column="GENOME", group_key="DROP_GROUP",exact_match = True,skip = False):
def getGenomes(self, value, group, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", skip = False):
"""
:param value: values to match in the column. Must be an exact match, passed to subsetting sample annotation
:param value: values to match in the column.
:param group: a group of the group_key (DROP_GROUP) column.
:return: dict file_type to column
"""
Expand All @@ -245,24 +243,25 @@ def getGenomes(self, value, group, file_type="RNA_ID",
if skip:
subset = None
else:
subset = self.subsetSampleAnnotation(column, value,exact_match=True)
subset = self.subsetSampleAnnotation(column, value)

# additionally subset for the group_key and the group
subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match)
subset = self.subsetSampleAnnotation(group_key, group, subset)

return {sample_id: value for sample_id in subset[file_type].tolist()}

def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE",
annotation_key: str = "ANNOTATION", group_key: str = "DROP_GROUP",
annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP",
asSet: bool = True):
"""
:param annotation: annotation name as specified in config and ANNOTATION column. Can be None
:param group: a group of the DROP_GROUP column
:param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None
:param group: a group of the DROP_GROUP column.
:return: set of unique external count file names
"""

#subset for the annotation_key in the annotation group and the group_key in the group
subset = self.subsetSampleAnnotation(annotation_key, annotation, exact_match=exact_match)
subset = self.subsetSampleAnnotation(group_key, group, subset, exact_match=exact_match)
subset = self.subsetSampleAnnotation(annotation_key, annotation)
subset = self.subsetSampleAnnotation(group_key, group, subset)

ans = subset[file_type].tolist()
if asSet:
Expand Down
10 changes: 5 additions & 5 deletions drop/config/submodules/MonoallelicExpression.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(
self.checkConfigSampleannotation()

def checkConfigSampleannotation(self):
subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups, exact_match=False)
subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups)

if len(self.genomeFiles.keys()) > 1: # more than 1 value in config defined genome dictionary
if "GENOME" not in subset.columns.values: # GENOME column not defined
Expand Down Expand Up @@ -147,24 +147,24 @@ def setGenomeDict(self, genomeFiles):
if len(genomeFiles) == 1: # globally defined in the config
globalGenome = list(genomeFiles.values())[0]

# subset SA by the drop group (not exact match) and skip the filtering by SA-GENOME column
# subset SA by the drop group and skip the filtering by SA-GENOME column
genomeDict = self.sampleAnnotation.getGenomes(
globalGenome,
self.groups,
file_type="RNA_ID",
column="DROP_GROUP", group_key="DROP_GROUP",
exact_match=False, skip=True
skip=True
)
else:
# subset SA by the drop group (not exact match) and filter by SA-GENOME column. Must exactly match config key
# subset SA by the drop group and filter by SA-GENOME column. Must exactly match config key
for gf in genomeFiles.keys():
genomeDict.update(
self.sampleAnnotation.getGenomes(
gf,
self.groups,
file_type="RNA_ID",
column="GENOME", group_key="DROP_GROUP",
exact_match=False, skip=False
skip=False
)
)

Expand Down
18 changes: 8 additions & 10 deletions drop/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,22 +65,20 @@ def getWBuildSnakefile(str_=True):
return returnPath(wb_path / "wBuild.snakefile", str_=str_)


def subsetBy(df, column, values):
    """
    Subset a data frame to the rows whose (comma-separated) column entry
    contains at least one of the requested values as an exact item.

    A cell such as ``"group1,group2"`` matches the value ``"group1"`` but not
    ``"group"`` or ``"group11"``: a value only matches when it is delimited by
    the start/end of the cell or by a comma.

    :param df: data frame
    :param column: column to subset by
    :param values: a single string, an iterable of values, or None
    :return: df itself if values is None, otherwise the subset of df whose
        column contains one of the values; an empty subset if values is empty
    """
    import re  # local import so the function does not depend on file-level imports

    if values is None:
        return df

    if isinstance(values, str):
        values = [values]

    # Escape every value so regex metacharacters in a name (".", "+", "|", ...)
    # are matched literally and cannot widen or break the match.
    escaped = [re.escape(str(value)) for value in values]
    if not escaped:
        # An empty alternation would match empty cells; nothing was requested,
        # so nothing is selected.
        return df.iloc[0:0]

    # Non-capturing groups avoid the pandas "pattern has match groups" warning.
    pattern = r"(?:^|,)(?:" + "|".join(escaped) + r")(?:,|$)"

    # na=False: missing cells never match, and the mask stays purely boolean
    # (a NaN inside the mask would make the boolean indexing below fail).
    mask = df[column].str.contains(pattern, regex=True, na=False)
    return df[mask]

0 comments on commit 2712658

Please sign in to comment.