gagneurlab · nickhsmith · Apr 22, 2022 · Feb 10, 2021 · Feb 10, 2021 · Feb 10, 2021
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ The manuscript is available in [Nature Protocols](https://www.nature.com/article
 DROP is available on [bioconda](https://anaconda.org/bioconda/drop).
 We recommend using a dedicated conda environment. (installation time: ~ 10min)
 ```
-mamba install -c conda-forge -c bioconda drop
+mamba create -n drop_env -c conda-forge -c bioconda drop
 ```
 
 Test installation with demo project

diff --git a/drop/config/ExportCounts.py b/drop/config/ExportCounts.py
@@ -27,7 +27,7 @@ def __init__(
         self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"]
         self.config_dict = self.setDefaults(dict_, genome.annotation)
         self.outputRoot = outputRoot / "exported_counts"
-        self.sa = sampleAnnotation
+        self.sampleAnnotation = sampleAnnotation
         self.genomeAssembly = genome.assembly
         self.geneAnnotations = self.get("geneAnnotations")
         self.modules = {

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
@@ -9,7 +9,7 @@
 
 
 class SampleAnnotation:
-    FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE"]
+    FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR"]
     SAMPLE_ANNOTATION_COLUMNS = FILE_TYPES + [
         "RNA_ID", "DNA_ID", "DROP_GROUP", "GENE_ANNOTATION",
         "PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND", "GENOME"
@@ -32,6 +32,7 @@ def __init__(self, file, root, genome):
         self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',')
         # external counts
         self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',')
+        self.extSpliceCountIDs = self.createGroupIds(file_type="SPLICE_COUNTS_DIR", sep=',')
 
     def parse(self, sep='\t'):
         """
@@ -80,10 +81,12 @@ def createSampleFileMapping(self):
             columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ]
         """
 
-        assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']}
+        assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE', 'SPLICE_COUNTS_DIR'], 'DNA_ID': ['DNA_VCF_FILE']}
         assay_subsets = []
         for id_, file_types in assay_mapping.items():
             for file_type in file_types:
+                if file_type not in self.annotationTable.columns:
+                    continue
                 df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy()
                 df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True)
                 df['ASSAY'] = id_
@@ -142,21 +145,20 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
         groups = set(groups)
 
         # collect IDs per group
-        grouped = {gr: df[df[group_key].str.contains(f'(^|{sep}){gr}({sep}|$)')][assay_id].tolist()
+        grouped = {gr: df[df[group_key].str.contains(f'(?:^|{sep}){gr}(?:{sep}|$)')][assay_id].tolist()
                    for gr in groups}
         # remove groups labeled as None
         grouped = {gr: list(set(ids)) for gr, ids in grouped.items() if gr is not None}
         return grouped
 
     ### Subsetting
 
-    def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
+    def subsetSampleAnnotation(self, column, values, subset=None):
         """
         subset by one or more values of different columns from sample file mapping
             :param column: valid column in sample annotation
             :param values: values of column to subset
             :param subset: subset sample annotation
-            :param exact_match: whether to match substrings in the sample annotation, false allows substring matching
         """
         sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS)
         if subset is None:
@@ -171,7 +173,7 @@ def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
         # check if column is valid
         if column not in sa_cols:
             raise KeyError(f"Column '{column}' not present in sample annotation.")
-        return utils.subsetBy(subset, column, values, exact_match=exact_match)
+        return utils.subsetBy(subset, column, values)
 
     def subsetFileMapping(self, file_type=None, sample_id=None):
         """
@@ -230,10 +232,9 @@ def getFilePaths(self, file_type, group=None):
         return self.getFilePath(sampleIDs, file_type, single_file=False)
 
     # build a dictionary from the drop group and column. like getImportCounts with skipping options and dict output
-    def getGenomes(self, value, group, file_type="RNA_ID",
-                            column="GENOME", group_key="DROP_GROUP",exact_match = True,skip = False):
+    def getGenomes(self, value, group, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", skip = False):
         """
-        :param value: values to match in the column. Must be an exact match, passed to subsetting sample annotation 
+        :param value: values to match in the column.
         :param group: a group of the group_key (DROP_GROUP) column. 
         :return: dict file_type to column
         """
@@ -242,24 +243,30 @@ def getGenomes(self, value, group, file_type="RNA_ID",
         if skip:
             subset = None
         else:
-            subset = self.subsetSampleAnnotation(column, value,exact_match=True)
+            subset = self.subsetSampleAnnotation(column, value)
 
         # additionally subset for the group_key and the group
-        subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match)
+        subset = self.subsetSampleAnnotation(group_key, group, subset)
 
         return {sample_id: value for sample_id in subset[file_type].tolist()}
 
-    def getImportCountFiles(self, annotation, group, file_type="GENE_COUNTS_FILE",
-                            annotation_key="GENE_ANNOTATION", group_key="DROP_GROUP",exact_match = True):
+    def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE",
+                            annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP", 
+                            asSet: bool = True):
         """
-        :param annotation: annotation name as specified in config and GENE_ANNOTATION column
-        :param group: a group of the DROP_GROUP column. exact match is passed to subsetter, false allows for substring matching
+        :param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None
+        :param group: a group of the DROP_GROUP column.
         :return: set of unique external count file names
         """
+
         #subset for the annotation_key in the annotation group and the group_key in the group
-        subset = self.subsetSampleAnnotation(annotation_key, annotation,exact_match=exact_match)
-        subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=False)
-        return set(subset[file_type].tolist())
+        subset = self.subsetSampleAnnotation(annotation_key, annotation)
+        subset = self.subsetSampleAnnotation(group_key, group, subset)
+
+        ans = subset[file_type].tolist()
+        if asSet:
+            ans = set(ans)
+        return ans
 
     def getRow(self, column, value):
         sa = self.annotationTable
@@ -277,15 +284,18 @@ def getGroupedIDs(self, assays):
         Get group to IDs mapping
         :param assays: list of or single assay the IDs should be from. Can be file_type or 'RNA'/'DNA'
         """
-        assays = [assays] if isinstance(assays, str) else assays
+        if isinstance(assays, str):
+            assays = [assays]
         groupedIDs = defaultdict(list)
         for assay in assays:
             if "RNA" in assay:
-                groupedIDs.update(self.rnaIDs)
+                utils.deep_merge_dict(groupedIDs, self.rnaIDs, inplace=True)
             elif "DNA" in assay:
-                groupedIDs.update(self.dnaIDs)
+                utils.deep_merge_dict(groupedIDs, self.dnaIDs, inplace=True)
             elif "GENE_COUNT" in assay:
-                groupedIDs.update(self.extGeneCountIDs)
+                utils.deep_merge_dict(groupedIDs, self.extGeneCountIDs, inplace=True)
+            elif "SPLICE_COUNT" in assay:
+                utils.deep_merge_dict(groupedIDs, self.extSpliceCountIDs, inplace=True)
             else:
                 raise ValueError(f"'{assay}' is not a valid assay name")
         return groupedIDs

diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py
@@ -25,7 +25,7 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD
                 please fix to only have either external count or BAM processing\n")
 
         # check number of IDs per group
-        all_ids = {g: self.rnaIDs[g] + self.extRnaIDs[g] for g in self.groups}
+        all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "GENE_COUNTS"])
         self.checkSubset(all_ids)
 
     def setDefaultKeys(self, dict_):

diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
@@ -1,3 +1,6 @@
+import numpy as np
+import pandas as pd
+
 from snakemake.io import expand
 
 from drop import utils
@@ -16,8 +19,16 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD
         # if self.run is false return without doing any config/sa checks for completeness
         if not self.run:
             return
-        self.rnaIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA")
-        self.checkSubset(self.rnaIDs)
+
+        self.rnaIDs   = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA")
+        self.rnaExIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="SPLICE_COUNT")
+        for g in self.groups:  # a sample must not have both a BAM file and external splice counts
+            if len(set(self.rnaIDs[g]) & set(self.rnaExIDs[g])) > 0:
+                raise ValueError(f"{set(self.rnaIDs[g]) & set(self.rnaExIDs[g])} has both BAM and external count file \
+                please fix in sample annotation table to only have either external count or BAM processing\n")
+
+        all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "SPLICE_COUNT"])
+        self.checkSubset(all_ids)
 
     def setDefaultKeys(self, dict_):
         super().setDefaultKeys(dict_)
@@ -44,7 +55,7 @@ def getSplitCountFiles(self, dataset):
         :return: list of files
         """
         ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA")
-        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \
+        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "fromBam" / "cache" / f"raw-{dataset}" / \
                      "sample_tmp" / "splitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
@@ -56,7 +67,25 @@ def getNonSplitCountFiles(self, dataset):
         :return: list of files
         """
         ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA")
-        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \
+        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "fromBam" / "cache" / f"raw-{dataset}" / \
                      "sample_tmp" / "nonSplitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
+
+
+    def getExternalCounts(self, group: str, fileType: str = "k_j_counts"):
+        """
+        Get the externally provided splice count directories (SPLICE_COUNTS_DIR) of the given group.
+        If a file type is given, the path of that file inside each directory is returned instead.
+        :param group: DROP group name from wildcard
+        :param fileType: file name without the ".tsv.gz" extension; None returns the unfiltered directory entries
+        :return: list of directories or files
+        """
+        ids = self.sampleAnnotation.getIDsByGroup(group, assay="SPLICE_COUNT")  # NOTE(review): unused; kept in case the call validates the group - confirm
+        extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation=None, group=group,
+                file_type="SPLICE_COUNTS_DIR", asSet=False)
+        if fileType is not None:
+            # skip entries without an external count directory (NaN) and point at the requested file
+            extCountFiles = [f"{x}/{fileType}.tsv.gz" for x in extCountFiles if not pd.isna(x)]
+        return extCountFiles
+
diff --git a/drop/config/submodules/MonoallelicExpression.py b/drop/config/submodules/MonoallelicExpression.py
@@ -51,7 +51,7 @@ def setDefaultKeys(self, dict_):
         return dict_
 
     def checkConfigSampleannotation(self):
-        subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups, exact_match=False)
+        subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups)
 
         if len(self.genomeFiles.keys()) > 1:  # more than 1 value in config defined genome dictionary
             if "GENOME" not in subset.columns.values:  # GENOME column not defined
@@ -149,24 +149,24 @@ def setGenomeDict(self, genomeFiles):
         if len(genomeFiles) == 1:  # globally defined in the config
             globalGenome = list(genomeFiles.values())[0]
 
-            # subset SA by the drop group (not exact match) and skip the filtering by SA-GENOME column
+            # subset SA by the drop group and skip the filtering by SA-GENOME column
             genomeDict = self.sampleAnnotation.getGenomes(
                 globalGenome,
                 self.groups,
                 file_type="RNA_ID",
                 column="DROP_GROUP", group_key="DROP_GROUP",
-                exact_match=False, skip=True
+                skip=True
             )
         else:
-            # subset SA by the drop group (not exact match) and filter by SA-GENOME column. Must exactly match config key
+            # subset SA by the drop group and filter by SA-GENOME column. Must exactly match config key
             for gf in genomeFiles.keys():
                 genomeDict.update(
                     self.sampleAnnotation.getGenomes(
                         gf,
                         self.groups,
                         file_type="RNA_ID",
                         column="GENOME", group_key="DROP_GROUP",
-                        exact_match=False, skip=False
+                        skip=False
                     )
                 )
 

diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml
@@ -16,13 +16,14 @@ exportCounts:
     - v29
   excludeGroups:
     - mae
-    - import_exp
+    - outrider_external
+    - fraser_external
 
 aberrantExpression:
     run: true
     groups:
       - outrider
-      - import_exp
+      - outrider_external
     fpkmCutoff: 1
     implementation: autoencoder
     padjCutoff: 1
@@ -36,6 +37,7 @@ aberrantSplicing:
     run: true
     groups:
       - fraser
+      - fraser_external
     recount: true
     longRead: false
     keepNonStandardChrs: true