From 4b6b0bf09630612dca42429e422faabc9d2df0ea Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Wed, 10 Feb 2021 00:00:04 +0000
Subject: [PATCH 01/65] initial merge of external splicing counts for FRASER

---
 drop/config/SampleAnnotation.py               | 31 +++++++++++++------
 drop/config/submodules/AberrantSplicing.py    | 21 +++++++++++++
 drop/demo/config_relative.yaml                |  2 ++
 drop/demo/sample_annotation_relative.tsv      | 22 +++++++------
 drop/download_data.sh                         |  2 +-
 .../Counting/00_define_datasets_from_anno.R   |  5 +--
 .../Counting/03_filter_expression_FraseR.R    | 26 +++++++++++++---
 drop/requirementsR.txt                        |  2 +-
 tests/config/test_AE.py                       |  2 +-
 tests/config/test_AS.py                       |  2 +-
 tests/config/test_SampleAnnotation.py         | 10 +++---
 11 files changed, 90 insertions(+), 35 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 0bdfc7d2..d965d2af 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -9,7 +9,7 @@
 
 
 class SampleAnnotation:
-    FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE"]
+    FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR"]
     SAMPLE_ANNOTATION_COLUMNS = FILE_TYPES + [
         "RNA_ID", "DNA_ID", "DROP_GROUP", "GENE_ANNOTATION",
         "PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND", "GENOME"
@@ -32,6 +32,7 @@ def __init__(self, file, root, genome):
         self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',')
         # external counts
         self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',')
+        self.extSpliceCountIDs = self.createGroupIds(file_type="SPLICE_COUNTS_DIR", sep=',')
 
     def parse(self, sep='\t'):
         """
@@ -80,10 +81,12 @@ def createSampleFileMapping(self):
             columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ]
         """
 
-        assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']}
+        assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE', 'SPLICE_COUNTS_DIR'], 'DNA_ID': ['DNA_VCF_FILE']}
         assay_subsets = []
         for id_, file_types in assay_mapping.items():
             for file_type in file_types:
+                if file_type not in self.sa.columns:
+                    continue
                 df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy()
                 df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True)
                 df['ASSAY'] = id_
@@ -249,17 +252,22 @@ def getGenomes(self, value, group, file_type="RNA_ID",
 
         return {sample_id: value for sample_id in subset[file_type].tolist()}
 
-    def getImportCountFiles(self, annotation, group, file_type="GENE_COUNTS_FILE",
-                            annotation_key="GENE_ANNOTATION", group_key="DROP_GROUP",exact_match = True):
+    def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE",
+                            annotation_key: str = "ANNOTATION", group_key: str = "DROP_GROUP", 
+                            asSet: bool = True):
         """
-        :param annotation: annotation name as specified in config and GENE_ANNOTATION column
-        :param group: a group of the DROP_GROUP column. exact match is passed to subsetter, false allows for substring matching
+        :param annotation: annotation name as specified in config and ANNOTATION column. Can be None
+        :param group: a group of the DROP_GROUP column
         :return: set of unique external count file names
         """
         #subset for the annotation_key in the annotation group and the group_key in the group
-        subset = self.subsetSampleAnnotation(annotation_key, annotation,exact_match=exact_match)
-        subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match)
-        return set(subset[file_type].tolist())
+        subset = self.subsetSampleAnnotation(annotation_key, annotation, exact_match=exact_match)
+        subset = self.subsetSampleAnnotation(group_key, group, subset, exact_match=exact_match)
+            
+        ans = subset[file_type].tolist()
+        if asSet:
+            ans = set(ans)
+        return ans
 
     def getRow(self, column, value):
         sa = self.annotationTable
@@ -277,7 +285,8 @@ def getGroupedIDs(self, assays):
         Get group to IDs mapping
         :param assays: list of or single assay the IDs should be from. Can be file_type or 'RNA'/'DNA'
         """
-        assays = [assays] if isinstance(assays, str) else assays
+        if isinstance(assays, str):
+            assays = [assays]
         groupedIDs = defaultdict(list)
         for assay in assays:
             if "RNA" in assay:
@@ -286,6 +295,8 @@ def getGroupedIDs(self, assays):
                 groupedIDs.update(self.dnaIDs)
             elif "GENE_COUNT" in assay:
                 groupedIDs.update(self.extGeneCountIDs)
+            elif "SPLICE_COUNT" in assay:
+                groupedIDs.update(self.extSpliceCountIDs)
             else:
                 raise ValueError(f"'{assay}' is not a valid assay name")
         return groupedIDs
diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index 4d3f3651..3cdd46d1 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -1,3 +1,6 @@
+import numpy as np
+import pandas as pd
+
 from snakemake.io import expand
 
 from drop import utils
@@ -59,3 +62,21 @@ def getNonSplitCountFiles(self, dataset):
                      "sample_tmp" / "nonSplitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
+
+
+    def getExternalCounts(self, group: str, fileType: str = "k_j_counts"):
+        """
+        Get externally provided splice count data dir based on the given group.
+        If a file type is given the corresponding file within the folder is returned. 
+        :param group: DROP group name from wildcard
+        :param fileType: name of the file without extension which is to be returned
+        :return: list of directories or files
+        """
+        ids = self.sa.getIDsByGroup(group, assay="SPLICE_COUNT")
+        extCountFiles = self.sa.getImportCountFiles(annotation=None, group=group, 
+                file_type="SPLICE_COUNTS_DIR", asSet=False)
+        if fileType is not None:
+            extCountFiles = np.asarray(extCountFiles)[pd.isna(extCountFiles) == False].tolist()
+            extCountFiles = [x + "/" + fileType + ".tsv.gz" for x in extCountFiles]
+        return extCountFiles
+    
diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml
index f79852d0..ca9cc6c9 100755
--- a/drop/demo/config_relative.yaml
+++ b/drop/demo/config_relative.yaml
@@ -17,6 +17,7 @@ exportCounts:
   excludeGroups:
     - mae
     - import_exp
+    - fraser_ex
 
 aberrantExpression:
     run: true
@@ -36,6 +37,7 @@ aberrantSplicing:
     run: true
     groups:
       - fraser
+      - fraser_ex
     recount: true
     longRead: false
     keepNonStandardChrs: true
diff --git a/drop/demo/sample_annotation_relative.tsv b/drop/demo/sample_annotation_relative.tsv
index 0da07a27..f87a4f8d 100755
--- a/drop/demo/sample_annotation_relative.tsv
+++ b/drop/demo/sample_annotation_relative.tsv
@@ -1,4 +1,4 @@
-RNA_ID	RNA_BAM_FILE	DNA_VCF_FILE	DNA_ID	DROP_GROUP	PAIRED_END	COUNT_MODE	COUNT_OVERLAPS	STRAND	HPO_TERMS	GENE_COUNTS_FILE	GENE_ANNOTATION	GENOME
+RNA_ID	RNA_BAM_FILE	DNA_VCF_FILE	DNA_ID	DROP_GROUP	PAIRED_END	COUNT_MODE	COUNT_OVERLAPS	STRAND	HPO_TERMS	GENE_COUNTS_FILE	GENE_ANNOTATION	GENOME	SPLICE_COUNTS_DIR
 HG00096.1.M_111124_6	Data/rna_bam/HG00096.1.M_111124_6_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00096	outrider,mae	TRUE	IntersectionStrict	TRUE	no	HP:0009802,HP:0010896
 HG00103.4.M_120208_3	Data/rna_bam/HG00103.4.M_120208_3_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00103	outrider,mae	TRUE	IntersectionStrict	TRUE	no	HP:0004582,HP:0031959
 HG00106.4.M_120208_5	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
@@ -7,17 +7,19 @@ HG00116.2.M_120131_1	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam	Data/dna_vcf/de
 HG00126.1.M_111124_8	Data/rna_bam/HG00126.1.M_111124_8_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00126	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0000290,HP:0000293
 HG00132.2.M_111215_4	Data/rna_bam/HG00132.2.M_111215_4_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00132	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0006489,HP:0006490
 HG00149.1.M_111124_6	Data/rna_bam/HG00149.1.M_111124_6_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00149	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0000014,HP:0000020,HP:0032663
-HG00150.4.M_120208_7	Data/rna_bam/HG00150.4.M_120208_7_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00150	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0030809,HP:0006144
-HG00176.4.M_120208_2	Data/rna_bam/HG00176.4.M_120208_2_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00176	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0005215,HP:0010234
-HG00096.1.M_111124_6_trunc	Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00096	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0009802,HP:0010896
-HG00103.4.M_120208_3_trunc	Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00103	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0004582,HP:0031959
-HG00106.4.M_120208_5_trunc	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
-HG00111.2.M_111215_4_trunc	Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00111	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0100491,HP:0100871
-HG00116.2.M_120131_1_trunc	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00116	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0030613,HP:0012767
+HG00150.4.M_120208_7	Data/rna_bam/HG00150.4.M_120208_7_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00150	fraser_ex,outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0030809,HP:0006144
+HG00176.4.M_120208_2	Data/rna_bam/HG00176.4.M_120208_2_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00176	fraser_ex,outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0005215,HP:0010234
+HG00096.1.M_111124_6_trunc	Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00096	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0009802,HP:0010896
+HG00103.4.M_120208_3_trunc	Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00103	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0004582,HP:0031959
+HG00106.4.M_120208_5_trunc	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
+HG00111.2.M_111215_4_trunc	Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00111	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0100491,HP:0100871
+HG00116.2.M_120131_1_trunc	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00116	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0030613,HP:0012767
 HG00126.1.M_111124_8_trunc	Data/rna_bam/HG00126.1.M_111124_8_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00126	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0000290,HP:0000293
 HG00132.2.M_111215_4_trunc	Data/rna_bam/HG00132.2.M_111215_4_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00132	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0006489,HP:0006490
 HG00149.1.M_111124_6_trunc	Data/rna_bam/HG00149.1.M_111124_6_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00149	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0000014,HP:0000020,HP:0032663
 HG00150.4.M_120208_7_trunc	Data/rna_bam/HG00150.4.M_120208_7_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00150	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0030809,HP:0006144
 HG00176.4.M_120208_2_trunc	Data/rna_bam/HG00176.4.M_120208_2_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00176	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0005215,HP:0010234
-HG00178.4.M_120208_8				import_exp						Data/external_geneCounts.tsv.gz	v29
-HG00181.4.M_120208_4				import_exp						Data/external_geneCounts.tsv.gz	v29
+HG00178.4.M_120208_8				import_exp						Data/external_count_data/geneCounts.tsv.gz	v29
+HG00181.4.M_120208_4				fraser_ex,import_exp						Data/external_count_data/geneCounts.tsv.gz	v29		Data/external_count_data
+HG00191.3.M_120208_3				fraser_ex									Data/external_count_data
+HG00201.1.M_120208_6				fraser_ex									Data/external_count_data
diff --git a/drop/download_data.sh b/drop/download_data.sh
index ba2ac064..b84bf07e 100644
--- a/drop/download_data.sh
+++ b/drop/download_data.sh
@@ -2,7 +2,7 @@
 set -e
 
 # get data
-resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource.tar.gz"
+resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource_splice_merge.tar.gz"
 tmpdir="$(dirname "$(mktemp)")"
 wget -nc -P $tmpdir $resource_url
 mkdir -p Data
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R
index 72e17db9..499e6a9a 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R
@@ -38,9 +38,10 @@ mapping <- fread(fileMapFile)
 
 subset_ids <- snakemake@params$ids
 annoSub <- anno[RNA_ID %in% subset_ids]
-colData <- merge(
-    annoSub[,.(sampleID = RNA_ID, STRAND, PAIRED_END)],
+setnames(annoSub, "RNA_ID", "sampleID")
+colData <- merge(annoSub,
     mapping[FILE_TYPE == "RNA_BAM_FILE", .(sampleID=ID, bamFile=FILE_PATH)])
+setcolorder(colData, unique(c("sampleID", "STRAND", "PAIRED_END", "bamFile", colnames(annoSub))))
 
 #'
 #' ## Dataset: `r name`
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index 731300ae..b40caf89 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -7,9 +7,11 @@
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - exCountIDs: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="SPLICE_COUNT")`'
 #'  input:
 #'   - theta:  '`sm cfg.getProcessedDataDir()+
 #'                  "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
+#'   - exCounts: '`sm lambda w: cfg.AS.getExternalCounts(w.dataset, "k_j_counts")`'
 #'  output:
 #'   - fds: '`sm cfg.getProcessedDataDir() +
 #'                "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`'
@@ -27,7 +29,12 @@ opts_chunk$set(fig.width=12, fig.height=8)
 # input
 dataset    <- snakemake@wildcards$dataset
 workingDir <- snakemake@params$workingDir
-params <- snakemake@config$aberrantSplicing
+params     <- snakemake@config$aberrantSplicing
+exCountIDs <- snakemake@params$exCountIDs
+exCountFiles <- snakemake@input$exCounts
+sample_anno_file <- snakemake@config$sampleAnnotation
+minExpressionInOneSample <- params$minExpressionInOneSample
+minDeltaPsi <- params$minDeltaPsi
 
 fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
 
@@ -35,10 +42,21 @@ register(MulticoreParam(snakemake@threads))
 # Limit number of threads for DelayedArray operations
 setAutoBPPARAM(MulticoreParam(snakemake@threads))
 
-# Apply filter
-minExpressionInOneSample <- params$minExpressionInOneSample
-minDeltaPsi <- params$minDeltaPsi
+# Add external data if provided by dataset
+if(length(exCountIDs) > 0){
+    for(resource in unique(exCountFiles)){
+        exSampleIDs <- exCountIDs[exCountFiles == resource]
+        exAnno <- fread(sample_anno_file, key="RNA_ID")[J(exSampleIDs)]
+        setnames(exAnno, "RNA_ID", "sampleID")
+        
+        ctsNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta")
+        ctsFiles <- paste0(dirname(resource), "/", ctsNames, "_counts.tsv.gz")
+        fds <- mergeExternalData(fds=fds, countFiles=ctsFiles,
+                sampleIDs=exSampleIDs, annotation=exAnno)
+    }
+}
 
+# Apply filter
 fds <- filterExpressionAndVariability(fds, 
                         minExpressionInOneSample = minExpressionInOneSample,
                         minDeltaPsi = minDeltaPsi,
diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt
index 02151c2d..b6fb8718 100644
--- a/drop/requirementsR.txt
+++ b/drop/requirementsR.txt
@@ -1,6 +1,6 @@
 package	version
 gagneurlab/OUTRIDER	1.6.1
-c-mertes/FRASER	1.2.1
+c-mertes/FRASER	1.2.2
 mumichae/tMAE	1.0.0
 VariantAnnotation	
 rmarkdown	
diff --git a/tests/config/test_AE.py b/tests/config/test_AE.py
index fa1e1023..8643130d 100644
--- a/tests/config/test_AE.py
+++ b/tests/config/test_AE.py
@@ -35,7 +35,7 @@ def test_getCountsFiles(self, demo_dir, dropConfig):
 
         # import count
         counts_files_true = counts_files_true[2:]
-        counts_files_true.append(f"{demo_dir}/Data/external_geneCounts.tsv.gz")
+        counts_files_true.append(f"{demo_dir}/Data/external_count_data/geneCounts.tsv.gz")
         counts_files_true.sort()
         counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="import_exp")
         counts_files_test.sort()
diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py
index df6b527d..c3e3dc38 100644
--- a/tests/config/test_AS.py
+++ b/tests/config/test_AS.py
@@ -3,7 +3,7 @@ class Test_AS_Config:
     def test_config(self, dropConfig,demo_dir):
         assert dropConfig.AS.getWorkdir() == f"{demo_dir}/Scripts/AberrantSplicing/pipeline"
         dict_ = {
-            'groups': ['fraser'],
+            'groups': ['fraser', 'fraser_ex'],
             'recount': True,
             'longRead': False,
             'keepNonStandardChrs': True,
diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py
index ba93966f..6ddaf610 100644
--- a/tests/config/test_SampleAnnotation.py
+++ b/tests/config/test_SampleAnnotation.py
@@ -13,9 +13,9 @@ def test_columns(self, sampleAnnotation):
 
     def test_mapping(self, sampleAnnotation):
         # ID mappings/groups
-        assert sampleAnnotation.idMapping.shape == (22, 2)
-        assert sampleAnnotation.sampleFileMapping.shape == (32, 4)
-        true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10}
+        assert sampleAnnotation.idMapping.shape == (24, 2)
+        assert sampleAnnotation.sampleFileMapping.shape == (35, 4)
+        true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10, 'fraser_ex': 10}
         assert true_mapping == {k: len(v) for k, v in sampleAnnotation.rnaIDs.items()}
         assert true_mapping == {k: len(v) for k, v in sampleAnnotation.dnaIDs.items()}
 
@@ -23,7 +23,7 @@ def test_mapping(self, sampleAnnotation):
         "sample_id,file_type,file_name",
         [
             ("HG00096.1.M_111124_6", "RNA_BAM_FILE", "Data/rna_bam/HG00096.1.M_111124_6_chr21.bam"),
-            ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_geneCounts.tsv.gz"),
+            ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_count_data/geneCounts.tsv.gz"),
             ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz")
         ]
     )
@@ -35,7 +35,7 @@ def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_
     @pytest.mark.parametrize(
         "annotation,group,files",
         [
-            ("v29", "import_exp", {'Data/external_geneCounts.tsv.gz'})
+            ("v29", "import_exp", {'Data/external_count_data/geneCounts.tsv.gz'})
         ]
     )
     def test_import(self, demo_dir, sampleAnnotation, annotation, group, files):

From e0e58443a65480b81ec6063c6041cbde8b8e13d0 Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Wed, 10 Feb 2021 07:58:23 +0000
Subject: [PATCH 02/65] fix download and add more test cases

---
 drop/download_data.sh                 | 2 +-
 tests/config/test_SampleAnnotation.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drop/download_data.sh b/drop/download_data.sh
index b84bf07e..0b4748e9 100644
--- a/drop/download_data.sh
+++ b/drop/download_data.sh
@@ -4,7 +4,7 @@ set -e
 # get data
 resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource_splice_merge.tar.gz"
 tmpdir="$(dirname "$(mktemp)")"
-wget -nc -P $tmpdir $resource_url
+wget -nc -O $tmpdir/resource.tar.gz $resource_url
 mkdir -p Data
 if [ -z "$(ls Data)" ]; then
 	tar -zxvf "$tmpdir/resource.tar.gz" -C .
diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py
index 6ddaf610..6fdcbf5e 100644
--- a/tests/config/test_SampleAnnotation.py
+++ b/tests/config/test_SampleAnnotation.py
@@ -24,7 +24,8 @@ def test_mapping(self, sampleAnnotation):
         [
             ("HG00096.1.M_111124_6", "RNA_BAM_FILE", "Data/rna_bam/HG00096.1.M_111124_6_chr21.bam"),
             ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_count_data/geneCounts.tsv.gz"),
-            ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz")
+            ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz"),
+            ("HG00201.1.M_120208_6", "SPLICE_COUNT_DIR" "Data/external_count_data")
         ]
     )
     def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_name):

From 7e597644d04cbc4316454dc470ccfa9f016de74d Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Wed, 10 Feb 2021 08:42:08 +0000
Subject: [PATCH 03/65] fix test

---
 tests/config/test_SampleAnnotation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py
index 6fdcbf5e..f1e1d21e 100644
--- a/tests/config/test_SampleAnnotation.py
+++ b/tests/config/test_SampleAnnotation.py
@@ -25,7 +25,7 @@ def test_mapping(self, sampleAnnotation):
             ("HG00096.1.M_111124_6", "RNA_BAM_FILE", "Data/rna_bam/HG00096.1.M_111124_6_chr21.bam"),
             ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_count_data/geneCounts.tsv.gz"),
             ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz"),
-            ("HG00201.1.M_120208_6", "SPLICE_COUNT_DIR" "Data/external_count_data")
+            ("HG00201.1.M_120208_6", "SPLICE_COUNTS_DIR", "Data/external_count_data")
         ]
     )
     def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_name):

From 434135dd303c188b4cc8d45fdf2bfb0ff708546e Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Wed, 10 Feb 2021 16:43:24 +0000
Subject: [PATCH 04/65] fix wget download and heatmap plotting

---
 drop/download_data.sh                                    | 2 +-
 drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drop/download_data.sh b/drop/download_data.sh
index 0b4748e9..c2600003 100644
--- a/drop/download_data.sh
+++ b/drop/download_data.sh
@@ -4,7 +4,7 @@ set -e
 # get data
 resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource_splice_merge.tar.gz"
 tmpdir="$(dirname "$(mktemp)")"
-wget -nc -O $tmpdir/resource.tar.gz $resource_url
+wget -c -O $tmpdir/resource.tar.gz $resource_url
 mkdir -p Data
 if [ -z "$(ls Data)" ]; then
 	tar -zxvf "$tmpdir/resource.tar.gz" -C .
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
index a24168a5..d8b549c9 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
@@ -62,7 +62,7 @@ topN <- 30000
 topJ <- 10000
 for(type in psiTypes){
   before <- plotCountCorHeatmap(
-    fds,
+    object=fds,
     type = type,
     logit = TRUE,
     topN = topN,
@@ -78,7 +78,7 @@ for(type in psiTypes){
   )
   before
   after <- plotCountCorHeatmap(
-    fds,
+    object=fds,
     type = type,
     logit = TRUE,
     topN = topN,

From d353a566b01df6555a114bf1125191e9129439fd Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Wed, 11 Aug 2021 14:19:15 +0200
Subject: [PATCH 05/65] adapt to new naming of sampleannotation

---
 drop/config/ExportCounts.py                | 2 +-
 drop/config/SampleAnnotation.py            | 2 +-
 drop/config/submodules/AberrantSplicing.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drop/config/ExportCounts.py b/drop/config/ExportCounts.py
index 5044d753..f02e7023 100644
--- a/drop/config/ExportCounts.py
+++ b/drop/config/ExportCounts.py
@@ -27,7 +27,7 @@ def __init__(
         self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"]
         self.config_dict = self.setDefaults(dict_, genome.annotation)
         self.outputRoot = outputRoot / "exported_counts"
-        self.sa = sampleAnnotation
+        self.sampleAnnotation = sampleAnnotation
         self.genomeAssembly = genome.assembly
         self.geneAnnotations = self.get("geneAnnotations")
         self.modules = {
diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index d965d2af..63ccc280 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -85,7 +85,7 @@ def createSampleFileMapping(self):
         assay_subsets = []
         for id_, file_types in assay_mapping.items():
             for file_type in file_types:
-                if file_type not in self.sa.columns:
+                if file_type not in self.annotationTable.columns:
                     continue
                 df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy()
                 df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True)
diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index 3cdd46d1..f81df573 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -72,8 +72,8 @@ def getExternalCounts(self, group: str, fileType: str = "k_j_counts"):
         :param fileType: name of the file without extension which is to be returned
         :return: list of directories or files
         """
-        ids = self.sa.getIDsByGroup(group, assay="SPLICE_COUNT")
-        extCountFiles = self.sa.getImportCountFiles(annotation=None, group=group, 
+        ids = self.sampleAnnotation.getIDsByGroup(group, assay="SPLICE_COUNT")
+        extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation=None, group=group, 
                 file_type="SPLICE_COUNTS_DIR", asSet=False)
         if fileType is not None:
             extCountFiles = np.asarray(extCountFiles)[pd.isna(extCountFiles) == False].tolist()

From 271265885b3887c4bf0e34ecba0255d442ab94ff Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Thu, 12 Aug 2021 00:44:48 +0200
Subject: [PATCH 06/65] use only exact matching in subsetBy related to #244

---
 drop/config/SampleAnnotation.py               | 25 +++++++++----------
 .../submodules/MonoallelicExpression.py       | 10 ++++----
 drop/utils.py                                 | 18 ++++++-------
 3 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 63ccc280..57fa7221 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -153,13 +153,12 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
 
     ### Subsetting
 
-    def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
+    def subsetSampleAnnotation(self, column, values, subset=None):
         """
         subset by one or more values of different columns from sample file mapping
             :param column: valid column in sample annotation
             :param values: values of column to subset
             :param subset: subset sample annotation
-            :param exact_match: whether to match substrings in the sample annotation, false allows substring matching
         """
         sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS)
         if subset is None:
@@ -174,7 +173,7 @@ def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True):
         # check if column is valid
         if column not in sa_cols:
             raise KeyError(f"Column '{column}' not present in sample annotation.")
-        return utils.subsetBy(subset, column, values, exact_match=exact_match)
+        return utils.subsetBy(subset, column, values)
 
     def subsetFileMapping(self, file_type=None, sample_id=None):
         """
@@ -233,10 +232,9 @@ def getFilePaths(self, file_type, group=None):
         return self.getFilePath(sampleIDs, file_type, single_file=False)
 
     # build a dictionary from the drop group and column. like getImportCounts with skipping options and dict output
-    def getGenomes(self, value, group, file_type="RNA_ID",
-                            column="GENOME", group_key="DROP_GROUP",exact_match = True,skip = False):
+    def getGenomes(self, value, group, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", skip = False):
         """
-        :param value: values to match in the column. Must be an exact match, passed to subsetting sample annotation 
+        :param value: values to match in the column.
         :param group: a group of the group_key (DROP_GROUP) column. 
         :return: dict file_type to column
         """
@@ -245,24 +243,25 @@ def getGenomes(self, value, group, file_type="RNA_ID",
         if skip:
             subset = None
         else:
-            subset = self.subsetSampleAnnotation(column, value,exact_match=True)
+            subset = self.subsetSampleAnnotation(column, value)
 
         # additionally subset for the group_key and the group
-        subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match)
+        subset = self.subsetSampleAnnotation(group_key, group, subset)
 
         return {sample_id: value for sample_id in subset[file_type].tolist()}
 
     def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE",
-                            annotation_key: str = "ANNOTATION", group_key: str = "DROP_GROUP", 
+                            annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP", 
                             asSet: bool = True):
         """
-        :param annotation: annotation name as specified in config and ANNOTATION column. Can be None
-        :param group: a group of the DROP_GROUP column
+        :param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None
+        :param group: a group of the DROP_GROUP column.
         :return: set of unique external count file names
         """
+        
         #subset for the annotation_key in the annotation group and the group_key in the group
-        subset = self.subsetSampleAnnotation(annotation_key, annotation, exact_match=exact_match)
-        subset = self.subsetSampleAnnotation(group_key, group, subset, exact_match=exact_match)
+        subset = self.subsetSampleAnnotation(annotation_key, annotation)
+        subset = self.subsetSampleAnnotation(group_key, group, subset)
             
         ans = subset[file_type].tolist()
         if asSet:
diff --git a/drop/config/submodules/MonoallelicExpression.py b/drop/config/submodules/MonoallelicExpression.py
index 5d87fb96..2a9d35cc 100644
--- a/drop/config/submodules/MonoallelicExpression.py
+++ b/drop/config/submodules/MonoallelicExpression.py
@@ -34,7 +34,7 @@ def __init__(
         self.checkConfigSampleannotation()
 
     def checkConfigSampleannotation(self):
-        subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups, exact_match=False)
+        subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups)
 
         if len(self.genomeFiles.keys()) > 1:  # more than 1 value in config defined genome dictionary
             if "GENOME" not in subset.columns.values:  # GENOME column not defined
@@ -147,16 +147,16 @@ def setGenomeDict(self, genomeFiles):
         if len(genomeFiles) == 1:  # globally defined in the config
             globalGenome = list(genomeFiles.values())[0]
 
-            # subset SA by the drop group (not exact match) and skip the filtering by SA-GENOME column
+            # subset SA by the drop group and skip the filtering by SA-GENOME column
             genomeDict = self.sampleAnnotation.getGenomes(
                 globalGenome,
                 self.groups,
                 file_type="RNA_ID",
                 column="DROP_GROUP", group_key="DROP_GROUP",
-                exact_match=False, skip=True
+                skip=True
             )
         else:
-            # subset SA by the drop group (not exact match) and filter by SA-GENOME column. Must exactly match config key
+            # subset SA by the drop group and filter by SA-GENOME column. Must exactly match config key
             for gf in genomeFiles.keys():
                 genomeDict.update(
                     self.sampleAnnotation.getGenomes(
@@ -164,7 +164,7 @@ def setGenomeDict(self, genomeFiles):
                         self.groups,
                         file_type="RNA_ID",
                         column="GENOME", group_key="DROP_GROUP",
-                        exact_match=False, skip=False
+                        skip=False
                     )
                 )
 
diff --git a/drop/utils.py b/drop/utils.py
index b325f38f..5659804b 100644
--- a/drop/utils.py
+++ b/drop/utils.py
@@ -65,22 +65,20 @@ def getWBuildSnakefile(str_=True):
     return returnPath(wb_path / "wBuild.snakefile", str_=str_)
 
 
-def subsetBy(df, column, values, exact_match=True):
+def subsetBy(df, column, values):
     """
     Subset by one or more values of different columns from data frame
     :param df: data frame
     :param column: column to subset by
     :param values: values to subset by
-    :param exact_match: default True. when False match substrings. Important for subsetting drop groups
     :return: df subset by values and column
     """
     if values is None:
         return df
-    elif isinstance(values, str) and exact_match :
-        return df[df[column] == values]
-    elif not isinstance(values,str) and exact_match:
-        return df[df[column].isin(values)]
-    elif isinstance(values,str) and not exact_match:
-        return df[df[column].str.contains(values)]
-    else:
-        return df[df[column].str.contains("|".join(values))]
+    
+    inner_regex = values
+    if not isinstance(values, str) :
+        inner_regex = "(" + "|".join(values) + ")"
+    
+    return  df[df[column].str.contains("(^|,)" + inner_regex + "(,|$)")]
+    

From f85e13072be885a0631b0fbcb330c2512fd6212e Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Fri, 13 Aug 2021 00:10:01 +0200
Subject: [PATCH 07/65] fix merge of subsetGroups function related to: #246

---
 drop/config/SampleAnnotation.py              |  8 +++---
 drop/config/submodules/AberrantExpression.py |  2 +-
 drop/config/submodules/AberrantSplicing.py   |  7 ++++--
 drop/utils.py                                | 26 ++++++++++++++++++++
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 57fa7221..7338a728 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -289,13 +289,13 @@ def getGroupedIDs(self, assays):
         groupedIDs = defaultdict(list)
         for assay in assays:
             if "RNA" in assay:
-                groupedIDs.update(self.rnaIDs)
+                utils.deep_merge_dict(groupedIDs, self.rnaIDs, inplace=True)
             elif "DNA" in assay:
-                groupedIDs.update(self.dnaIDs)
+                groupedIDs = utils.deep_merge_dict(groupedIDs, self.dnaIDs)
             elif "GENE_COUNT" in assay:
-                groupedIDs.update(self.extGeneCountIDs)
+                groupedIDs = utils.deep_merge_dict(groupedIDs, self.extGeneCountIDs)
             elif "SPLICE_COUNT" in assay:
-                groupedIDs.update(self.extSpliceCountIDs)
+                groupedIDs = utils.deep_merge_dict(groupedIDs, self.extSpliceCountIDs)
             else:
                 raise ValueError(f"'{assay}' is not a valid assay name")
         return groupedIDs
diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py
index edd10d9c..c1458a9b 100644
--- a/drop/config/submodules/AberrantExpression.py
+++ b/drop/config/submodules/AberrantExpression.py
@@ -25,7 +25,7 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD
                 please fix to only have either external count or BAM processing\n")
 
         # check number of IDs per group
-        all_ids = {g: self.rnaIDs[g] + self.extRnaIDs[g] for g in self.groups}
+        all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "GENE_COUNTS"])
         self.checkSubset(all_ids)
 
     def setDefaultKeys(self, dict_):
diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index f81df573..40b64067 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -19,8 +19,11 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD
         # if self.run is false return without doing any config/sa checks for completeness
         if not self.run:
             return
-        self.rnaIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA")
-        self.checkSubset(self.rnaIDs)
+        
+        self.rnaIDs   = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA")
+        self.rnaExIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="SPLICE_COUNT")
+        all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "SPLICE_COUNT"])
+        self.checkSubset(all_ids)
 
     def setDefaultKeys(self, dict_):
         super().setDefaultKeys(dict_)
diff --git a/drop/utils.py b/drop/utils.py
index 5659804b..e49efd5d 100644
--- a/drop/utils.py
+++ b/drop/utils.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from snakemake.logging import logger
 import wbuild
+import copy
 
 
 def returnPath(path, str_=True):
@@ -82,3 +83,28 @@ def subsetBy(df, column, values):
     
     return  df[df[column].str.contains("(^|,)" + inner_regex + "(,|$)")]
     
+def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):
+    """
+    Merges two dictionaries and all of their children recursively
+    
+    :param dict1: dictionary to be merged into
+    :param dict2: dictionary to be merged
+    :param inplace: if False (the default), a new dictionary is returned; otherwise dict1 is merged in place.
+    """
+    if not inplace:
+        dict1 = copy.deepcopy(dict1)
+        dict2 = copy.deepcopy(dict2)
+    
+    for k, v in dict2.items():
+        if isinstance(dict1.get(k), dict) and isinstance(v, dict):
+            dict1[k] = dict_merge(dict1[k], v, inplace=inplace)
+        elif k not in dict1:
+            dict1[k] = v
+        elif isinstance(dict1.get(k), list) and isinstance(v, list):
+            dict1[k] = list(dict.fromkeys(dict1[k] + v))
+        elif isinstance(dict1.get(k), str) and isinstance(v, str):
+            dict1[k] = [dict1.get(k), v]
+        else:
+            raise Error(f"{k} has different types that can not be merged.")
+        
+    return dict1

From 91566f6c2a2df24f5cb6da1be073bd6cf09b23a6 Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Fri, 13 Aug 2021 00:13:39 +0200
Subject: [PATCH 08/65] fix snakemake file dependency after merging external
 counts.

---
 .../Counting/01_2_countRNA_splitReads_merge.R    |  4 ++--
 .../Counting/01_4_countRNA_nonSplitReads_merge.R |  8 ++++++--
 .../Counting/01_5_countRNA_collect.R             | 16 +++++++++-------
 .../Counting/03_filter_expression_FraseR.R       | 15 ++++++++++-----
 4 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
index 1ea2a86e..c172d6b8 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
@@ -11,8 +11,8 @@
 #'  input:
 #'   - sample_counts: '`sm lambda w: cfg.AS.getSplitCountFiles(w.dataset)`'
 #'  output:
-#'   - countsJ: '`sm cfg.getProcessedDataDir() +
-#'                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
+###   - countsJ: '`sm cfg.getProcessedDataDir() +
+###                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
 #'   - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
 #'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
 #'   - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + 
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
index 83c48939..5d19c2aa 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
@@ -13,8 +13,10 @@
 #'   - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + 
 #'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
 #'  output:
-#'   - countsSS: '`sm cfg.getProcessedDataDir() +
-#'                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
+###   - countsSS: '`sm cfg.getProcessedDataDir() +
+###                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
+#'   - done:     '`sm cfg.getProcessedDataDir() + 
+#'                "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/merge_theta.done"`'
 #'  type: script
 #'---
 
@@ -50,3 +52,5 @@ nonSplitCounts <- getNonSplitReadCountsForAllSamples(fds=fds,
                                                      longRead=params$longRead)
 
 message(date(), ":", dataset, " nonSplit counts done")
+
+file.create(snakemake@output$done)
\ No newline at end of file
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
index da3d38b6..e2cccf8d 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
@@ -8,10 +8,12 @@
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
 #'  input:
-#'   - countsJ:  '`sm cfg.getProcessedDataDir() + 
-#'                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
-#'   - countsSS: '`sm cfg.getProcessedDataDir() + 
-#'                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
+###   - countsJ:  '`sm cfg.getProcessedDataDir() + 
+###                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
+###   - countsSS: '`sm cfg.getProcessedDataDir() + 
+###                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
+#'   - countsSSdone: '`sm cfg.getProcessedDataDir() + 
+#'                "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/merge_theta.done"`'
 #'   - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
 #'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
 #'   - spliceSites: '`sm cfg.getProcessedDataDir() + 
@@ -27,7 +29,7 @@ source(snakemake@params$setup, echo=FALSE)
 
 dataset    <- snakemake@wildcards$dataset
 workingDir <- snakemake@params$workingDir
-
+saveDir    <- dirname(snakemake@input$countsSSdone)
 
 # Read FRASER object
 fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
@@ -35,7 +37,7 @@ splitCounts_gRanges <- readRDS(snakemake@input$gRangesSplitCounts)
 spliceSiteCoords <- readRDS(snakemake@input$spliceSites)
 
 # Get splitReads and nonSplitRead counts in order to store them in FRASER object
-splitCounts_h5 <- HDF5Array::HDF5Array(snakemake@input$countsJ, "rawCountsJ")
+splitCounts_h5 <- HDF5Array::HDF5Array(file.path(saveDir, "rawCountsJ.h5"), "rawCountsJ")
 splitCounts_se <- SummarizedExperiment(
   colData = colData(fds),
   rowRanges = splitCounts_gRanges,
@@ -43,7 +45,7 @@ splitCounts_se <- SummarizedExperiment(
 )
 
 
-nonSplitCounts_h5 <- HDF5Array::HDF5Array(snakemake@input$countsSS, "rawCountsSS")
+nonSplitCounts_h5 <- HDF5Array::HDF5Array(file.path(saveDir, "rawCountsSS.h5"), "rawCountsSS")
 nonSplitCounts_se <- SummarizedExperiment(
   colData = colData(fds),
   rowRanges = spliceSiteCoords,
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index b40caf89..9c6d6e50 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -51,17 +51,22 @@ if(length(exCountIDs) > 0){
         
         ctsNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta")
         ctsFiles <- paste0(dirname(resource), "/", ctsNames, "_counts.tsv.gz")
+        
         fds <- mergeExternalData(fds=fds, countFiles=ctsFiles,
                 sampleIDs=exSampleIDs, annotation=exAnno)
     }
 }
 
-# Apply filter
+# filter for expression and write it out to disc.
+# 
+# TODO:   This will brake a rerun of step 01_5_countRNA_collect.R as it writes 
+#         out the rawCountsJ and rawCountsSS file including the external samples. 
+# 
 fds <- filterExpressionAndVariability(fds, 
-                        minExpressionInOneSample = minExpressionInOneSample,
-                        minDeltaPsi = minDeltaPsi,
-                        filter=FALSE)
-devNull <- saveFraserDataSet(fds)
+        minExpressionInOneSample = minExpressionInOneSample,
+        minDeltaPsi = minDeltaPsi,
+        filter=FALSE)
+fds <- saveFraserDataSet(fds)
 
 # Keep junctions that pass filter
 name(fds) <- dataset

From 0cf58326c44674ec0e6c8f31de5ac722c1d30491 Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Fri, 13 Aug 2021 00:53:01 +0200
Subject: [PATCH 09/65] correct naming

---
 drop/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drop/utils.py b/drop/utils.py
index e49efd5d..307b3dae 100644
--- a/drop/utils.py
+++ b/drop/utils.py
@@ -97,7 +97,7 @@ def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):
     
     for k, v in dict2.items():
         if isinstance(dict1.get(k), dict) and isinstance(v, dict):
-            dict1[k] = dict_merge(dict1[k], v, inplace=inplace)
+            dict1[k] = deep_merge_dict(dict1[k], v, inplace=inplace)
         elif k not in dict1:
             dict1[k] = v
         elif isinstance(dict1.get(k), list) and isinstance(v, list):
@@ -105,6 +105,6 @@ def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):
         elif isinstance(dict1.get(k), str) and isinstance(v, str):
             dict1[k] = [dict1.get(k), v]
         else:
-            raise Error(f"{k} has different types that can not be merged.")
+            raise TypeError(f"{k} has different types that can not be merged.")
         
     return dict1

From b1be9d3e436bf8b3a3e8e2d73a6103cf20a3fe9a Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Fri, 13 Aug 2021 01:00:50 +0200
Subject: [PATCH 10/65] cleanup code

---
 drop/config/SampleAnnotation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 7338a728..af3ccc0f 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -291,11 +291,11 @@ def getGroupedIDs(self, assays):
             if "RNA" in assay:
                 utils.deep_merge_dict(groupedIDs, self.rnaIDs, inplace=True)
             elif "DNA" in assay:
-                groupedIDs = utils.deep_merge_dict(groupedIDs, self.dnaIDs)
+                utils.deep_merge_dict(groupedIDs, self.dnaIDs, inplace=True)
             elif "GENE_COUNT" in assay:
-                groupedIDs = utils.deep_merge_dict(groupedIDs, self.extGeneCountIDs)
+                utils.deep_merge_dict(groupedIDs, self.extGeneCountIDs, inplace=True)
             elif "SPLICE_COUNT" in assay:
-                groupedIDs = utils.deep_merge_dict(groupedIDs, self.extSpliceCountIDs)
+                utils.deep_merge_dict(groupedIDs, self.extSpliceCountIDs, inplace=True)
             else:
                 raise ValueError(f"'{assay}' is not a valid assay name")
         return groupedIDs

From 6e0467c1ffbfab4d5bbb77a5e557796a6aebf0e7 Mon Sep 17 00:00:00 2001
From: Christian Mertes <mertes@in.tum.de>
Date: Sat, 14 Aug 2021 00:18:23 +0200
Subject: [PATCH 11/65] update FRASER dependency for merge count functionality

---
 drop/requirementsR.txt | 2 +-
 environment.yml        | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt
index b6fb8718..58cd9202 100644
--- a/drop/requirementsR.txt
+++ b/drop/requirementsR.txt
@@ -1,6 +1,6 @@
 package	version
 gagneurlab/OUTRIDER	1.6.1
-c-mertes/FRASER	1.2.2
+c-mertes/FRASER	1.4.1
 mumichae/tMAE	1.0.0
 VariantAnnotation	
 rmarkdown	
diff --git a/environment.yml b/environment.yml
index ec905be2..53be3e1c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,3 +8,6 @@ dependencies:
   - drop
   - flake8
   - bioconductor-bsgenome.hsapiens.ucsc.hg19
+
+  # required for downloading R packages through github
+  - unzip

From fdfd3cd76a5a1f47d1d35a29d6b7c5376adfd087 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Tue, 29 Mar 2022 18:07:28 +0200
Subject: [PATCH 12/65] change input/output paths.

---
 drop/config/submodules/AberrantSplicing.py    |  4 ++--
 .../Counting/01_0_countRNA_init.R             |  6 ++---
 .../01_1_countRNA_splitReads_samplewise.R     |  6 ++---
 .../Counting/01_2_countRNA_splitReads_merge.R | 12 +++++-----
 .../01_3_countRNA_nonSplitReads_samplewise.R  |  6 ++---
 .../01_4_countRNA_nonSplitReads_merge.R       | 10 ++++-----
 .../Counting/01_5_countRNA_collect.R          | 20 +++++++----------
 .../02_psi_value_calculation_FraseR.R         |  6 ++---
 .../Counting/03_filter_expression_FraseR.R    | 22 ++++++++++++-------
 .../Counting/Summary.R                        |  4 ++--
 .../Counting/exportCounts.R                   |  2 +-
 .../FRASER/04_fit_hyperparameters_FraseR.R    |  6 ++---
 .../FRASER/05_fit_autoencoder_FraseR.R        |  6 ++---
 .../FRASER/06_calculation_stats_AE_FraseR.R   |  6 ++---
 .../FRASER/07_extract_results_FraseR.R        |  8 +++----
 .../FRASER/Summary.R                          |  2 +-
 16 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index bae20e33..28ff9296 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -50,7 +50,7 @@ def getSplitCountFiles(self, dataset):
         :return: list of files
         """
         ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA")
-        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \
+        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "fromBam" / "cache" / f"raw-{dataset}" / \
                      "sample_tmp" / "splitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
@@ -62,7 +62,7 @@ def getNonSplitCountFiles(self, dataset):
         :return: list of files
         """
         ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA")
-        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \
+        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "fromBam" / "cache" / f"raw-{dataset}" / \
                      "sample_tmp" / "nonSplitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R
index c54948ee..c081c8d4 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R
@@ -6,15 +6,15 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_0_init.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
 #'  input:
 #'    - colData: '`sm cfg.getProcessedDataDir() + 
 #'                    "/aberrant_splicing/annotations/{dataset}.tsv"`'
 #'  output:
 #'   - fdsobj:  '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-object.RDS"`'
+#'                   "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/fds-object.RDS"`'
 #'   - done_fds: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done" `'
+#'                "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/fds.done" `'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
index 5da348af..ed63f8e2 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
@@ -6,13 +6,13 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "splitReads" / "{sample_id}.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
 #'  input:
 #'   - done_fds: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done"`'
+#'                "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/fds.done"`'
 #'  output:
 #'   - done_sample_splitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/cache/raw-{dataset}"
+#'                "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}"
 #'                +"/sample_tmp/splitCounts/sample_{sample_id}.done"`'
 #'  threads: 3
 #'  type: script
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
index c172d6b8..3f56d792 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
@@ -6,19 +6,19 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_2_splitReadsMerge.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
 #'  threads: 20
 #'  input:
 #'   - sample_counts: '`sm lambda w: cfg.AS.getSplitCountFiles(w.dataset)`'
 #'  output:
-###   - countsJ: '`sm cfg.getProcessedDataDir() +
-###                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
+#'   - countsJ: '`sm cfg.getProcessedDataDir() +
+#'                          "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
 #'   - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
 #'   - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
 #'   - spliceSites: '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
+#'                   "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R
index c5d9e6a7..8f290094 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R
@@ -6,13 +6,13 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "nonsplitReads" / "{sample_id}.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
 #'  input:
 #'   - spliceSites: '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
+#'                   "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
 #'  output:
 #'   - done_sample_nonSplitCounts : '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' 
+#'                   "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' 
 #'  threads: 3
 #'  type: script
 #'---
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
index 5d19c2aa..bffecc51 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
@@ -6,17 +6,17 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_4_nonSplitReadsMerge.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
 #'  threads: 20
 #'  input:
 #'   - sample_counts:  '`sm lambda w: cfg.AS.getNonSplitCountFiles(w.dataset)`'
 #'   - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
 #'  output:
 ###   - countsSS: '`sm cfg.getProcessedDataDir() +
-###                   "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
+###                   "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
 #'   - done:     '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/merge_theta.done"`'
+#'                "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/merge_theta.done"`'
 #'  type: script
 #'---
 
@@ -53,4 +53,4 @@ nonSplitCounts <- getNonSplitReadCountsForAllSamples(fds=fds,
 
 message(date(), ":", dataset, " nonSplit counts done")
 
-file.create(snakemake@output$done)
\ No newline at end of file
+file.create(snakemake@output$done)
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
index e2cccf8d..35c8fdf8 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
@@ -6,21 +6,17 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_5_collect.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
 #'  input:
-###   - countsJ:  '`sm cfg.getProcessedDataDir() + 
-###                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
-###   - countsSS: '`sm cfg.getProcessedDataDir() + 
-###                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
-#'   - countsSSdone: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/merge_theta.done"`'
-#'   - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
-#'   - spliceSites: '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
+#'    - countsSSdone: '`sm cfg.getProcessedDataDir() + 
+#'                          "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/merge_theta.done"`'
+#'    - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
+#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
+#'    - spliceSites: '`sm cfg.getProcessedDataDir() + 
+#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
 #'  output:
 #'   - counting_done: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `'
+#'                          "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/counting.done" `'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R
index 24e5cb9b..aafa766c 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R
@@ -6,14 +6,14 @@
 #'   - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "02_PSIcalc.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam/"`'
 #'  threads: 30
 #'  input:
 #'   - counting_done: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `'
+#'                "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/counting.done" `'
 #'  output:
 #'  - theta:     '`sm cfg.getProcessedDataDir() +
-#'                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
+#'                    "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/theta.h5"`'
 #'  type: script
 #'--- 
 
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index e86c2ceb..f9f913d9 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -6,17 +6,18 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "03_filter.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDirIn: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam/"`'
+#'   - workingDirOut: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
 #'   - exCountIDs: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="SPLICE_COUNT")`'
 #'  input:
 #'   - theta:  '`sm cfg.getProcessedDataDir()+
-#'                  "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
+#'                  "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/theta.h5"`'
 #'   - exCounts: '`sm lambda w: cfg.AS.getExternalCounts(w.dataset, "k_j_counts")`'
 #'  output:
 #'   - fds: '`sm cfg.getProcessedDataDir() +
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`'
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/fds-object.RDS"`'
 #'   - done: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `'
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
 #'  threads: 3
 #'  type: script
 #'---
@@ -28,7 +29,8 @@ opts_chunk$set(fig.width=12, fig.height=8)
 
 # input
 dataset    <- snakemake@wildcards$dataset
-workingDir <- snakemake@params$workingDir
+workingDirIn <- snakemake@params$workingDirIn
+workingDirOut <- snakemake@params$workingDirOut
 params     <- snakemake@config$aberrantSplicing
 exCountIDs <- snakemake@params$exCountIDs
 exCountFiles <- snakemake@input$exCounts
@@ -36,7 +38,9 @@ sample_anno_file <- snakemake@config$sampleAnnotation
 minExpressionInOneSample <- params$minExpressionInOneSample
 minDeltaPsi <- params$minDeltaPsi
 
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDirIn, name=paste0("raw-", dataset))
+workingDir(fds) <- workingDirOut
+fds <- saveFraserDataSet(fds,dir = workingDirOut, name=paste0("raw-", dataset))
 
 register(MulticoreParam(snakemake@threads))
 # Limit number of threads for DelayedArray operations
@@ -66,7 +70,9 @@ fds <- filterExpressionAndVariability(fds,
         minExpressionInOneSample = minExpressionInOneSample,
         minDeltaPsi = minDeltaPsi,
         filter=FALSE)
-fds <- saveFraserDataSet(fds)
+
+message("save new fraser object", workingDirOut)
+fds <- saveFraserDataSet(fds,dir=workingDirOut)
 
 # Keep junctions that pass filter
 name(fds) <- dataset
@@ -78,5 +84,5 @@ if (params$filter == TRUE) {
 
 seqlevels(fds) <- seqlevelsInUse(fds)
 colData(fds)$sampleID <- as.character(colData(fds)$sampleID)
-fds <- saveFraserDataSet(fds)
+fds <- saveFraserDataSet(fds,dir = workingDirOut)
 file.create(snakemake@output$done)
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index e2c61d86..538a9ee8 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -6,10 +6,10 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "CountSummary.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
 #'  input:
 #'   - filter: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `'
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
 #'  output:
 #'   - wBhtml: '`sm config["htmlOutputPath"] + 
 #'                  "/AberrantSplicing/{dataset}_countSummary.html"`'
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
index 052a9c2e..4c5f0d98 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
@@ -9,7 +9,7 @@
 #'  input:
 #'   - annotation: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`'
 #'   - fds_theta: '`sm cfg.getProcessedDataDir() + 
-#'                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
+#'                    "/aberrant_splicing/datasets/merged/savedObjects/raw-{dataset}/theta.h5"`'
 #'  output:
 #'    - k_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/k_{metric}_counts.tsv.gz", metric=["j", "theta"])`'
 #'    - n_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/n_{metric}_counts.tsv.gz", metric=["psi5", "psi3", "theta"])`'
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R
index 1a99b259..573902c3 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R
@@ -6,14 +6,14 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "04_hyper.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
 #'  threads: 12
 #'  input:
 #'   - filter: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `'
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
 #'  output:
 #'   - hyper: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `'
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/hyper.done" `'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R
index f5e760c8..bd6ad7d0 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R
@@ -6,14 +6,14 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "05_fit.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
 #'  threads: 20
 #'  input:
 #'   - hyper: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `'
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/hyper.done" `'
 #'  output:
 #'   - fdsout: '`sm cfg.getProcessedDataDir() + 
-#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/predictedMeans_theta.h5"`'
+#'                  "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/predictedMeans_theta.h5"`'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R
index 84b222cb..3046ac4c 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R
@@ -6,15 +6,15 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "06_stats.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
 #'  threads: 20
 #'  input:
 #'   - fdsin:  '`sm cfg.getProcessedDataDir() + 
-#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/" +
+#'                  "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/" +
 #'                  "predictedMeans_theta.h5"`'
 #'  output:
 #'   - fdsout: '`sm cfg.getProcessedDataDir() + 
-#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/" +
+#'                  "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/" +
 #'                  "padjBetaBinomial_theta.h5"`'
 #'  type: script
 #'---
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
index a4323d79..4b9dace7 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
@@ -5,8 +5,8 @@
 #'  log:
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}--{annotation}" / "07_results.Rds")`'
 #'  params:
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
-#'   - outputDir: '`sm cfg.getProcessedResultsDir() + "/aberrant_splicing/datasets/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - outputDir: '`sm cfg.getProcessedResultsDir() + "/aberrant_splicing/datasets/merged/"`'
 #'   - padjCutoff: '`sm cfg.AS.get("padjCutoff")`'
 #'   - zScoreCutoff: '`sm cfg.AS.get("zScoreCutoff")`'
 #'   - deltaPsiCutoff: '`sm cfg.AS.get("deltaPsiCutoff")`'
@@ -16,7 +16,7 @@
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - add_HPO_cols: '`sm str(projectDir / ".drop" / "helpers" / "add_HPO_cols.R")`'
 #'   - fdsin: '`sm cfg.getProcessedDataDir() +
-#'                 "/aberrant_splicing/datasets/savedObjects/{dataset}/" +
+#'                 "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/" +
 #'                 "padjBetaBinomial_theta.h5"`'
 #'   - txdb: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`'
 #'   - gene_name_mapping: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/gene_name_mapping_{annotation}.tsv"`'
@@ -26,7 +26,7 @@
 #'   - resultTableGene: '`sm cfg.getProcessedResultsDir() +
 #'                          "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results.tsv"`'
 #'   - fds: '`sm cfg.getProcessedResultsDir() +
-#'                 "/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
+#'                 "/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
index d8b549c9..31336fe9 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
@@ -8,7 +8,7 @@
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'  input:
 #'   - fdsin: '`sm cfg.getProcessedResultsDir() + 
-#'                 "/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
+#'                 "/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
 #'   - results: '`sm cfg.getProcessedResultsDir() + 
 #'                   "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results.tsv"`'
 #'  output:

From 39744c91c70316f4f45e371a88be790c7e4c90b0 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Wed, 30 Mar 2022 18:01:28 +0200
Subject: [PATCH 13/65] add symlinks

---
 .../Counting/03_filter_expression_FraseR.R       | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index f9f913d9..3004a0e5 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -39,8 +39,6 @@ minExpressionInOneSample <- params$minExpressionInOneSample
 minDeltaPsi <- params$minDeltaPsi
 
 fds <- loadFraserDataSet(dir=workingDirIn, name=paste0("raw-", dataset))
-workingDir(fds) <- workingDirOut
-fds <- saveFraserDataSet(fds,dir = workingDirOut, name=paste0("raw-", dataset))
 
 register(MulticoreParam(snakemake@threads))
 # Limit number of threads for DelayedArray operations
@@ -48,6 +46,10 @@ setAutoBPPARAM(MulticoreParam(snakemake@threads))
 
 # Add external data if provided by dataset
 if(length(exCountIDs) > 0){
+    message("create new merged fraser object")
+    workingDir(fds) <- workingDirOut
+    fds <- saveFraserDataSet(fds,dir = workingDirOut, name=paste0("raw-", dataset))
+
     for(resource in unique(exCountFiles)){
         exSampleIDs <- exCountIDs[exCountFiles == resource]
         exAnno <- fread(sample_anno_file, key="RNA_ID")[J(exSampleIDs)]
@@ -59,6 +61,13 @@ if(length(exCountIDs) > 0){
         fds <- mergeExternalData(fds=fds, countFiles=ctsFiles,
                 sampleIDs=exSampleIDs, annotation=exAnno)
     }
+} else {
+    message("symLink fraser dir")
+    file.symlink(paste0(workingDirIn, "savedObjects/","raw-", dataset),
+                 paste0(workingDirOut, "savedObjects/","raw-", dataset))
+    
+    workingDir(fds) <- workingDirOut
+    name(fds) <- paste0("raw-", dataset)
 }
 
 # filter for expression and write it out to disc.
@@ -71,8 +80,7 @@ fds <- filterExpressionAndVariability(fds,
         minDeltaPsi = minDeltaPsi,
         filter=FALSE)
 
-message("save new fraser object", workingDirOut)
-fds <- saveFraserDataSet(fds,dir=workingDirOut)
+devNull <- saveFraserDataSet(fds,dir = workingDirOut)
 
 # Keep junctions that pass filter
 name(fds) <- dataset

From d7e0894e90e0611de6301f24a1e26ce6b750fa6e Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 30 Mar 2022 18:02:00 +0200
Subject: [PATCH 14/65] add explicit biallelic filter

---
 README.md                                   | 2 +-
 drop/modules/mae-pipeline/MAE/filterSNVs.sh | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 732d7639..cc4e0c8d 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ The manuscript is available in [Nature Protocols](https://www.nature.com/article
 DROP is available on [bioconda](https://anaconda.org/bioconda/drop).
 We recommend using a dedicated conda environment. (installation time: ~ 10min)
 ```
-mamba install -c conda-forge -c bioconda drop
+mamba create -n drop_env -c conda-forge -c bioconda drop
 ```
 
 Test installation with demo project
diff --git a/drop/modules/mae-pipeline/MAE/filterSNVs.sh b/drop/modules/mae-pipeline/MAE/filterSNVs.sh
index b6591c34..9b0da58c 100755
--- a/drop/modules/mae-pipeline/MAE/filterSNVs.sh
+++ b/drop/modules/mae-pipeline/MAE/filterSNVs.sh
@@ -21,6 +21,7 @@ samtools=$8
 
 tmp=$(mktemp)
 tmp2=$(mktemp)
+tmp3=$(mktemp)
 
 canonical_chr="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,\
 chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,\
@@ -57,12 +58,16 @@ $bcftools view  $vcf_file -r $canonical_chr | \
     $bcftools view ${sample_flag} -m2 -M2 -v snps > $tmp
 
 # use the select_pattern defined above to pull out the heterozygous variants used for MAE
-gatk SelectVariants -V $tmp ${sample_name} ${select_pattern} -O $tmp2
+gatk SelectVariants -V $tmp ${sample_name} ${select_pattern} -O ${tmp2}
+gatk SelectVariants --restrict-alleles-to BIALLELIC -V $tmp2 -O ${tmp3}
 
 # zip and save as tmp file
-bgzip -c $tmp2 > $tmp
+bgzip -c $tmp3 > $tmp
 $bcftools index -t $tmp
 
+rm -f $tmp2
+rm -f $tmp3
+
 # compare and correct chromosome format mismatch
 bam_chr=$($samtools idxstats ${bam_file} | cut -f1 | grep "^chr" | wc -l)
 vcf_chr=$($bcftools index --stats $tmp   | cut -f1 | grep "^chr" | wc -l)

From 2f81989c04320277317b23164ed542ae7a460add Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 31 Mar 2022 10:45:48 +0200
Subject: [PATCH 15/65] update regex matching

---
 drop/config/SampleAnnotation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index ce560719..2704cd89 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -142,7 +142,7 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
         groups = set(groups)
 
         # collect IDs per group
-        grouped = {gr: df[df[group_key].str.contains(f'(^|{sep}){gr}({sep}|$)')][assay_id].tolist()
+        grouped = {gr: df[df[group_key].str.contains(f'(?:^|{sep}){gr}(?:{sep}|$)')][assay_id].tolist()
                    for gr in groups}
         # remove groups labeled as None
         grouped = {gr: list(set(ids)) for gr, ids in grouped.items() if gr is not None}

From d1f60cd1db5c4976b8584dfb7bb3b544a0cc747f Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Thu, 31 Mar 2022 14:24:22 +0200
Subject: [PATCH 16/65] snakemake 7 workarounds

---
 drop/template/readme.md         | 5 ++++-
 tests/pipeline/test_pipeline.py | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drop/template/readme.md b/drop/template/readme.md
index 9d49194d..23fcd4f9 100644
--- a/drop/template/readme.md
+++ b/drop/template/readme.md
@@ -1,3 +1,6 @@
+---                                                                                                                     
+title: "README"                                                                                                         
+--- 
 # DROP Analysis
 
 Analysis of the demo dataset using the Detection of RNA Outliers Pipeline. For 
@@ -18,4 +21,4 @@ the following links to see the pipelines results.
 
 **Manuscript**: available in [Nature Protocols](https://www.nature.com/articles/s41596-020-00462-5)
 
-**Data**: subset of 100 samples from the GEUVADIS [dataset](https://www.ebi.ac.uk/Tools/geuvadis-das/)
\ No newline at end of file
+**Data**: subset of 100 samples from the GEUVADIS [dataset](https://www.ebi.ac.uk/Tools/geuvadis-das/)
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
index 998278ae..ad98e0c9 100644
--- a/tests/pipeline/test_pipeline.py
+++ b/tests/pipeline/test_pipeline.py
@@ -12,7 +12,8 @@ def test_pipeline_no_run(demo_dir):
     # change all "run: true" config values to "run: false" and save result into new config file
     # runthe pipeline with each module turned off
     run("sed 's/run: true/run: false/g' config.yaml > config_norun.yaml  ",demo_dir)
-    pipeline_run = run(["snakemake",  f"-j{CORES}", "--configfile", "config_norun.yaml"], demo_dir)
+    pipeline_run = run(f"snakemake --until Index --cores {CORES} --configfile config_norun.yaml", demo_dir)
+    pipeline_run = run(f"snakemake --cores {CORES} --configfile config_norun.yaml", demo_dir)
     assert "Finished job 0." in pipeline_run.stderr
     return pipeline_run
 

From a5f8de0026be123c3f40406b5208c23b75fdef9d Mon Sep 17 00:00:00 2001
From: Karthik Nair <kvn95ss@gmail.com>
Date: Tue, 22 Mar 2022 11:03:26 +0530
Subject: [PATCH 17/65] Update to MAE filter scripts

---
 drop/modules/mae-pipeline/MAE/ASEReadCounter.sh | 13 +++++++++++++
 drop/modules/mae-pipeline/MAE/filterSNVs.sh     |  5 +++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
index a4290269..4eb3f675 100755
--- a/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
+++ b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
@@ -45,6 +45,19 @@ fi
 chr_subset=$(comm -12 <(cut -f1 -d" " ${canonical} | sort -u) <(echo "${vcf_chr}"))
 chr_subset=$(comm -12 <(echo "${bam_chr}") <(echo "${chr_subset}") | uniq)
 
+# ASEReadCounter fails without RG, this snippet checks for RG in bam file
+# and if RG tag isn't present, lets the user know how to fix it
+if samtools view -H ${bam_file} | grep -q "@RG";then
+  printf "BAM contains RG, continuing with ASEReadCounter...\n"
+else
+  printf "%s\n" "" "ERROR: BAM file doesn't contain Read Group Tag" \
+  " RG doesn't exist, it can be added using -" \
+  "   gatk AddOrReplaceGroups -R /path/to/reference -I /your/input.bam -O /your/output.bam --QUIET true" \
+  " Try rerunning this module using the BAM with RG tags"
+  exit 1
+fi
+
+
 for chr in $chr_subset; do
   $gatk ASEReadCounter \
     -R ${fasta} \
diff --git a/drop/modules/mae-pipeline/MAE/filterSNVs.sh b/drop/modules/mae-pipeline/MAE/filterSNVs.sh
index 9b0da58c..d684fde4 100755
--- a/drop/modules/mae-pipeline/MAE/filterSNVs.sh
+++ b/drop/modules/mae-pipeline/MAE/filterSNVs.sh
@@ -50,11 +50,12 @@ fi
 # view the vcf file and remove the info header information and the set the INFO column to '.'
 # split any multi-allelic lines
 # pull out the sample and only the snps that have at least 2 reads supporting it
+# Using bcftools -Ou to speed up processing
 $bcftools view  $vcf_file -r $canonical_chr | \
     grep -vP '^##INFO=' | \
     awk -F'\t' 'BEGIN {OFS = FS} { if($1 ~ /^[^#]/){ $8 = "." }; print $0 }' | \
-    $bcftools norm -m-both | \
-    $bcftools norm -d both | \
+    $bcftools norm -Ou -m-both | \
+    $bcftools norm -Ou -d both | \
     $bcftools view ${sample_flag} -m2 -M2 -v snps > $tmp
 
 # use the select_pattern defined above to pull out the heterozygous variants used for MAE

From 4dbcf0e5fb992ad456e7e37e8f9a869ef4207200 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 31 Mar 2022 18:11:52 +0200
Subject: [PATCH 18/65] update backend for externalCounts

---
 drop/download_data.sh                             | 4 ++--
 drop/template/Scripts/AberrantSplicing/Overview.R | 2 +-
 drop/utils.py                                     | 2 +-
 tests/config/test_AS.py                           | 4 ++--
 tests/config/test_SampleAnnotation.py             | 2 +-
 tests/pipeline/test_AE.py                         | 7 +------
 tests/pipeline/test_AS.py                         | 2 +-
 7 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/drop/download_data.sh b/drop/download_data.sh
index c2600003..3e9ff804 100644
--- a/drop/download_data.sh
+++ b/drop/download_data.sh
@@ -4,10 +4,10 @@ set -e
 # get data
 resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource_splice_merge.tar.gz"
 tmpdir="$(dirname "$(mktemp)")"
-wget -c -O $tmpdir/resource.tar.gz $resource_url
+wget -c -O $tmpdir/resource_exsplice.tar.gz $resource_url
 mkdir -p Data
 if [ -z "$(ls Data)" ]; then
-	tar -zxvf "$tmpdir/resource.tar.gz" -C .
+	tar -zxvf "$tmpdir/resource_exsplice.tar.gz" -C .
 	rm -rf Data
 	mv resource Data
 else
diff --git a/drop/template/Scripts/AberrantSplicing/Overview.R b/drop/template/Scripts/AberrantSplicing/Overview.R
index 6bcff37b..db90bd6c 100644
--- a/drop/template/Scripts/AberrantSplicing/Overview.R
+++ b/drop/template/Scripts/AberrantSplicing/Overview.R
@@ -11,7 +11,7 @@
 #'  input:
 #'    - functions: '`sm cfg.workDir / "Scripts/html_functions.R"`'
 #'    - fds_files: '`sm expand(cfg.getProcessedResultsDir() +
-#'                "/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/" +
+#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/" +
 #'                "fds-object.RDS", dataset=cfg.AS.groups, annotation=cfg.genome.getGeneVersions())`'
 #'    - result_tables: '`sm expand(cfg.getProcessedResultsDir() +
 #'                    "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results_per_junction.tsv",
diff --git a/drop/utils.py b/drop/utils.py
index 307b3dae..7cde9321 100644
--- a/drop/utils.py
+++ b/drop/utils.py
@@ -81,7 +81,7 @@ def subsetBy(df, column, values):
     if not isinstance(values, str) :
         inner_regex = "(" + "|".join(values) + ")"
     
-    return  df[df[column].str.contains("(^|,)" + inner_regex + "(,|$)")]
+    return  df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)")]
     
 def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):
     """
diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py
index c3e3dc38..e9b84cd2 100644
--- a/tests/config/test_AS.py
+++ b/tests/config/test_AS.py
@@ -19,7 +19,7 @@ def test_config(self, dropConfig,demo_dir):
         assert dict_.items() <= dropConfig.AS.dict_.items()
 
     def test_getSplitCountFiles(self, demo_dir, dropConfig):
-        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-fraser/sample_tmp/" \
+        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/fromBam/cache/raw-fraser/sample_tmp/" \
                      "splitCounts"
         ids = [
             'HG00096.1.M_111124_6_trunc', 'HG00103.4.M_120208_3_trunc', 'HG00111.2.M_111215_4_trunc',
@@ -35,7 +35,7 @@ def test_getSplitCountFiles(self, demo_dir, dropConfig):
         assert counts_files_true == counts_files_test
 
     def test_getNonSplitCountFiles(self, demo_dir, dropConfig):
-        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-fraser/sample_tmp/" \
+        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/fromBam/cache/raw-fraser/sample_tmp/" \
                      "nonSplitCounts"
         ids = [
             'HG00096.1.M_111124_6_trunc', 'HG00103.4.M_120208_3_trunc', 'HG00111.2.M_111215_4_trunc',
diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py
index f1e1d21e..461ef7ae 100644
--- a/tests/config/test_SampleAnnotation.py
+++ b/tests/config/test_SampleAnnotation.py
@@ -15,7 +15,7 @@ def test_mapping(self, sampleAnnotation):
         # ID mappings/groups
         assert sampleAnnotation.idMapping.shape == (24, 2)
         assert sampleAnnotation.sampleFileMapping.shape == (35, 4)
-        true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10, 'fraser_ex': 10}
+        true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10, 'fraser_ex': 7}
         assert true_mapping == {k: len(v) for k, v in sampleAnnotation.rnaIDs.items()}
         assert true_mapping == {k: len(v) for k, v in sampleAnnotation.dnaIDs.items()}
 
diff --git a/tests/pipeline/test_AE.py b/tests/pipeline/test_AE.py
index 1251c5ec..a0ab5a60 100644
--- a/tests/pipeline/test_AE.py
+++ b/tests/pipeline/test_AE.py
@@ -69,17 +69,12 @@ def test_import_results(self, demo_dir):
     def no_import(self, demo_dir):
         LOGGER.info("dryrun without import counts...")
 
-        # remove last 2 lines of sample annotation
-        run("head -n -2 Data/sample_annotation.tsv > Data/sample_annotation_noimp.tsv", demo_dir)
-
         # adapt config
-        run("sed 's/sample_annotation.tsv/sample_annotation_noimp.tsv/' config.yaml | "
-            "sed '/import_exp/d' > config_noimp.yaml", demo_dir)
+        run("sed '/import_exp/d' tests/pipeline/test_pipeline.py > config_noimp.yaml", demo_dir)
 
         yield demo_dir
 
         # reset changed files back to original
-        run("rm Data/sample_annotation_noimp.tsv", demo_dir)
         run("rm config_noimp.yaml", demo_dir)
 
     def test_no_import(self, no_import):
diff --git a/tests/pipeline/test_AS.py b/tests/pipeline/test_AS.py
index becd0e30..c62166b4 100644
--- a/tests/pipeline/test_AS.py
+++ b/tests/pipeline/test_AS.py
@@ -38,7 +38,7 @@ def pipeline_run(self, demo_dir):
     def test_counts(self, demo_dir):
         annotation = "v29"
         dataset = "fraser"
-        cnt_file = f"Output/processed_results/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/fds-object.RDS"
+        cnt_file = f"Output/processed_results/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/fds-object.RDS"
         r_cmd = """
             library(FRASER)
             fds <- loadFraserDataSet(file="{}")

From 5c40c889621c0482c2e6f3a1b7fb71c1daf8c911 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 31 Mar 2022 18:13:19 +0200
Subject: [PATCH 19/65] remove importExport for test

---
 tests/pipeline/test_AE.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipeline/test_AE.py b/tests/pipeline/test_AE.py
index a0ab5a60..a18b343b 100644
--- a/tests/pipeline/test_AE.py
+++ b/tests/pipeline/test_AE.py
@@ -70,7 +70,7 @@ def no_import(self, demo_dir):
         LOGGER.info("dryrun without import counts...")
 
         # adapt config
-        run("sed '/import_exp/d' tests/pipeline/test_pipeline.py > config_noimp.yaml", demo_dir)
+        run("sed '/import_exp/d' config.yaml > config_noimp.yaml", demo_dir)
 
         yield demo_dir
 

From fa12be89c27b5f45c965af98eddeee0259b53bbc Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Fri, 1 Apr 2022 11:18:20 +0200
Subject: [PATCH 20/65] comments and cleanup

---
 drop/modules/mae-pipeline/MAE/ASEReadCounter.sh |  1 +
 drop/modules/mae-pipeline/MAE/filterSNVs.sh     | 14 +++++++-------
 tests/pipeline/test_pipeline.py                 |  5 +++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
index 4eb3f675..e3c52df6 100755
--- a/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
+++ b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
@@ -53,6 +53,7 @@ else
   printf "%s\n" "" "ERROR: BAM file doesn't contain Read Group Tag" \
   " RG doesn't exist, it can be added using -" \
   "   gatk AddOrReplaceGroups -R /path/to/reference -I /your/input.bam -O /your/output.bam --QUIET true" \
+  " https://gatk.broadinstitute.org/hc/en-us/articles/360037226472-AddOrReplaceReadGroups-Picard-" \
   " Try rerunning this module using the BAM with RG tags"
   exit 1
 fi
diff --git a/drop/modules/mae-pipeline/MAE/filterSNVs.sh b/drop/modules/mae-pipeline/MAE/filterSNVs.sh
index d684fde4..6b85bd18 100755
--- a/drop/modules/mae-pipeline/MAE/filterSNVs.sh
+++ b/drop/modules/mae-pipeline/MAE/filterSNVs.sh
@@ -21,7 +21,6 @@ samtools=$8
 
 tmp=$(mktemp)
 tmp2=$(mktemp)
-tmp3=$(mktemp)
 
 canonical_chr="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,\
 chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,\
@@ -56,18 +55,19 @@ $bcftools view  $vcf_file -r $canonical_chr | \
     awk -F'\t' 'BEGIN {OFS = FS} { if($1 ~ /^[^#]/){ $8 = "." }; print $0 }' | \
     $bcftools norm -Ou -m-both | \
     $bcftools norm -Ou -d both | \
-    $bcftools view ${sample_flag} -m2 -M2 -v snps > $tmp
+    $bcftools view ${sample_flag} -m2 -M2 -v snps > ${tmp2}
 
 # use the select_pattern defined above to pull out the heterozygous variants used for MAE
-gatk SelectVariants -V $tmp ${sample_name} ${select_pattern} -O ${tmp2}
-gatk SelectVariants --restrict-alleles-to BIALLELIC -V $tmp2 -O ${tmp3}
+gatk SelectVariants -V ${tmp2} ${sample_name} ${select_pattern} -O ${tmp}
+
+# explicitly keep only BIALLELIC variants (i.e. drop multiallelic sites). This is needed as ASEReadCounter does not support multiallelic variants.
+gatk SelectVariants --restrict-alleles-to BIALLELIC -V ${tmp} -O ${tmp2}
 
 # zip and save as tmp file
-bgzip -c $tmp3 > $tmp
-$bcftools index -t $tmp
+bgzip -c ${tmp2} > ${tmp}
+$bcftools index -t ${tmp}
 
 rm -f $tmp2
-rm -f $tmp3
 
 # compare and correct chromosome format mismatch
 bam_chr=$($samtools idxstats ${bam_file} | cut -f1 | grep "^chr" | wc -l)
diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py
index ad98e0c9..a22fa9c5 100644
--- a/tests/pipeline/test_pipeline.py
+++ b/tests/pipeline/test_pipeline.py
@@ -12,6 +12,11 @@ def test_pipeline_no_run(demo_dir):
     # change all "run: true" config values to "run: false" and save result into new config file
     # runthe pipeline with each module turned off
     run("sed 's/run: true/run: false/g' config.yaml > config_norun.yaml  ",demo_dir)
+
+    # snakemake with wBuild introduces a directory locked error which does not affect processing, but does
+    # affect pipeline error log.
+    # instead run this step in 2 parts. (1) run until wBuild indexing (builds dependency graphs)
+    # (2) run the indexing and finishing touches of wBuild to avoid snakemake DAG locked error
     pipeline_run = run(f"snakemake --until Index --cores {CORES} --configfile config_norun.yaml", demo_dir)
     pipeline_run = run(f"snakemake --cores {CORES} --configfile config_norun.yaml", demo_dir)
     assert "Finished job 0." in pipeline_run.stderr

From ab7598f2f21739c676e64321e196780a112f153a Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Fri, 1 Apr 2022 14:43:39 +0200
Subject: [PATCH 21/65] rename demo groups

---
 drop/demo/config_relative.yaml           |  8 +++---
 drop/demo/sample_annotation_relative.tsv | 34 ++++++++++++------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml
index ca9cc6c9..a4e702df 100755
--- a/drop/demo/config_relative.yaml
+++ b/drop/demo/config_relative.yaml
@@ -16,14 +16,14 @@ exportCounts:
     - v29
   excludeGroups:
     - mae
-    - import_exp
-    - fraser_ex
+    - outrider_external
+    - fraser_external
 
 aberrantExpression:
     run: true
     groups:
       - outrider
-      - import_exp
+      - outrider_external
     fpkmCutoff: 1
     implementation: autoencoder
     padjCutoff: 1
@@ -37,7 +37,7 @@ aberrantSplicing:
     run: true
     groups:
       - fraser
-      - fraser_ex
+      - fraser_external
     recount: true
     longRead: false
     keepNonStandardChrs: true
diff --git a/drop/demo/sample_annotation_relative.tsv b/drop/demo/sample_annotation_relative.tsv
index f87a4f8d..9b720f9b 100755
--- a/drop/demo/sample_annotation_relative.tsv
+++ b/drop/demo/sample_annotation_relative.tsv
@@ -1,25 +1,25 @@
 RNA_ID	RNA_BAM_FILE	DNA_VCF_FILE	DNA_ID	DROP_GROUP	PAIRED_END	COUNT_MODE	COUNT_OVERLAPS	STRAND	HPO_TERMS	GENE_COUNTS_FILE	GENE_ANNOTATION	GENOME	SPLICE_COUNTS_DIR
 HG00096.1.M_111124_6	Data/rna_bam/HG00096.1.M_111124_6_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00096	outrider,mae	TRUE	IntersectionStrict	TRUE	no	HP:0009802,HP:0010896
 HG00103.4.M_120208_3	Data/rna_bam/HG00103.4.M_120208_3_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00103	outrider,mae	TRUE	IntersectionStrict	TRUE	no	HP:0004582,HP:0031959
-HG00106.4.M_120208_5	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
-HG00111.2.M_111215_4	Data/rna_bam/HG00111.2.M_111215_4_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00111	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0100491,HP:0100871
-HG00116.2.M_120131_1	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00116	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0030613,HP:0012767
-HG00126.1.M_111124_8	Data/rna_bam/HG00126.1.M_111124_8_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00126	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0000290,HP:0000293
-HG00132.2.M_111215_4	Data/rna_bam/HG00132.2.M_111215_4_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00132	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0006489,HP:0006490
-HG00149.1.M_111124_6	Data/rna_bam/HG00149.1.M_111124_6_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00149	outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0000014,HP:0000020,HP:0032663
-HG00150.4.M_120208_7	Data/rna_bam/HG00150.4.M_120208_7_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00150	fraser_ex,outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0030809,HP:0006144
-HG00176.4.M_120208_2	Data/rna_bam/HG00176.4.M_120208_2_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00176	fraser_ex,outrider,import_exp	TRUE	IntersectionStrict	TRUE	no	HP:0005215,HP:0010234
-HG00096.1.M_111124_6_trunc	Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00096	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0009802,HP:0010896
-HG00103.4.M_120208_3_trunc	Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00103	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0004582,HP:0031959
-HG00106.4.M_120208_5_trunc	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
-HG00111.2.M_111215_4_trunc	Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00111	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0100491,HP:0100871
-HG00116.2.M_120131_1_trunc	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00116	fraser,fraser_ex	TRUE	IntersectionStrict	TRUE	no	HP:0030613,HP:0012767
+HG00106.4.M_120208_5	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
+HG00111.2.M_111215_4	Data/rna_bam/HG00111.2.M_111215_4_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00111	outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0100491,HP:0100871
+HG00116.2.M_120131_1	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00116	outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0030613,HP:0012767
+HG00126.1.M_111124_8	Data/rna_bam/HG00126.1.M_111124_8_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00126	outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0000290,HP:0000293
+HG00132.2.M_111215_4	Data/rna_bam/HG00132.2.M_111215_4_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00132	outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0006489,HP:0006490
+HG00149.1.M_111124_6	Data/rna_bam/HG00149.1.M_111124_6_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00149	outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0000014,HP:0000020,HP:0032663
+HG00150.4.M_120208_7	Data/rna_bam/HG00150.4.M_120208_7_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00150	fraser_external,outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0030809,HP:0006144
+HG00176.4.M_120208_2	Data/rna_bam/HG00176.4.M_120208_2_chr21.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00176	fraser_external,outrider,outrider_external	TRUE	IntersectionStrict	TRUE	no	HP:0005215,HP:0010234
+HG00096.1.M_111124_6_trunc	Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00096	fraser,fraser_external	TRUE	IntersectionStrict	TRUE	no	HP:0009802,HP:0010896
+HG00103.4.M_120208_3_trunc	Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00103	fraser,fraser_external	TRUE	IntersectionStrict	TRUE	no	HP:0004582,HP:0031959
+HG00106.4.M_120208_5_trunc	Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00106	fraser,fraser_external	TRUE	IntersectionStrict	TRUE	no	HP:0002895,HP:0006731
+HG00111.2.M_111215_4_trunc	Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00111	fraser,fraser_external	TRUE	IntersectionStrict	TRUE	no	HP:0100491,HP:0100871
+HG00116.2.M_120131_1_trunc	Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00116	fraser,fraser_external	TRUE	IntersectionStrict	TRUE	no	HP:0030613,HP:0012767
 HG00126.1.M_111124_8_trunc	Data/rna_bam/HG00126.1.M_111124_8_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00126	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0000290,HP:0000293
 HG00132.2.M_111215_4_trunc	Data/rna_bam/HG00132.2.M_111215_4_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00132	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0006489,HP:0006490
 HG00149.1.M_111124_6_trunc	Data/rna_bam/HG00149.1.M_111124_6_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00149	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0000014,HP:0000020,HP:0032663
 HG00150.4.M_120208_7_trunc	Data/rna_bam/HG00150.4.M_120208_7_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00150	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0030809,HP:0006144
 HG00176.4.M_120208_2_trunc	Data/rna_bam/HG00176.4.M_120208_2_chr21.bam_trunc.bam	Data/dna_vcf/demo_chr21.vcf.gz	HG00176	fraser	TRUE	IntersectionStrict	TRUE	no	HP:0005215,HP:0010234
-HG00178.4.M_120208_8				import_exp						Data/external_count_data/geneCounts.tsv.gz	v29
-HG00181.4.M_120208_4				fraser_ex,import_exp						Data/external_count_data/geneCounts.tsv.gz	v29		Data/external_count_data
-HG00191.3.M_120208_3				fraser_ex									Data/external_count_data
-HG00201.1.M_120208_6				fraser_ex									Data/external_count_data
+HG00178.4.M_120208_8				outrider_external						Data/external_count_data/geneCounts.tsv.gz	v29
+HG00181.4.M_120208_4				fraser_external,outrider_external						Data/external_count_data/geneCounts.tsv.gz	v29		Data/external_count_data
+HG00191.3.M_120208_3				fraser_external									Data/external_count_data
+HG00201.1.M_120208_6				fraser_external									Data/external_count_data

From 4b385c375a4c4f6c044ffc97ffc1ab2cf0f712de Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Fri, 1 Apr 2022 15:55:11 +0200
Subject: [PATCH 22/65] more information with external counts

---
 .../Counting/Summary.R                        | 89 +++++++++++++------
 .../Counting/Summary.R                        | 28 ++++--
 2 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index c7a8666c..e21b482f 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -32,9 +32,20 @@ suppressPackageStartupMessages({
 })
 
 ods <- readRDS(snakemake@input$ods)
+
+has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE))
+if(has_external){
+    ods@colData$isExternal <- ods@colData$GENE_COUNTS_FILE != ""
+}else{
+    ods@colData$isExternal <- FALSE
+}
+
+cnts_mtx_local <- counts(ods, normalized = F)[,!ods@colData$isExternal]
 cnts_mtx <- counts(ods, normalized = F)
 
-#' Number of samples: `r ncol(ods)`
+#' ## Number of samples:  
+#' Local (fromBam): `r sum(!ods@colData$isExternal)`  
+#' External: `r sum(ods@colData$isExternal)`  
 #' 
 #' # Count Quality Control
 #' 
@@ -44,7 +55,8 @@ bam_coverage <- fread(snakemake@input$bam_cov)
 bam_coverage[, sampleID := as.character(sampleID)]
 coverage_dt <- merge(bam_coverage,
                      data.table(sampleID = colnames(ods),
-                                read_count = colSums(cnts_mtx)),
+                                read_count = colSums(cnts_mtx),
+                                isExternal = ods@colData$isExternal),
                      by = "sampleID", sort = FALSE)
 # read count
 setorder(coverage_dt, read_count)
@@ -60,15 +72,15 @@ coverage_dt[, size_factors := sizeFactors(ods)]
 setorder(coverage_dt, size_factors)
 coverage_dt[, sf_rank := 1:.N]
 
-p_depth <- ggplot(coverage_dt, aes(count_rank, read_count)) +
-  geom_point() +
+p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count,col= isExternal )) +
+  geom_point(size = 3,show.legend = has_external) +
   theme_cowplot() +
   background_grid() +
   labs(title = "Obtained Read Counts", x="Sample Rank", y = "Reads Counted") +
   ylim(c(0,NA))
 
-p_frac <- ggplot(coverage_dt, aes(frac_rank, counted_frac)) +
-  geom_point() +
+p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac)) +
+  geom_point(size = 3,show.legend = has_external) +
   theme_cowplot() +
   background_grid() +
   labs(title = "Obtained Read Count Ratio", x = "Sample Rank", 
@@ -76,17 +88,17 @@ p_frac <- ggplot(coverage_dt, aes(frac_rank, counted_frac)) +
   ylim(c(0,NA))
 
 #+ QC, fig.height=6, fig.width=12
-plot_grid(p_depth, p_frac)
+plot_grid(p_depth, p_frac) 
 
-p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors)) +
-  geom_point() +
+p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors,col = isExternal)) +
+  geom_point(size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
   theme_cowplot() +
   background_grid() +
   labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors')
 
-p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors)) +
-  geom_point() +
+p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors,col = isExternal)) +
+  geom_point(size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
   theme_cowplot() +
   background_grid() +
@@ -97,18 +109,40 @@ p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors)) +
 plot_grid(p_sf, p_sf_cov)
 
 #' # Filtering
+#' **local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts  
+#' **all**: A pre-filtered summary of counts using the merged local (from BAM) and external counts  
+#' **passed_FPKM**: Passes the user defined FPKM cutoff in at least 5% of samples  
+#' **min_1**: more than 1 read in at least 5% of samples  
+#' **min_10**: more than 10 reads in at least 5% of samples  
+
 quant <- .95
-filter_mtx <- list(
-  all = cnts_mtx,
-  passed_FPKM = cnts_mtx[rowData(ods)$passedFilter,],
-  min_1 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
-  min_10 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
-)
-filter_dt <- lapply(names(filter_mtx), function(filter_name) {
-  mtx <- filter_mtx[[filter_name]]
-  data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name)
-}) %>% rbindlist
-filter_dt[, filter := factor(filter, levels = c('all', 'passed_FPKM', 'min_1', 'min_10'))]
+
+if(has_external){
+    filter_mtx <- list(
+      local = cnts_mtx_local,
+      all = cnts_mtx,
+      passed_FPKM = cnts_mtx[rowData(ods)$passedFilter,],
+      min_1 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
+      min_10 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
+    )
+    filter_dt <- lapply(names(filter_mtx), function(filter_name) {
+      mtx <- filter_mtx[[filter_name]]
+      data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name)
+    }) %>% rbindlist
+    filter_dt[, filter := factor(filter, levels = c('local', 'all', 'passed_FPKM', 'min_1', 'min_10'))]
+} else{
+    filter_mtx <- list(
+      all = cnts_mtx,
+      passed_FPKM = cnts_mtx[rowData(ods)$passedFilter,],
+      min_1 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
+      min_10 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
+    )
+    filter_dt <- lapply(names(filter_mtx), function(filter_name) {
+      mtx <- filter_mtx[[filter_name]]
+      data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name)
+    }) %>% rbindlist
+    filter_dt[, filter := factor(filter, levels = c('all', 'passed_FPKM', 'min_1', 'min_10'))]
+}
 
 binwidth <- .2
 p_hist <- ggplot(filter_dt, aes(x = median_counts, fill = filter)) +
@@ -136,19 +170,20 @@ p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) +
 plot_grid(p_hist, p_dens)
 
 #+ expressedGenes, fig.height=6, fig.width=8
-plotExpressedGenes(ods) +
+plotExpressedGenes(ods) + 
   theme_cowplot() +
   background_grid(major = "y")
 
 expressed_genes <- as.data.table(colData(ods))
 expressed_genes <- expressed_genes[, .(expressedGenes, unionExpressedGenes,
                                        intersectionExpressedGenes, passedFilterGenes,
-                                       expressedGenesRank)]
+                                       expressedGenesRank,isExternal)]
 
 #+echo=F
-rank_1 <- expressed_genes[expressedGenesRank == 1]
-#' **Rank 1:**
-#' `r as.character(rank_1$expressedGenes)` expressed genes
+rank_1 <- expressed_genes[,.SD[expressedGenesRank == min(expressedGenesRank)],by = isExternal]
+#' **Rank 1:**  
+#' Local Rank 1: `r as.character(rank_1[(!isExternal),expressedGenes])` expressed genes  
+#' External Rank 1: `r if(has_external){as.character(rank_1[(isExternal),expressedGenes])}else{as.character(0)}` expressed genes  
 #+echo=F
 rank_n <- expressed_genes[expressedGenesRank == .N]
 #' **Rank `r rank_n$expressedGenesRank`:**  
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index 538a9ee8..9dc0f6b0 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -7,6 +7,7 @@
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - workingDirLocal: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam/"`'
 #'  input:
 #'   - filter: '`sm cfg.getProcessedDataDir() + 
 #'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
@@ -27,17 +28,32 @@ suppressPackageStartupMessages({
 #+ input
 dataset    <- snakemake@wildcards$dataset
 workingDir <- snakemake@params$workingDir
+workingDirLocal <- snakemake@params$workingDirLocal
 
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fdsLocal <- loadFraserDataSet(dir=workingDirLocal, name=paste0("raw-", dataset))
+fdsMerge <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
 
-#' Number of samples: `r nrow(colData(fds))`
+has_external <- !(all(is.na(fds@colData$SPLICE_COUNTS_DIR)) || is.null(fds@colData$SPLICE_COUNTS_DIR))
+if(has_external){
+    fds@colData$isExternal <- !is.na(fds@colData$SPLICE_COUNTS_DIR)
+}else{
+    fds@colData$isExternal <- FALSE
+}
+
+#' ## Number of samples:   
+#' Local (fromBam): `r sum(!fds@colData$isExternal)`  
+#' External: `r sum(fds@colData$isExternal)`  
 #' 
-#' Number of introns (psi5 or psi3): `r length(rowRanges(fds, type = "psi5"))`
+#' ### Number of introns (psi5 or psi3):  
+#' Local (fromBam): `r length(rowRanges(fdsLocal, type = "psi5"))`  
+#' Merged : `r length(rowRanges(fdsMerged, type = "psi5"))`  
 #' 
-#' Number of splice sites (theta): `r length(rowRanges(fds, type = "theta"))`
+#' ### Number of splice sites (theta): 
+#' Local (fromBam): `r length(rowRanges(fdsLocal, type = "theta"))`  
+#' Merged: `r length(rowRanges(fdsMerged, type = "theta"))`  
 #' 
-#' Introns that passed filter
-table(mcols(fds, type="j")[,"passed"])
+#' Introns that passed filter (after merging)
+table(mcols(fdsMerged, type="j")[,"passed"])
 
 #' ## Expression filtering
 #' Min expression cutoff: `r snakemake@config$aberrantSplicing$minExpressionInOneSample`

From 9d68b2fb1c1db977606a969772282ec2545ff9ca Mon Sep 17 00:00:00 2001
From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Date: Fri, 1 Apr 2022 16:51:19 +0200
Subject: [PATCH 23/65] Update README.md

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index cc4e0c8d..783c25d8 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,12 @@ This shows you the rules of all subworkflows. Omit `-n` and specify the number o
 snakemake aberrantExpression --cores 10
 ```
 
+## Citation
+
+If you use DROP in research, please cite our [manuscript](https://www.nature.com/articles/s41596-020-00462-5).
+
+Furthermore, if you use the aberrant expression module, also cite [OUTRIDER](https://doi.org/10.1016/j.ajhg.2018.10.025), and if you use the aberrant splicing module, also cite [FRASER](https://www.nature.com/articles/s41467-020-20573-7).
+
 ## Datasets
 The following publicly-available datasets of gene counts can be used as controls.
 Please cite as instructed for each dataset.

From a0796063e9e67eec7acf7bee295699418ad2fcd2 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Fri, 1 Apr 2022 16:59:15 +0200
Subject: [PATCH 24/65] update with fdsMerge

---
 .../Counting/Summary.R                        | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index 9dc0f6b0..5d94e262 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -33,33 +33,33 @@ workingDirLocal <- snakemake@params$workingDirLocal
 fdsLocal <- loadFraserDataSet(dir=workingDirLocal, name=paste0("raw-", dataset))
 fdsMerge <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
 
-has_external <- !(all(is.na(fds@colData$SPLICE_COUNTS_DIR)) || is.null(fds@colData$SPLICE_COUNTS_DIR))
+has_external <- !(all(is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)) || is.null(fdsMerge@colData$SPLICE_COUNTS_DIR))
 if(has_external){
-    fds@colData$isExternal <- !is.na(fds@colData$SPLICE_COUNTS_DIR)
+    fdsMerge@colData$isExternal <- !is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)
 }else{
-    fds@colData$isExternal <- FALSE
+    fdsMerge@colData$isExternal <- FALSE
 }
 
 #' ## Number of samples:   
-#' Local (fromBam): `r sum(!fds@colData$isExternal)`  
-#' External: `r sum(fds@colData$isExternal)`  
+#' Local (fromBam): `r sum(!fdsMerge@colData$isExternal)`  
+#' External: `r sum(fdsMerge@colData$isExternal)`  
 #' 
 #' ### Number of introns (psi5 or psi3):  
 #' Local (fromBam): `r length(rowRanges(fdsLocal, type = "psi5"))`  
-#' Merged : `r length(rowRanges(fdsMerged, type = "psi5"))`  
+#' Merged : `r length(rowRanges(fdsMerge, type = "psi5"))`  
 #' 
 #' ### Number of splice sites (theta): 
 #' Local (fromBam): `r length(rowRanges(fdsLocal, type = "theta"))`  
-#' Merged: `r length(rowRanges(fdsMerged, type = "theta"))`  
+#' Merged: `r length(rowRanges(fdsMerge, type = "theta"))`  
 #' 
 #' Introns that passed filter (after merging)
-table(mcols(fdsMerged, type="j")[,"passed"])
+table(mcols(fdsMerge, type="j")[,"passed"])
 
 #' ## Expression filtering
 #' Min expression cutoff: `r snakemake@config$aberrantSplicing$minExpressionInOneSample`
-plotFilterExpression(fds) + theme_cowplot(font_size = 16)
+plotFilterExpression(fdsMerge) + theme_cowplot(font_size = 16)
 
 #' ## Variability filtering
 #' Variability cutoff: `r snakemake@config$aberrantSplicing$minDeltaPsi`
-plotFilterVariability(fds) + theme_cowplot(font_size = 16)
+plotFilterVariability(fdsMerge) + theme_cowplot(font_size = 16)
 

From f98ca7c962b1dbec1075aafefd7fa30da7e160f3 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Mon, 4 Apr 2022 11:26:29 +0200
Subject: [PATCH 25/65] change group names

---
 drop/config/submodules/AberrantSplicing.py | 5 +++++
 tests/config/test_AE.py                    | 4 ++--
 tests/config/test_AS.py                    | 2 +-
 tests/config/test_SampleAnnotation.py      | 4 ++--
 tests/pipeline/test_AE.py                  | 4 ++--
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index 28ff9296..3ae001ac 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -22,6 +22,11 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD
         
         self.rnaIDs   = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA")
         self.rnaExIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="SPLICE_COUNT")
+        for g in self.groups:
+            if len(set(self.rnaIDs[g]) & set(self.rnaExIDs[g])) > 0:
+                raise ValueError(f"{set(self.rnaIDs[g]) & set(self.extRnaIDs[g])} has both BAM and external count file \
+                please fix in sample annotation table to only have either external count or BAM processing\n")
+
         all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "SPLICE_COUNT"])
         self.checkSubset(all_ids)
 
diff --git a/tests/config/test_AE.py b/tests/config/test_AE.py
index 8643130d..ed211c24 100644
--- a/tests/config/test_AE.py
+++ b/tests/config/test_AE.py
@@ -3,7 +3,7 @@ class Test_AE_Config:
     def test_config(self, dropConfig,demo_dir):
         assert dropConfig.AE.getWorkdir() == f"{demo_dir}/Scripts/AberrantExpression/pipeline"
         dict_ = {
-            'groups': ['outrider', 'import_exp'],
+            'groups': ['outrider', 'outrider_external'],
             'fpkmCutoff': 1,
             'implementation': 'autoencoder',
             'padjCutoff': 1,
@@ -37,7 +37,7 @@ def test_getCountsFiles(self, demo_dir, dropConfig):
         counts_files_true = counts_files_true[2:]
         counts_files_true.append(f"{demo_dir}/Data/external_count_data/geneCounts.tsv.gz")
         counts_files_true.sort()
-        counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="import_exp")
+        counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="outrider_external")
         counts_files_test.sort()
         assert counts_files_true == counts_files_test
 
diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py
index e9b84cd2..1e0559ae 100644
--- a/tests/config/test_AS.py
+++ b/tests/config/test_AS.py
@@ -3,7 +3,7 @@ class Test_AS_Config:
     def test_config(self, dropConfig,demo_dir):
         assert dropConfig.AS.getWorkdir() == f"{demo_dir}/Scripts/AberrantSplicing/pipeline"
         dict_ = {
-            'groups': ['fraser', 'fraser_ex'],
+            'groups': ['fraser', 'fraser_external'],
             'recount': True,
             'longRead': False,
             'keepNonStandardChrs': True,
diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py
index 461ef7ae..311789f3 100644
--- a/tests/config/test_SampleAnnotation.py
+++ b/tests/config/test_SampleAnnotation.py
@@ -15,7 +15,7 @@ def test_mapping(self, sampleAnnotation):
         # ID mappings/groups
         assert sampleAnnotation.idMapping.shape == (24, 2)
         assert sampleAnnotation.sampleFileMapping.shape == (35, 4)
-        true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10, 'fraser_ex': 7}
+        true_mapping = {'mae': 2, 'outrider_external': 8, 'outrider': 10, 'fraser': 10, 'fraser_external': 7}
         assert true_mapping == {k: len(v) for k, v in sampleAnnotation.rnaIDs.items()}
         assert true_mapping == {k: len(v) for k, v in sampleAnnotation.dnaIDs.items()}
 
@@ -36,7 +36,7 @@ def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_
     @pytest.mark.parametrize(
         "annotation,group,files",
         [
-            ("v29", "import_exp", {'Data/external_count_data/geneCounts.tsv.gz'})
+            ("v29", "outrider_external", {'Data/external_count_data/geneCounts.tsv.gz'})
         ]
     )
     def test_import(self, demo_dir, sampleAnnotation, annotation, group, files):
diff --git a/tests/pipeline/test_AE.py b/tests/pipeline/test_AE.py
index a18b343b..2befe1e2 100644
--- a/tests/pipeline/test_AE.py
+++ b/tests/pipeline/test_AE.py
@@ -50,7 +50,7 @@ def test_results(self, demo_dir):
         assert "res: 4310 15" in r.stdout
 
     def test_import_results(self, demo_dir):
-        output_dir = "Output/processed_results/aberrant_expression/v29/outrider/import_exp"
+        output_dir = "Output/processed_results/aberrant_expression/v29/outrider/outrider_external"
         r_cmd = """
                 # ods object
                 ods <- readRDS(file.path("{}", "ods.Rds"))
@@ -70,7 +70,7 @@ def no_import(self, demo_dir):
         LOGGER.info("dryrun without import counts...")
 
         # adapt config
-        run("sed '/import_exp/d' config.yaml > config_noimp.yaml", demo_dir)
+        run("sed '/outrider_external/d' config.yaml > config_noimp.yaml", demo_dir)
 
         yield demo_dir
 

From 39b5590c76c4595be4db56c931429160c387d827 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Mon, 4 Apr 2022 18:13:13 +0200
Subject: [PATCH 26/65] comments

---
 drop/modules/mae-pipeline/MAE/ASEReadCounter.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
index e3c52df6..cff0c18e 100755
--- a/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
+++ b/drop/modules/mae-pipeline/MAE/ASEReadCounter.sh
@@ -45,12 +45,12 @@ fi
 chr_subset=$(comm -12 <(cut -f1 -d" " ${canonical} | sort -u) <(echo "${vcf_chr}"))
 chr_subset=$(comm -12 <(echo "${bam_chr}") <(echo "${chr_subset}") | uniq)
 
-# ASEReadCounter fails without RG, this snippet checks for RG in bam file
+# ASEReadCounter fails without read groups (RG), this snippet checks for RG in bam file
 # and if RG tag isn't present, lets the user know how to fix it
 if samtools view -H ${bam_file} | grep -q "@RG";then
-  printf "BAM contains RG, continuing with ASEReadCounter...\n"
+  printf "BAM contains read groups (RG), continuing with ASEReadCounter...\n"
 else
-  printf "%s\n" "" "ERROR: BAM file doesn't contain Read Group Tag" \
+  printf "%s\n" "" "ERROR: BAM file doesn't contain Read Group Tag (RG)" \
   " RG doesn't exist, it can be added using -" \
   "   gatk AddOrReplaceGroups -R /path/to/reference -I /your/input.bam -O /your/output.bam --QUIET true" \
   " https://gatk.broadinstitute.org/hc/en-us/articles/360037226472-AddOrReplaceReadGroups-Picard-" \

From f6ea59836392159a1e624d580aa16258d7fa5a39 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Tue, 5 Apr 2022 11:32:22 +0200
Subject: [PATCH 27/65] AE summary

---
 .../Counting/Summary.R                        | 34 +++++++------------
 drop/requirementsR.txt                        |  2 +-
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index e21b482f..82b5936a 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -73,14 +73,14 @@ setorder(coverage_dt, size_factors)
 coverage_dt[, sf_rank := 1:.N]
 
 p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count,col= isExternal )) +
-  geom_point(size = 3,show.legend = has_external) +
+  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
   theme_cowplot() +
   background_grid() +
   labs(title = "Obtained Read Counts", x="Sample Rank", y = "Reads Counted") +
   ylim(c(0,NA))
 
 p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac)) +
-  geom_point(size = 3,show.legend = has_external) +
+  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
   theme_cowplot() +
   background_grid() +
   labs(title = "Obtained Read Count Ratio", x = "Sample Rank", 
@@ -91,14 +91,14 @@ p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac)) +
 plot_grid(p_depth, p_frac) 
 
 p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors,col = isExternal)) +
-  geom_point(size = 3,show.legend = has_external) +
+  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
   theme_cowplot() +
   background_grid() +
   labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors')
 
 p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors,col = isExternal)) +
-  geom_point(size = 3,show.legend = has_external) +
+  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
   theme_cowplot() +
   background_grid() +
@@ -169,25 +169,17 @@ p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) +
 #+ meanCounts, fig.height=6, fig.width=12
 plot_grid(p_hist, p_dens)
 
-#+ expressedGenes, fig.height=6, fig.width=8
-plotExpressedGenes(ods) + 
-  theme_cowplot() +
-  background_grid(major = "y")
-
 expressed_genes <- as.data.table(colData(ods))
 expressed_genes <- expressed_genes[, .(expressedGenes, unionExpressedGenes,
                                        intersectionExpressedGenes, passedFilterGenes,
                                        expressedGenesRank,isExternal)]
 
-#+echo=F
-rank_1 <- expressed_genes[,.SD[expressedGenesRank == min(expressedGenesRank)],by = isExternal]
-#' **Rank 1:**  
-#' Local Rank 1: `r as.character(rank_1[(!isExternal),expressedGenes])` expressed genes  
-#' External Rank 1: `r if(has_external){as.character(rank_1[(isExternal),expressedGenes])}else{as.character(0)}` expressed genes  
-#+echo=F
-rank_n <- expressed_genes[expressedGenesRank == .N]
-#' **Rank `r rank_n$expressedGenesRank`:**  
-#' `r as.character(rank_n$expressedGenes)` expressed genes  
-#' `r as.character(rank_n$unionExpressedGenes)` expressed genes (union)  
-#' `r as.character(rank_n$intersectionExpressedGenes)` expressed genes (intersection)  
-#' `r as.character(rank_n$passedFilterGenes)` genes passed the filter
+#+ expressedGenes, fig.height=6, fig.width=8
+exp_plot <- plotExpressedGenes(ods) + 
+  theme_cowplot() +
+  background_grid(major = "y")
+external_shapes_plot <- ggplot(melt(expressed_genes,id.vars = c("expressedGenesRank","isExternal")),
+  aes(x = expressedGenesRank, y = value, col = variable, shape = isExternal)) + geom_point()
+exp_plot +external_shapes_plot
+
+DT::datatable(expressed_genes)
diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt
index 3aae9bb7..6cde622e 100644
--- a/drop/requirementsR.txt
+++ b/drop/requirementsR.txt
@@ -1,7 +1,7 @@
 package	version
 devtools
 gagneurlab/OUTRIDER	1.6.1
-c-mertes/FRASER	1.4.1
+c-mertes/FRASER	1.2.2
 gagneurlab/tMAE	1.0.4
 VariantAnnotation	
 rmarkdown	

From 41b9d21ec621b6a52a89fb3ce9cde495bf456c52 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Tue, 5 Apr 2022 15:31:14 +0200
Subject: [PATCH 28/65] Summary styling

---
 .../Counting/Summary.R                        | 54 +++++++++++--------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index 82b5936a..e153d7ef 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -72,44 +72,48 @@ coverage_dt[, size_factors := sizeFactors(ods)]
 setorder(coverage_dt, size_factors)
 coverage_dt[, sf_rank := 1:.N]
 
-p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count,col= isExternal )) +
-  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
+p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count, col= isExternal )) +
+  geom_point(size = 3,show.legend = has_external) +
   theme_cowplot() +
   background_grid() +
   labs(title = "Obtained Read Counts", x="Sample Rank", y = "Reads Counted") +
-  ylim(c(0,NA))
+  ylim(c(0,NA)) +
+  scale_color_brewer(palette="Dark2")
 
-p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac)) +
-  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
+p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac, col = isExternal)) +
+  geom_point(size = 3,show.legend = has_external) +
   theme_cowplot() +
   background_grid() +
   labs(title = "Obtained Read Count Ratio", x = "Sample Rank", 
        y = "Percent Reads Counted") +
-  ylim(c(0,NA))
+  ylim(c(0,NA)) +
+  scale_color_brewer(palette="Dark2")
 
 #+ QC, fig.height=6, fig.width=12
 plot_grid(p_depth, p_frac) 
 
-p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors,col = isExternal)) +
-  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
+p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors, col = isExternal)) +
+  geom_point(size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
   theme_cowplot() +
   background_grid() +
-  labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors')
+  labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors') +
+  scale_color_brewer(palette="Dark2")
 
-p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors,col = isExternal)) +
-  geom_point(palette = "Dark2",size = 3,show.legend = has_external) +
+p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors, col = isExternal)) +
+  geom_point(size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
   theme_cowplot() +
   background_grid() +
   labs(title = 'Size Factors vs. Read Counts',
-       x = 'Read Counts', y = 'Size Factors')
+       x = 'Read Counts', y = 'Size Factors') +
+  scale_color_brewer(palette="Dark2")
 
 #+ sizeFactors, fig.height=6, fig.width=12
 plot_grid(p_sf, p_sf_cov)
 
 #' # Filtering
-#' **all_local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts  
+#' **local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts  
 #' **all**: A pre-filtered summary of counts using only the merged local (from BAM) and external counts  
 #' **passed_FPKM**: Passes the user defined FPKM cutoff in at least 5% of genes  
 #' **min_1**: minimum of 1 read expressed in 5% of genes  
@@ -169,17 +173,21 @@ p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) +
 #+ meanCounts, fig.height=6, fig.width=12
 plot_grid(p_hist, p_dens)
 
-expressed_genes <- as.data.table(colData(ods))
-expressed_genes <- expressed_genes[, .(expressedGenes, unionExpressedGenes,
-                                       intersectionExpressedGenes, passedFilterGenes,
-                                       expressedGenesRank,isExternal)]
+exp_genes_cols <- c(`Expressed\ngenes` = "expressedGenes", 
+                    `Union of\nexpressed genes` = "unionExpressedGenes", 
+                    `Intersection of\nexpressed genes` = "intersectionExpressedGenes", 
+                    `Genes passed\nfiltering` = "passedFilterGenes", Rank = "expressedGenesRank",
+                    `Is External` = "isExternal")
+
+expressed_genes <- as.data.table(colData(ods)[,exp_genes_cols])
+colnames(expressed_genes) <- names(exp_genes_cols)
 
 #+ expressedGenes, fig.height=6, fig.width=8
-exp_plot <- plotExpressedGenes(ods) + 
+plotExpressedGenes(ods) + 
   theme_cowplot() +
-  background_grid(major = "y")
-external_shapes_plot <- ggplot(melt(expressed_genes,id.vars = c("expressedGenesRank","isExternal")),
-  aes(x = expressedGenesRank, y = value, col = variable, shape = isExternal)) + geom_point()
-exp_plot +external_shapes_plot
+  background_grid(major = "y") +
+  geom_point(data =melt(expressed_genes,id.vars = c("Rank","Is External")),
+             aes(x = Rank, y = value, col = variable, shape = `Is External`),show.legend = has_external)
 
-DT::datatable(expressed_genes)
+#' ### Expressed Genes by 
+DT::datatable(expressed_genes[order(Rank)],rownames = F)

From f76b7413305ccdfccbbba4a011567d969b1d11c1 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Tue, 5 Apr 2022 19:13:05 +0200
Subject: [PATCH 29/65] update splicing summary and comments

---
 .../Counting/03_filter_expression_FraseR.R    |  3 ++
 .../Counting/Summary.R                        | 35 ++++++++++++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index 3004a0e5..69ee5a61 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -58,6 +58,9 @@ if(length(exCountIDs) > 0){
         ctsNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta")
         ctsFiles <- paste0(dirname(resource), "/", ctsNames, "_counts.tsv.gz")
         
+        # Merging external counts restricts the junctions to only those
+        # that are present in both the counted (fromBam) junctions AND the
+        # junctions from the external counts.
         fds <- mergeExternalData(fds=fds, countFiles=ctsFiles,
                 sampleIDs=exSampleIDs, annotation=exAnno)
     }
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index 5d94e262..a1168079 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -44,16 +44,41 @@ if(has_external){
 #' Local (fromBam): `r sum(!fdsMerge@colData$isExternal)`  
 #' External: `r sum(fdsMerge@colData$isExternal)`  
 #' 
-#' ### Number of introns (psi5 or psi3):  
+#' ## Using external counts
+#' External counts introduce some complexity into the problem of counting junctions
+#' because it is ambiguous whether a missing junction was not counted (because there are no reads)
+#' or was filtered out and is not present for legal/personal sharing reasons. Therefore,
+#' after merging the local (fromBam) counts and the external counts, only the junctions that are exactly
+#' the same in both remain. As a result, it is likely that the number of junctions will decrease after a merge.
+#' 
+#' 
+#' ### Number of introns (psi5 or psi3) before filtering:  
 #' Local (fromBam): `r length(rowRanges(fdsLocal, type = "psi5"))`  
 #' Merged : `r length(rowRanges(fdsMerge, type = "psi5"))`  
 #' 
-#' ### Number of splice sites (theta): 
+#' ### Number of splice sites (theta) before filtering: 
 #' Local (fromBam): `r length(rowRanges(fdsLocal, type = "theta"))`  
 #' Merged: `r length(rowRanges(fdsMerge, type = "theta"))`  
 #' 
-#' Introns that passed filter (after merging)
-table(mcols(fdsMerge, type="j")[,"passed"])
+
+#' ## Comparison of local and external counts  
+externalCountIDs <- colData(fdsMerge)[colData(fdsMerge)[,"isExternal"],"sampleID"]
+localCountIDs <- colData(fdsMerge)[!colData(fdsMerge)[,"isExternal"],"sampleID"]
+
+cts <- K(fdsMerge,"psi5")
+ctsLocal<- cts[,localCountIDs]
+ctsExt<- cts[,externalCountIDs]
+
+rowlgmLocal <- rowMeans(log(ctsLocal + 1))
+rowlgmExt <- rowMeans(log(ctsExt + 1))
+
+dt <- data.table("Local log mean counts" = rowlgmLocal,
+                 "External log mean counts" = rowlgmExt)
+                 
+ggplot(dt,aes(x = `Local log mean counts`, y= `External log mean counts`)) +
+   geom_point() + theme_cowplot(font_size = 16) +
+   geom_abline(slope = 1, intercept =0) +
+   scale_color_brewer(palette="Dark2") 
 
 #' ## Expression filtering
 #' Min expression cutoff: `r snakemake@config$aberrantSplicing$minExpressionInOneSample`
@@ -63,3 +88,5 @@ plotFilterExpression(fdsMerge) + theme_cowplot(font_size = 16)
 #' Variability cutoff: `r snakemake@config$aberrantSplicing$minDeltaPsi`
 plotFilterVariability(fdsMerge) + theme_cowplot(font_size = 16)
 
+#' Introns that passed filter (after merging)
+table(mcols(fdsMerge, type="j")[,"passed"])
\ No newline at end of file

From ab4545b5e3859ac77ec8463f59f0387f9be5b950 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 6 Apr 2022 09:55:45 +0200
Subject: [PATCH 30/65] format summary

---
 .../aberrant-expression-pipeline/Counting/Summary.R       | 8 ++++++--
 .../modules/aberrant-splicing-pipeline/Counting/Summary.R | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index e153d7ef..8b4ec3eb 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -189,5 +189,9 @@ plotExpressedGenes(ods) +
   geom_point(data =melt(expressed_genes,id.vars = c("Rank","Is External")),
              aes(x = Rank, y = value, col = variable, shape = `Is External`),show.legend = has_external)
 
-#' ### Expressed Genes by 
-DT::datatable(expressed_genes[order(Rank)],rownames = F)
+#' ### Expressed Genes
+if(has_external){
+    DT::datatable(expressed_genes[order(Rank)],rownames = F)
+} else{
+    DT::datatable(expressed_genes[order(Rank),-"is External"],rownames = F)
+}
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index a1168079..b6b04280 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -61,7 +61,8 @@ if(has_external){
 #' Merged: `r length(rowRanges(fdsMerge, type = "theta"))`  
 #' 
 
-#' ## Comparison of local and external counts  
+#' ### Comparison of local and external counts  
+if(has_external){
 externalCountIDs <- colData(fdsMerge)[colData(fdsMerge)[,"isExternal"],"sampleID"]
 localCountIDs <- colData(fdsMerge)[!colData(fdsMerge)[,"isExternal"],"sampleID"]
 
@@ -79,6 +80,9 @@ ggplot(dt,aes(x = `Local log mean counts`, y= `External log mean counts`)) +
    geom_point() + theme_cowplot(font_size = 16) +
    geom_abline(slope = 1, intercept =0) +
    scale_color_brewer(palette="Dark2") 
+}else{
+	print("No external counts, comparison is ommitted")
+}
 
 #' ## Expression filtering
 #' Min expression cutoff: `r snakemake@config$aberrantSplicing$minExpressionInOneSample`
@@ -89,4 +93,4 @@ plotFilterExpression(fdsMerge) + theme_cowplot(font_size = 16)
 plotFilterVariability(fdsMerge) + theme_cowplot(font_size = 16)
 
 #' Introns that passed filter (after merging)
-table(mcols(fdsMerge, type="j")[,"passed"])
\ No newline at end of file
+table(mcols(fdsMerge, type="j")[,"passed"])

From d83ec496f72a9cbf2aa622cb9e2ba136c9d58b34 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 6 Apr 2022 13:22:26 +0200
Subject: [PATCH 31/65] external counts documentation

---
 docs/source/prepare.rst | 170 ++++++++++++++++++++++++++++------------
 1 file changed, 119 insertions(+), 51 deletions(-)

diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
index 47c637e9..4a1a7ec7 100644
--- a/docs/source/prepare.rst
+++ b/docs/source/prepare.rst
@@ -34,6 +34,7 @@ When providing a path to a file or directory, please provide the *full system pa
 
 Global parameters
 +++++++++++++++++
+These parameters are applied to multiple modules and as a result should be consistent throughout the data you are analyzing.
 
 ===================  ==========  =======================================================================================================================================  ======
 Parameter            Type        Description                                                                                                                              Default/Examples
@@ -43,9 +44,9 @@ htmlOutputPath       character   Full path of the folder where the HTML files ar
 indexWithFolderName  boolean     If true, the basename of the project directory will be used as prefix for the index.html file                                            ``true``
 genomeAssembly       character   Either hg19/hs37d5 or hg38/GRCh38, depending on the genome assembly used for mapping                                                     ``/data/project1``
 sampleAnnotation     character   Full path of the sample annotation table                                                                                                 ``/data/project1/sample_annotation.tsv``
-root                 character   Full path of the folder where the subdirectories processed_data and processed_results will be created containing DROP's output files.    ``/data/project1``
+root                 character   Full path of the folder where the sub-directories processed_data and processed_results will be created containing DROP's output files.    ``/data/project1``
 genome               character   Full path of a human reference genome fasta file                                                                                         ``/path/to/hg19.fa``
-genome               dictionary  (Optional) Multiple fasta files can be specified when RNA-seq BAM files belong to different genome assemblies (eg, ncbi, ucsc).          ``ncbi: /path/to/hg19_ncbi.fa``
+genome               dictionary  (Optional) Multiple fasta files can be specified when RNA-seq BAM files belong to different genome assemblies (eg, ncbi, ucsc).          ``ncbi: /path/to/hg19_ncbi.fa``
 
                                                                                                                                                                           ``ucsc: /path/to/hg19_ucsc.fa``
 geneAnnotation       dictionary  A key-value list of the annotation name (key) and the full path to the GTF file (value). More than one annotation file can be provided.  ``anno1: /path/to/gtf1.gtf``
@@ -61,6 +62,10 @@ tools                dictionary  A key-value list of different commands (key) an
 
 Export counts dictionary
 ++++++++++++++++++++++++
+These parameters are directly used by the ``exportCounts`` snakemake command. This section
+is used to designate which aberrant expression and aberrant splicing groups should be exported
+into datasets that can be shared. To avoid sharing sensitive data, only the canonical annotations
+as described by `geneAnnotations` are exported. All groups are exported except those listed in `excludeGroups`.
 
 ===============  ====  ==========================================================================================================================  ======
 Parameter        Type  Description                                                                                                                 Default/Examples
@@ -72,6 +77,8 @@ excludeGroups    list  aberrant expression and aberrant splicing groups whose co
 
 Aberrant expression dictionary
 ++++++++++++++++++++++++++++++
+These parameters are directly used by the ``aberrantExpression`` snakemake command. Aberrant expression groups must have at least ``10``
+samples per group. To use external counts please see the ``Using External Counts`` section.
 
 ============================  =========  =================================================================================================================================  ======
 Parameter                     Type       Description                                                                                                                        Default/Examples
@@ -90,6 +97,8 @@ maxTestedDimensionProportion  numeric    An integer that controls the maximum va
 
 Aberrant splicing dictionary
 ++++++++++++++++++++++++++++
+These parameters are directly used by the ``aberrantSplicing`` snakemake command. Aberrant splicing groups must have at least ``10``
+samples per group. To use external counts please see the ``Using External Counts`` section.
 
 ============================  =========  ============================================================================================  ======
 Parameter                     Type       Description                                                                                   Default/Examples
@@ -110,8 +119,10 @@ maxTestedDimensionProportion  numeric    Same as in aberrant expression.
 ============================  =========  ============================================================================================  ======
 
 
-Mono-allelic expression dictionary
+Mono-allelic expression (MAE) dictionary
 ++++++++++++++++++++++++++++++++++
+These parameters are directly used by the ``mae`` snakemake command. MAE groups are not bound by a minimum number of samples,
+but require additional information in the sample annotation table.
 
 =====================  =========  ========================================================================================================================  ======
 Parameter              Type       Description                                                                                                               Default/Examples
@@ -136,7 +147,6 @@ For example, if the AberrantExpression module is set to false, the  ``Scripts/Ab
 
 Creating the sample annotation table
 ------------------------------------
-
 For a detailed explanation of the columns of the sample annotation, please refer to
 Box 3 of the `DROP manuscript <https://rdcu.be/cdMmF>`_. 
 
@@ -144,78 +154,106 @@ Each row of the sample annotation table corresponds to a unique pair of RNA and
 samples derived from the same individual. An RNA assay can belong to one or more DNA
 assays, and vice-versa. If so, they must be specified in different rows. The required
 columns are ``RNA_ID``, ``RNA_BAM_FILE`` and ``DROP_GROUP``, plus other module-specific
-ones (see DROP manuscript). 
+ones (see DROP manuscript).
 
 The following columns describe the RNA-seq experimental setup:
 ``PAIRED_END``, ``STRAND``, ``COUNT_MODE`` and ``COUNT_OVERLAPS``. They affect the 
 counting procedures of the aberrant expression and splicing modules. For a detailed 
 explanation, refer to the documentation of `HTSeq <https://htseq.readthedocs.io/en/latest/>`_.
 
-To run the MAE module, the columns ``DNA_ID`` and ``DNA_VCF_FILE`` are needed.
-
-In case external counts are included, add a new row for each sample from those 
-files (or a subset if not all samples are needed). Add the columns: ``GENE_COUNTS_FILE``,
-``GENE_ANNOTATON``, ``SPLIT_COUNTS_FILE`` and ``NON_SPLIT_COUNTS_FILE``. See examples below.
+To run the MAE module, the columns ``DNA_ID`` and ``DNA_VCF_FILE`` are needed. MAE cannot be run
+on samples using external counts, as we need to use the ``RNA_BAM_FILE`` to count reads supporting
+each allele of the heterozygous variants found in the ``DNA_VCF_FILE``.
 
 In case RNA-seq BAM files belong to different genome assemblies (eg, ncbi, ucsc), multiple
 reference genome fasta files can be specified. Add a column called `GENOME` that  
 contains, for each sample, the key from the `genome` parameter in the config file that
 matches its genome assembly (eg, ncbi or ucsc).
 
-
 The sample annotation file must be saved in the tab-separated values (tsv) format. The 
 column order does not matter. Also, it does not matter where it is stored, as the path is 
 specified in the config file. Here we provide some examples on how to deal with certain
 situations. For simplicity, we do not include all possible columns in the examples.
 
-Example of RNA replicates 
-++++++++++++++++++++++++++++++++++
 
-======  ======  ==========  ===================  ==
-RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE         DNA_VCF_FILE
-======  ======  ==========  ===================  ==
-S10R_B  S10G    BLOOD       /path/to/S10R_B.BAM  /path/to/S10G.vcf.gz
-S10R_M  S10G    MUSCLE      /path/to/S10R_M.BAM  /path/to/S10G.vcf.gz
-======  ======  ==========  ===================  ==
-
-Example of DNA replicates 
+Using External Counts
 ++++++++++++++++++++++++++++++++++
+DROP can utilize external counts for the ``aberrantExpression`` and ``aberrantSplicing`` modules
+which can enhance the statistical power of these modules by providing more samples from which we 
+can build a distribution of counts and detect outliers. However this process introduces some
+particular issues that need to be addressed to make sure it is a valuable addition to the experiment.
 
-======  ======  ==========  =================  ==
-RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE       DNA_VCF_FILE
-======  ======  ==========  =================  ==
-S20R    S20E    WES         /path/to/S20R.BAM  /path/to/S20E.vcf.gz
-S20R    S20G    WGS         /path/to/S20R.BAM  /path/to/S20G.vcf.gz
-======  ======  ==========  =================  ==
-
-Example of a multi-sample vcf file
-++++++++++++++++++++++++++++++++++
-
-======  ======  ==========  =================  ==
-RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE       DNA_VCF_FILE
-======  ======  ==========  =================  ==
-S10R    S10G    WGS         /path/to/S10R.BAM  /path/to/multi_sample.vcf.gz
-S20R    S20G    WGS         /path/to/S20R.BAM  /path/to/multi_sample.vcf.gz
-======  ======  ==========  =================  ==
-
-External count matrices
+In case external counts are included, add a new row for each sample from those 
+files (or a subset if not all samples are needed). Add the columns: ``GENE_COUNTS_FILE``
+(for aberrant expression), ``GENE_ANNOTATION``, and ``SPLICE_COUNTS_DIR`` (for aberrant splicing).
+These columns should remain empty for samples processed locally (from ``RNA_BAM_FILE``).
+
+### Aberrant Expression
+Using external counts for aberrant expression forces you to use the exact same gene annotation for each
+external sample as well as using the same gene annotation file specified in the config file
+``Global parameters`` section. This is to avoid potential mismatches in counting: two different gene
+annotations could drastically affect which reads are counted in which region, skewing the results.
+
+The user must also use special consideration when building the sample annotation table. Samples
+using external counts need only ``RNA_ID`` (which must exactly match the column header in the external count file),
+``DROP_GROUP``, ``GENE_COUNTS_FILE``, and ``GENE_ANNOTATION`` (which must contain the exact key specified in the config).
+The other columns should remain empty. 
+
+Using ``exportCounts`` generates the sharable ``GENE_COUNTS_FILE`` file in the appropriate
+``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory.
+
+### Aberrant Splicing
+Using external counts for aberrant splicing reduces the number of introns processed to only those
+that are exactly the same between the local and external junctions. Because rare junctions may be
+personally identifiable, the ``exportCounts`` command only exports regions canonically mentioned in the GTF file.
+As a result, when merging the external counts with the local counts we only match introns that are **exact** between
+the two sets. This ensures that if a region is missing we don't introduce 0 counts into the distribution calculations.
+
+The user must also use special consideration when building the sample annotation table. Samples
+using external counts need only ``RNA_ID`` (which must exactly match the column header in the external count file),
+``DROP_GROUP``, and ``SPLICE_COUNTS_DIR``. ``SPLICE_COUNTS_DIR`` is the directory containing the set of 5 needed count files.
+The other columns should remain empty. 
+
+Using ``exportCounts`` generates the necessary files in the appropriate
+``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory.
+
+``SPLICE_COUNTS_DIR`` should contain the following:  
+- k_j_counts.tsv.gz  
+- k_theta_counts.tsv.gz  
+- n_psi3_counts.tsv.gz  
+- n_psi5_counts.tsv.gz  
+- n_theta_counts.tsv.gz  
+
+### Publicly available DROP external counts
+You can find different sets of publicly available external counts to add to your
+analysis on our `GitHub page <https://github.com/gagneurlab/drop/#datasets>`_.
+
+If you want to contribute with your own count matrices, please contact us: yepez at in.tum.de (yepez@in.tum.de)
+
+External count examples
 +++++++++++++++++++++++
 
 In case counts from external matrices are to be integrated into the analysis,
-the file must be specified in the GENE_COUNTS_FILE column. A new row must be
-added for each sample from the count matrix that should be included in the 
-analysis. An RNA_BAM_FILE must not be specified. The DROP_GROUP of the local
+the sample annotation must be built in a particular way.
+A new row must be added for each sample from the count matrix that should be included in the 
+analysis. The ``RNA_ID`` must match the column header of the external files,
+the ``RNA_BAM_FILE`` must not be specified. The ``DROP_GROUP`` of the local
 and external samples that are to be analyzed together must be the same.
-Similarly, the GENE_ANNOTATION of the external counts and the key of the `geneAnnotation`
+For aberrant expression, the GENE_ANNOTATION of the external counts and the key of the `geneAnnotation`
 parameter from the config file must match.
 
-======  ======  ==========  =================  ==============================  ==
-RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE       GENE_COUNTS_FILE                GENE_ANNOTATION
-======  ======  ==========  =================  ==============================  ==
-S10R    S10G    BLOOD       /path/to/S10R.BAM  
-EXT-1R          BLOOD                          /path/to/externalCounts.tsv.gz  gencode34
-EXT-2R          BLOOD                          /path/to/externalCounts.tsv.gz  gencode34
-======  ======  ==========  =================  ==============================  ==
+This example will use the ``DROP_GROUP`` BLOOD_AE for the aberrant expression module (containing S10R, EXT-1R, EXT-2R) and
+the ``DROP_GROUP`` BLOOD_AS for the aberrant splicing module (containing S10R, EXT-3R, EXT-4R).
+
+======  ======  ==========  =================  ==============================  =============== ========================
+RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE       GENE_COUNTS_FILE                GENE_ANNOTATION SPLICE_COUNTS_DIR
+======  ======  ==========  =================  ==============================  =============== ========================
+S10R    S10G    BLOOD_AE    /path/to/S10R.BAM  
+EXT-1R          BLOOD_AE                       /path/to/externalCounts.tsv.gz  gencode34
+EXT-2R          BLOOD_AE                       /path/to/externalCounts.tsv.gz  gencode34
+EXT-3R          BLOOD_AS                                                                       /path/to/externalCountDir 
+EXT-4R          BLOOD_AS                                                                       /path/to/externalCountDir 
+======  ======  ==========  =================  ==============================  =============== ========================
 
 .. _filesdownload:
 
@@ -236,8 +274,39 @@ Download it and indicate the full path to it in the ``hpoFile`` key.
 The file is only needed in case HPO terms are specified in the sample annotation.
 Otherwise, write ``null`` in the ``hpoFile`` key.
 
+
 .. _advancedoptions:
 
+Example of RNA replicates 
+++++++++++++++++++++++++++++++++++
+
+======  ======  ==========  ===================  ==
+RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE         DNA_VCF_FILE
+======  ======  ==========  ===================  ==
+S10R_B  S10G    BLOOD       /path/to/S10R_B.BAM  /path/to/S10G.vcf.gz
+S10R_M  S10G    MUSCLE      /path/to/S10R_M.BAM  /path/to/S10G.vcf.gz
+======  ======  ==========  ===================  ==
+
+Example of DNA replicates 
+++++++++++++++++++++++++++++++++++
+
+======  ======  ==========  =================  ==
+RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE       DNA_VCF_FILE
+======  ======  ==========  =================  ==
+S20R    S20E    WES         /path/to/S20R.BAM  /path/to/S20E.vcf.gz
+S20R    S20G    WGS         /path/to/S20R.BAM  /path/to/S20G.vcf.gz
+======  ======  ==========  =================  ==
+
+Example of a multi-sample vcf file
+++++++++++++++++++++++++++++++++++
+
+======  ======  ==========  =================  ==
+RNA_ID  DNA_ID  DROP_GROUP  RNA_BAM_FILE       DNA_VCF_FILE
+======  ======  ==========  =================  ==
+S10R    S10G    WGS         /path/to/S10R.BAM  /path/to/multi_sample.vcf.gz
+S20R    S20G    WGS         /path/to/S20R.BAM  /path/to/multi_sample.vcf.gz
+======  ======  ==========  =================  ==
+
 Advanced options
 ----------------
 
@@ -272,4 +341,3 @@ In additon, DROP allows that BAM files from RNA-seq were aligned to one genome
 assembly (eg ucsc) and the corresponding VCF files from DNA sequencing to another
 genome assembly (eg ncbi). If so, the assembly of the reference genome fasta file
 must correspond to the one of the BAM file from RNA-seq.
-

From 6505a657a573d95665a5c7daf710684901d785da Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 6 Apr 2022 17:26:16 +0200
Subject: [PATCH 32/65] documentation and updating

---
 docs/source/index.rst                         |  3 +-
 docs/source/output.rst                        | 73 +++++++++++++++++++
 .../Counting/Summary.R                        |  2 +-
 .../OUTRIDER/results.R                        |  3 +-
 .../Counting/Summary.R                        |  2 +-
 .../FRASER/Summary.R                          |  9 ++-
 6 files changed, 86 insertions(+), 6 deletions(-)
 create mode 100644 docs/source/output.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index f80b1562..c444d231 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,6 +13,7 @@ Then, DROP can be executed in multiple ways (:doc:`pipeline`).
    installation
    prepare
    pipeline
+   output
    license
    help
 
@@ -24,7 +25,7 @@ We recommend using a dedicated conda environment. (installation time: ~ 10min)
 
 .. code-block:: bash
 
-    mamba install -c conda-forge -c bioconda drop
+    mamba create -n drop -c conda-forge -c bioconda drop
 
 Test installation with demo project
 
diff --git a/docs/source/output.rst b/docs/source/output.rst
new file mode 100644
index 00000000..d201476d
--- /dev/null
+++ b/docs/source/output.rst
@@ -0,0 +1,73 @@
+Results and Output of DROP
+===========================
+
+DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression,
+aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide
+easy to read and interpret html files and output files. This section is dedicated to explaining the relevant
+results files. We will use the results of the ``demo`` to explain the files generated.
+
+```
+#install drop
+mamba create -n drop_env -c conda-forge -c bioconda drop
+conda activate drop_env
+
+mkdir drop_demo
+cd drop_demo
+drop demo
+
+snakemake -c1
+```
+
+## Aberrant Expression
+
+### html file
+Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantExpression`` 
+tab at the top of the screen. Following that the Overview tab contains links to the:  
+- Counting Summaries 
+    - For each aberrant expression group
+        - split of local vs external sample counts
+        - QC relating to reads and size factors for each sample
+        - histograms relating to mean count distribution with different conditions
+        - information about the expressed genes within each sample and as a dataset
+- Outrider Summaries
+    - For each aberrant expression group
+        - the number of aberrantly expressed gene per sample
+        - how batch correction is done and the resulting lack of batch effects
+        - which samples contain outliers
+        - results table
+- Files
+    - OUTRIDER files for each aberrant expression group
+        - For each of these files you can follow the `OUTRIDER vignette for analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
+    - results.tsv files
+        - For each aberrant expression group
+            - a tsv file that contains the sampleID, hgnc gene symbol, pvalue and adjusted pvalue, and subsequent analysis points
+                - this tsv file contains only the genes and samples that meet the cutoffs defined in the ``config.yaml``
+                for ``padjCutoff`` and ``zScoreCutoff``
+
+## Aberrant Splicing
+### html file
+Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantSplicing`` 
+tab at the top of the screen. Following that the Overview tab contains links to the:  
+- Counting Summaries 
+    - For each aberrant splicing group
+        - split of local (from internal BAM files) vs external sample counts
+        - split of local vs merged with external sample splicing/intron counts
+        - comparison of local and external log mean counts
+        - histograms relating to junction expression before and after filtering and variability
+- FRASER Summaries
+    - For each aberrant splicing group
+        - the number of samples, introns, and splice sites 
+        - how batch correction is done and the resulting lack of batch effects
+        - result table
+- Files
+    - FRASER files for each aberrant splicing group
+        - For each of these files you can follow the `FRASER vignette for analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
+    - results.tsv files
+        - For each aberrant splicing group
+            - results.tsv 
+                - this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
+            - results_per_junction.tsv 
+                - this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the junction level. 
+
+
+## Mono-allelic Expression TODO
\ No newline at end of file
diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index 8b4ec3eb..1bd807e6 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -173,6 +173,7 @@ p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) +
 #+ meanCounts, fig.height=6, fig.width=12
 plot_grid(p_hist, p_dens)
 
+#' ### Expressed Genes
 exp_genes_cols <- c(`Expressed\ngenes` = "expressedGenes", 
                     `Union of\nexpressed genes` = "unionExpressedGenes", 
                     `Intersection of\nexpressed genes` = "intersectionExpressedGenes", 
@@ -189,7 +190,6 @@ plotExpressedGenes(ods) +
   geom_point(data =melt(expressed_genes,id.vars = c("Rank","Is External")),
              aes(x = Rank, y = value, col = variable, shape = `Is External`),show.legend = has_external)
 
-#' ### Expressed Genes
 if(has_external){
     DT::datatable(expressed_genes[order(Rank)],rownames = F)
 } else{
diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R
index b0da16ff..c76b252a 100644
--- a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R
+++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R
@@ -31,7 +31,8 @@ suppressPackageStartupMessages({
 })
 
 ods <- readRDS(snakemake@input$ods)
-res <- results(ods, all = TRUE)
+res <- results(ods, padjCutoff = snakemake@params$padjCutoff,
+			   zScoreCutoff = snakemake@params$zScoreCutoff, all = TRUE)
 
 # Add fold change
 res[, foldChange := round(2^l2fc, 2)]
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index b6b04280..3a6250e2 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -44,7 +44,7 @@ if(has_external){
 #' Local (fromBam): `r sum(!fdsMerge@colData$isExternal)`  
 #' External: `r sum(fdsMerge@colData$isExternal)`  
 #' 
-#' ## Using external counts
+#' ### Using external counts
 #' External counts introduce some complexity into the problem of counting junctions
 #' because it is ambiguous whether or not a junction is not counted (because there are no reads)
 #' compared to filtered and not present due to legal/personal sharing reasons. As a result,
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
index 31336fe9..d392b134 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
@@ -29,6 +29,10 @@ suppressPackageStartupMessages({
 #+ input
 dataset    <- snakemake@wildcards$dataset
 annotation <- snakemake@wildcards$annotation
+padj_cutoff <- snakemake@config$aberrantSplicing$padjCutoff
+zScore_cutoff <- snakemake@config$aberrantSplicing$zScoreCutoff
+deltaPsi_cutoff <- snakemake@config$aberrantSplicing$deltaPsiCutoff
+
 
 fds <- loadFraserDataSet(file=snakemake@input$fdsin)
 
@@ -53,11 +57,12 @@ for(type in psiTypes){
 }
 
 #' ## Aberrantly spliced genes per sample
-plotAberrantPerSample(fds, aggregate=TRUE, main=dataset_title) + 
+plotAberrantPerSample(fds, padjCutoff = padj_cutoff, zScoreCutoff = zScore_cutoff, deltaPsiCutoff = deltaPsi_cutoff,
+                      aggregate=TRUE, main=dataset_title) + 
   theme_cowplot(font_size = 16) +
   theme(legend.position = "top")
 
-#' ## Batch Correlation: Samples x samples
+#' ## Batch Correlation: samples x samples
 topN <- 30000
 topJ <- 10000
 for(type in psiTypes){

From 83ca5611a1aeb1034c8c315a2bcd7a998d9c4765 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 12:16:29 +0200
Subject: [PATCH 33/65] update MAE summary and results

---
 docs/source/output.rst                        | 20 +++++++++++++++---
 drop/modules/mae-pipeline/MAE/Results.R       | 11 ++++++++--
 .../Scripts/MonoallelicExpression/Overview.R  | 21 ++++++++++++++++---
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index d201476d..a95c567c 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -23,6 +23,7 @@ snakemake -c1
 ### html file
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantExpression`` 
 tab at the top of the screen. Following that the Overview tab contains links to the:  
+
 - Counting Summaries 
     - For each aberrant expression group
         - split of local vs external sample counts
@@ -37,7 +38,7 @@ tab at the top of the screen. Following that the Overview tab contains links to
         - results table
 - Files
     - OUTRIDER files for each aberrant expression group
-        - For each of these files you can follow the `OUTRIDER vignette for analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
+        - For each of these files you can follow the `OUTRIDER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
     - results.tsv files
         - For each aberrant expression group
             - a tsv file that contains the sampleID, hgnc gene symbol, pvalue and adjusted pvalue, and subsequent analysis points
@@ -61,7 +62,7 @@ tab at the top of the screen. Following that the Overview tab contains links to
         - result table
 - Files
     - FRASER files for each aberrant splicing group
-        - For each of these files you can follow the `FRASER vignette for analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
+        - For each of these files you can follow the `FRASER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
     - results.tsv files
         - For each aberrant splicing group
             - results.tsv 
@@ -70,4 +71,17 @@ tab at the top of the screen. Following that the Overview tab contains links to
                 - this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the junction level. 
 
 
-## Mono-allelic Expression TODO
\ No newline at end of file
+## Mono-allelic Expression
+Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``MonoallelicExpression`` 
+tab at the top of the screen. Following that the Overview tab contains links to the:  
+- Results
+    - For each mae group
+        - the number of samples, unique genes, and aberrant events
+        - a cascade plot that shows additional filters
+            - MAE for REF: the monoallelic expression favors the reference allele 
+            - MAE for ALT: the monoallelic expression favors the alternative allele 
+            - rare: if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
+        - histogram of inner cohort frequency
+        - summary of cascade plots and results table
+        
+        
\ No newline at end of file
diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R
index d120388d..51527afe 100644
--- a/drop/modules/mae-pipeline/MAE/Results.R
+++ b/drop/modules/mae-pipeline/MAE/Results.R
@@ -36,6 +36,7 @@ suppressPackageStartupMessages({
   library(GenomicRanges)
   library(SummarizedExperiment)
   library(R.utils)
+  library(dplyr)
 })
 
 # Read all MAE results files
@@ -119,6 +120,7 @@ fwrite(res[MAE_ALT == TRUE & rare == TRUE], snakemake@output$res_signif_rare,
 
 # Add columns for plot
 res[, N := .N, by = ID]
+res[,c("N_MAE","N_MAE_REF","N_MAE_ALT","N_MAE_REF_RARE","N_MAE_ALT_RARE") := 0,by = ID]
 res[MAE == TRUE, N_MAE := .N, by = ID]
 res[MAE == TRUE & MAE_ALT == FALSE, N_MAE_REF := .N, by = ID]
 res[MAE_ALT == TRUE, N_MAE_ALT := .N, by = ID]
@@ -126,6 +128,11 @@ res[MAE == TRUE & MAE_ALT == FALSE & rare == TRUE, N_MAE_REF_RARE := .N, by = ID
 res[MAE_ALT == TRUE & rare == TRUE, N_MAE_ALT_RARE := .N, by = ID]
 
 rd <- unique(res[,.(ID, N, N_MAE, N_MAE_REF, N_MAE_ALT, N_MAE_REF_RARE, N_MAE_ALT_RARE)])
+
+# rd contains duplicate entries for each ID. IE when MAE==F N_MAE for ID1 is both .N and 0
+# summarize these duplicates by taking the maximum of each column for each ID
+rd <- rd %>% group_by(ID) %>% summarize_all(max) %>% as.data.table()
+
 melt_dt <- melt(rd, id.vars = 'ID')
 melt_dt[variable == 'N', variable := '>10 counts']
 melt_dt[variable == 'N_MAE', variable := '+MAE']
@@ -137,8 +144,8 @@ melt_dt[variable == 'N_MAE_ALT_RARE', variable := '+MAE for ALT\n& rare']
 #' 
 #' ## Cascade plot 
 ggplot(melt_dt, aes(variable, value)) + geom_boxplot() +
-  scale_y_log10() + theme_bw(base_size = 14) +
-  labs(y = 'Heterozygous SNVs per patient', x = '') +
+  scale_y_log10(limits = c(1,NA)) + theme_bw(base_size = 14) +
+  labs(y = 'Heterozygous SNVs per patient', x = '') + 
     annotation_logticks(sides = "l")
 
 #'
diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 5720c6a8..9a51d310 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -9,6 +9,7 @@
 #'    - datasets: '`sm cfg.MAE.groups`'
 #'    - qc_groups: '`sm cfg.MAE.qcGroups`'
 #'    - htmlDir: '`sm config["htmlOutputPath"] + "/MonoallelicExpression"`'
+#'    - resultsDir: '`sm cfg.getProcessedResultsDir() + "/mae"`'
 #'  input:
 #'    - functions: '`sm cfg.workDir / "Scripts/html_functions.R"`'
 #'    - allelic_counts: '`sm expand(cfg.getProcessedDataDir() +
@@ -24,7 +25,7 @@
 #'                  "dna_rna_qc_matrix.Rds", qc_group=cfg.MAE.qcGroups)`'
 #' output:
 #'   html_document:
-#'    code_folding: hide
+#'    code_folding: show
 #'    code_download: TRUE
 #'---
 
@@ -37,6 +38,7 @@ source(snakemake@input$functions)
 datasets <- sort(snakemake@params$datasets)
 annotations <- snakemake@params$annotations
 htmlDir <- snakemake@params$htmlDir
+resultsDir <- snakemake@params$resultsDir
 
 results_links <- sapply(
   annotations, function(v) build_link_list(
@@ -45,6 +47,13 @@ results_links <- sapply(
   )
 )
 
+table_links <- sapply(
+  annotations, function(v) build_link_list(
+    file_paths = file.path(resultsDir, paste0(datasets, '/MAE_results_', v, '.tsv')),
+    captions = paste0(datasets)
+  )
+)
+
 #'
 #' **Datasets:** `r paste(datasets, collapse = ', ')`
 #'
@@ -55,9 +64,13 @@ results_links <- sapply(
 #'
 #' ## Files
 #' * [Allelic counts](`r file.path(snakemake@config$root, 'processed_data/mae/allelic_counts/')`)
-#' * [Results tables of each sample](`r file.path(snakemake@config$root, 'processed_results/mae/samples/')`)
-#' * [Aggregated results tables of each group](`r paste('* ', snakemake@input$results_tables, collapse = '\n')`)
+#' * [Results data tables of each sample (.Rds)](`r file.path(snakemake@config$root, 'processed_results/mae/samples/')`)  
+
 #'
+#' `r display_text(caption = 'Significant MAE results tables ', links = table_links)`
+
+
+#+ eval=TRUE, echo=TRUE
 #' ## Analyze Individual Results
 # Read the first results table
 res_sample <- readRDS(snakemake@input$results_obj[[1]])
@@ -73,9 +86,11 @@ if(is.null(res_sample$rare)){
   g2 <- plotAllelicCounts(res_sample, rare_column = 'rare')
 }
 
+#+echo=F
 #' ### MA plot: fold change vs RNA coverage
 g1
 
+#+echo=F
 #' ### Alternative vs Reference plot
 g2
 

From cd5f4878802509ee47c0cfaa1f83b3edf969f91b Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 12:42:25 +0200
Subject: [PATCH 34/65] format overview

---
 drop/template/Scripts/MonoallelicExpression/Overview.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 9a51d310..32221a2f 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -98,7 +98,7 @@ g2
 #+ eval=TRUE, echo=FALSE
 qc_groups <- sort(snakemake@params$qc_groups)
 qc_links <- build_link_list(
-    file_paths = file.path(htmlDir, paste0('QC', qc_groups, '.html')),
+    file_paths = file.path(htmlDir, paste0('QC/', qc_groups, '.html')),
     captions = qc_groups
 )
 

From 16ef35c323e4b72e18319d21ce68575d56befd87 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 12:49:15 +0200
Subject: [PATCH 35/65] Overview code block

---
 drop/template/Scripts/MonoallelicExpression/Overview.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 32221a2f..328738c6 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -86,12 +86,12 @@ if(is.null(res_sample$rare)){
   g2 <- plotAllelicCounts(res_sample, rare_column = 'rare')
 }
 
-#+echo=F
 #' ### MA plot: fold change vs RNA coverage
+#+echo=F
 g1
 
-#+echo=F
 #' ### Alternative vs Reference plot
+#+echo=F
 g2
 
 #' ## Quality Control: VCF-BAM Matching

From 82870aca6eb24032a4d02f11ad4da8de316e2226 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 13:24:37 +0200
Subject: [PATCH 36/65] update QC matching

---
 docs/source/output.rst                        | 14 ++++++--
 .../mae-pipeline/QC/DNA_RNA_matrix_plot.R     |  4 +--
 .../Scripts/MonoallelicExpression/Overview.R  | 34 +++++++++----------
 3 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index a95c567c..e574c5ec 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -83,5 +83,15 @@ tab at the top of the screen. Following that the Overview tab contains links to
             - rare: if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
         - histogram of inner cohort frequency
         - summary of cascade plots and results table
-        
-        
\ No newline at end of file
+- Files
+    - Allelic counts
+        - a directory containing the allelic counts of heterozygous variants
+    - Results data tables of each sample (.Rds)
+        - Rds objects containing the full results table regardless of MAE status
+    - Significant MAE results tables
+        - For each mae group
+            - a link to the results tsv file. Only contains MAE results for the alternative allele
+- Quality Control
+    - QC Overview
+        - For each mae group QC checks for DNA/RNA matching
+            
\ No newline at end of file
diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
index ec2df3db..d6624213 100644
--- a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
+++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
@@ -33,8 +33,6 @@ identityCutoff <- .85
 
 ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.05, center = .025) + 
   theme_bw(base_size = 14) + 
-  labs(x = 'Proportion of matching DNA-RNA variants', y = 'DNA-RNA combinations') + 
-  scale_y_log10() + annotation_logticks(sides = "l") + 
   expand_limits(x=c(0,1)) +
   geom_vline(xintercept=identityCutoff, linetype='dashed', color = 'firebrick')
 
@@ -42,7 +40,7 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.
 
 #' Number of samples: `r nrow(qc_mat)`
 #' 
-#' Number of samples that match with another: `r length(qc_mat[qc_mat > identityCutoff])`
+#' Number of samples that match RNA and DNA: `r length(qc_mat[qc_mat > identityCutoff])`
 #'
 #' Median of matching samples value: `r round(median(qc_mat[qc_mat > identityCutoff]), 2)`
 #'
diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 328738c6..030ca7d2 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -70,6 +70,23 @@ table_links <- sapply(
 #' `r display_text(caption = 'Significant MAE results tables ', links = table_links)`
 
 
+#' ## Quality Control: VCF-BAM Matching
+#+ eval=TRUE, echo=FALSE
+qc_groups <- sort(snakemake@params$qc_groups)
+qc_links <- build_link_list(
+    file_paths = file.path(htmlDir, paste0('QC/', qc_groups, '.html')),
+    captions = qc_groups
+)
+
+qc_matrix_links <- build_link_list(
+    file_paths = file.path(snakemake@input$qc_matrix),
+    captions = qc_groups
+)
+
+#' `r display_text(caption = 'QC Overview ', links = qc_links)`
+#' `r display_text(caption = 'DNA-RNA matrix ', links = qc_matrix_links)`
+#'
+
 #+ eval=TRUE, echo=TRUE
 #' ## Analyze Individual Results
 # Read the first results table
@@ -93,20 +110,3 @@ g1
 #' ### Alternative vs Reference plot
 #+echo=F
 g2
-
-#' ## Quality Control: VCF-BAM Matching
-#+ eval=TRUE, echo=FALSE
-qc_groups <- sort(snakemake@params$qc_groups)
-qc_links <- build_link_list(
-    file_paths = file.path(htmlDir, paste0('QC/', qc_groups, '.html')),
-    captions = qc_groups
-)
-
-qc_matrix_links <- build_link_list(
-    file_paths = file.path(snakemake@input$qc_matrix),
-    captions = qc_groups
-)
-
-#' `r display_text(caption = 'QC Overview ', links = qc_links)`
-#' `r display_text(caption = 'DNA-RNA matrix ', links = qc_matrix_links)`
-#'

From c9f35856f20ff97167991c6c85916308e89a35d2 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 13:30:59 +0200
Subject: [PATCH 37/65] process NA rare

---
 drop/template/Scripts/MonoallelicExpression/Overview.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 030ca7d2..6bf57769 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -95,7 +95,7 @@ res_sample <- readRDS(snakemake@input$results_obj[[1]])
 #+echo=F
 library(tMAE)
 
-if(is.null(res_sample$rare)){
+if(is.na(res_sample$rare)){
   g1 <- plotMA4MAE(res_sample)
   g2 <- plotAllelicCounts(res_sample)
 } else {

From 3f231981c4c736632db77371adf125bd770248a2 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 13:40:27 +0200
Subject: [PATCH 38/65] docs

---
 docs/source/output.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index e574c5ec..15899723 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -94,4 +94,6 @@ tab at the top of the screen. Following that the Overview tab contains links to
 - Quality Control
     - QC Overview
         - For each mae group QC checks for DNA/RNA matching
-            
\ No newline at end of file
+- Analyze Individual Results
+    - An example analaysis that can be run using the Rds objects linked in the files subsection
+    - performed on the first mae sample
\ No newline at end of file

From 5bf0d44e4fbefe81b62b60c8fb0d646a8d17fccc Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 14:15:44 +0200
Subject: [PATCH 39/65] mae cutoffs to get results

---
 drop/demo/config_relative.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml
index a4e702df..ea0b14b7 100755
--- a/drop/demo/config_relative.yaml
+++ b/drop/demo/config_relative.yaml
@@ -55,8 +55,8 @@ mae:
     groups:
       - mae
     gatkIgnoreHeaderCheck: true
-    padjCutoff: .05
-    allelicRatioCutoff: 0.8
+    padjCutoff: .5
+    allelicRatioCutoff: 0.7
     addAF: false
     maxAF: .001
     maxVarFreqCohort: 1

From 14edfc453684629cc0669dc00540fcd4082d2579 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 14:30:23 +0200
Subject: [PATCH 40/65] update docs

---
 docs/source/output.rst  | 167 +++++++++++++++++++++-------------------
 docs/source/prepare.rst |  11 ++-
 2 files changed, 95 insertions(+), 83 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 15899723..2b1b136f 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -4,96 +4,105 @@ Results and Output of DROP
 DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression,
 aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide
 easy to read and interpret html files and output files. This section is dedicated to explaining the relevant
-results files. We will use the results of the ``demo`` to explain the files generated.
+results files. We will use the results of the ``demo`` to explain the files generated.::
 
-```
-#install drop
-mamba create -n drop_env -c conda-forge -c bioconda drop
-conda activate drop_env
+    #install drop
+    mamba create -n drop_env -c conda-forge -c bioconda drop
+    conda activate drop_env
+    
+    mkdir drop_demo
+    cd drop_demo
+    drop demo
+    
+    snakemake -c1
 
-mkdir drop_demo
-cd drop_demo
-drop demo
+Aberrant Expression
++++++++++++++++++++
 
-snakemake -c1
-```
-
-## Aberrant Expression
-
-### html file
+html file
+#########
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantExpression`` 
 tab at the top of the screen. Following that the Overview tab contains links to the:  
 
-- Counting Summaries 
-    - For each aberrant expression group
-        - split of local vs external sample counts
-        - QC relating to reads and size factors for each sample
-        - histograms relating to mean count distribution with different conditions
-        - information about the expressed genes within each sample and as a dataset
-- Outrider Summaries
-    - For each aberrant expression group
-        - the number of aberrantly expressed gene per sample
-        - how batch correction is done and the resulting lack of batch effects
-        - which samples contain outliers
-        - results table
-- Files
-    - OUTRIDER files for each aberrant expression group
-        - For each of these files you can follow the `OUTRIDER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
-    - results.tsv files
-        - For each aberrant expression group
-            - a tsv file that contains the sampleID, hgnc gene symbol, pvalue and adjusted pvalue, and subsequent analysis points
-                - this tsv file contains only the genes and samples that meet the cutoffs defined in the ``config.yaml``
+* Counting Summaries 
+    * For each aberrant expression group
+        * split of local vs external sample counts
+        * QC relating to reads and size factors for each sample
+        * histograms relating to mean count distribution with different conditions
+        * information about the expressed genes within each sample and as a dataset
+* Outrider Summaries
+    * For each aberrant expression group
+        * the number of aberrantly expressed genes per sample
+        * how batch correction is done and the resulting lack of batch effects
+        * which samples contain outliers
+        * results table
+* Files
+    * OUTRIDER files for each aberrant expression group
+        * For each of these files you can follow the `OUTRIDER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
+    * results.tsv files
+        * For each aberrant expression group
+            * a tsv file that contains the sampleID, hgnc gene symbol, pvalue and adjusted pvalue, and subsequent analysis points
+                * this tsv file contains only the genes and samples that meet the cutoffs defined in the ``config.yaml``
                 for ``padjCutoff`` and ``zScoreCutoff``
 
-## Aberrant Splicing
-### html file
+Aberrant Splicing
++++++++++++++++++
+
+html file
+##########
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantSplicing`` 
 tab at the top of the screen. Following that the Overview tab contains links to the:  
-- Counting Summaries 
-    - For each aberrant splicing group
-        - split of local (from internal BAM files) vs external sample counts
-        - split of local vs merged with external sample splicing/intron counts
-        - comparison of local and external log mean counts
-        - histograms relating to junction expression before and after filtering and variability
-- FRASER Summaries
-    - For each aberrant splicing group
-        - the number of samples, introns, and splice sites 
-        - how batch correction is done and the resulting lack of batch effects
-        - result table
-- Files
-    - FRASER files for each aberrant splicing group
-        - For each of these files you can follow the `FRASER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
-    - results.tsv files
-        - For each aberrant splicing group
-            - results.tsv 
-                - this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
-            - results_per_junction.tsv 
-                - this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the junction level. 
 
+* Counting Summaries 
+    * For each aberrant splicing group
+        * split of local (from internal BAM files) vs external sample counts
+        * split of local vs merged with external sample splicing/intron counts
+        * comparison of local and external log mean counts
+        * histograms relating to junction expression before and after filtering and variability
+* FRASER Summaries
+    * For each aberrant splicing group
+        * the number of samples, introns, and splice sites 
+        * how batch correction is done and the resulting lack of batch effects
+        * result table
+* Files
+    * FRASER files for each aberrant splicing group
+        * For each of these files you can follow the `FRASER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
+    * results.tsv files
+        * For each aberrant splicing group
+            * results.tsv 
+                * this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml``, aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
+            * results_per_junction.tsv 
+                * this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml``, aggregated at the junction level.
 
-## Mono-allelic Expression
+
+Mono-allelic Expression
++++++++++++++++++++++++
+
+html file
+##########
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``MonoallelicExpression`` 
 tab at the top of the screen. Following that the Overview tab contains links to the:  
-- Results
-    - For each mae group
-        - the number of samples, unique genes, and aberrant events
-        - a cascade plot that shows additional filters
-            - MAE for REF: the monoallelic expression favors the reference allele 
-            - MAE for ALT: the monoallelic expression favors the alternative allele 
-            - rare: if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
-        - histogram of inner cohort frequency
-        - summary of cascade plots and results table
-- Files
-    - Allelic counts
-        - a directory containing the allelic counts of heterozygous variants
-    - Results data tables of each sample (.Rds)
-        - Rds objects containing the full results table regardless of MAE status
-    - Significant MAE results tables
-        - For each mae group
-            - a link to the results tsv file. Only contains MAE results for the alternative allele
-- Quality Control
-    - QC Overview
-        - For each mae group QC checks for DNA/RNA matching
-- Analyze Individual Results
-    - An example analaysis that can be run using the Rds objects linked in the files subsection
-    - performed on the first mae sample
\ No newline at end of file
+
+* Results
+    * For each mae group
+        * the number of samples, unique genes, and aberrant events
+        * a cascade plot that shows additional filters
+            * MAE for REF: the monoallelic expression favors the reference allele 
+            * MAE for ALT: the monoallelic expression favors the alternative allele 
+            * rare: if ``addAF`` is set to true in ``config.yaml``, the variant's allele frequency must not exceed ``maxAF``. Additionally, it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
+        * histogram of inner cohort frequency
+        * summary of cascade plots and results table
+* Files
+    * Allelic counts
+        * a directory containing the allelic counts of heterozygous variants
+    * Results data tables of each sample (.Rds)
+        * Rds objects containing the full results table regardless of MAE status
+    * Significant MAE results tables
+        * For each mae group
+            * a link to the results tsv file. Only contains MAE results for the alternative allele
+* Quality Control
+    * QC Overview
+        * For each mae group QC checks for DNA/RNA matching
+* Analyze Individual Results
+    * An example analysis that can be run using the Rds objects linked in the files subsection
+    * performed on the first mae sample
\ No newline at end of file
diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
index 4a1a7ec7..d6d8007a 100644
--- a/docs/source/prepare.rst
+++ b/docs/source/prepare.rst
@@ -148,7 +148,7 @@ For example, if the AberrantExpression module is set to false, the  ``Scripts/Ab
 Creating the sample annotation table
 ------------------------------------
 For a detailed explanation of the columns of the sample annotation, please refer to
-Box 3 of the `DROP manuscript <https://rdcu.be/cdMmF>`_. 
+Box 3 of the `DROP manuscript <https://rdcu.be/cdMmF>`_. Although some information has been updated since publication, please use this documentation as the preferred syntax/formatting.
 
 Each row of the sample annotation table corresponds to a unique pair of RNA and DNA
 samples derived from the same individual. An RNA assay can belong to one or more DNA
@@ -188,7 +188,8 @@ files (or a subset if not all samples are needed). Add the columns: ``GENE_COUNT
 (for aberrant expression), ``GENE_ANNOTATON``, and ``SPLICE_COUNTS_DIR`` (for aberrant splicing).
 These columns should remain empty for samples processed locally (from ``RNA_BAM``).
 
-### Aberrant Expression
+Aberrant Expression
+####################
 Using external counts for aberrant expression forces you to use the exact same gene annotation for each
 external sample as well as using the same gene annotation file specified in the config file
 ``Global parameters`` section. This is to avoid potential mismatching on counting, 2 different gene
@@ -202,7 +203,8 @@ The other columns should remain empty.
 Using ``exportCounts`` generates the sharable ``GENE_COUNTS_FILE`` file in the appropriate
 ``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory.
 
-### Aberrant Splicing
+Aberrant Splicing
+##################
 Using external counts for aberrant splicing reduces the number of introns processed to only those
 that are exactly the same between the local and external junctions. Because rare junctions may be 
 personally identifiable the ``exportCounts`` command only exports regions canonically mentioned in the gtf file.
@@ -224,7 +226,8 @@ Using ``exportCounts`` generates the necessary files in the appropriate
 - n_psi5_counts.tsv.gz  
 - n_theta_counts.tsv.gz  
 
-### Publicly available DROP external counts
+Publicly available DROP external counts
+#######################################
 You can find different sets of publicly available external counts to add to your
 analysis on our github page  <https://github.com/gagneurlab/drop/#datasets>
 

From 27549c922eba9271d96253584ac134c1f34d8e5a Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 14:36:33 +0200
Subject: [PATCH 41/65] update docs

---
 docs/source/output.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 2b1b136f..7757567e 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -4,7 +4,7 @@ Results and Output of DROP
 DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression,
 aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide
 easy to read and interpret html files and output files. This section is dedicated to explaining the relevant
-results files. We will use the results of the ``demo`` to explain the files generated.::
+results files. We will use the results of the ``demo`` to explain the files generated.:: bash
 
     #install drop
     mamba create -n drop_env -c conda-forge -c bioconda drop

From cfb8309fbbead40757bd73c43cb1476e5b8397d8 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 14:37:10 +0200
Subject: [PATCH 42/65] update docs

---
 docs/source/output.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 7757567e..2b1b136f 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -4,7 +4,7 @@ Results and Output of DROP
 DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression,
 aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide
 easy to read and interpret html files and output files. This section is dedicated to explaining the relevant
-results files. We will use the results of the ``demo`` to explain the files generated.:: bash
+results files. We will use the results of the ``demo`` to explain the files generated.::
 
     #install drop
     mamba create -n drop_env -c conda-forge -c bioconda drop

From 0e970eecd113ba30b1692bcf9bd44a8425e413c4 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 15:01:12 +0200
Subject: [PATCH 43/65] update output docs

---
 docs/source/output.rst  | 34 ++++++++++++++++++++++++++--------
 docs/source/prepare.rst | 12 ++++++------
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 2b1b136f..eb94af4e 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -39,11 +39,14 @@ tab at the top of the screen. Following that the Overview tab contains links to
 * Files
     * OUTRIDER files for each aberrant expression group
         * For each of these files you can follow the `OUTRIDER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
-    * results.tsv files
+    * tsv files
         * For each aberrant expression group
-            * a tsv file that contains the sampleID, hgnc gene symbol, pvalue and adjusted pvalue, and subsequent analysis points
-                * this tsv file contains only the genes and samples that meet the cutoffs defined in the ``config.yaml``
-                for ``padjCutoff`` and ``zScoreCutoff``
+            * results.tsv
+                * this tsv file contains only the significant genes and samples that meet the cutoffs defined in the ``config.yaml`` for ``padjCutoff`` and ``zScoreCutoff``
+
+Local result files
+##################
+Additionally the ``aberrantExpression`` module creates the file Output/processed_results/aberrant_expression/{annotation}/outrider/{drop_group}/OUTRIDER_results_all.Rds`` this file is the Rds object containing the entire OUTRIDER results table regardless of significance.
 
 Aberrant Splicing
 +++++++++++++++++
@@ -67,7 +70,7 @@ tab at the top of the screen. Following that the Overview tab contains links to
 * Files
     * FRASER files for each aberrant splicing group
         * For each of these files you can follow the `FRASER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
-    * results.tsv files
+    * tsv files
         * For each aberrant splicing group
             * results.tsv 
                 * this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
@@ -89,7 +92,9 @@ tab at the top of the screen. Following that the Overview tab contains links to
         * a cascade plot that shows additional filters
             * MAE for REF: the monoallelic expression favors the reference allele 
             * MAE for ALT: the monoallelic expression favors the alternative allele 
-            * rare: if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
+            * rare: 
+                * if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``
+                * additionally it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
         * histogram of inner cohort frequency
         * summary of cascade plots and results table
 * Files
@@ -99,10 +104,23 @@ tab at the top of the screen. Following that the Overview tab contains links to
         * Rds objects containing the full results table regardless of MAE status
     * Significant MAE results tables
         * For each mae group
-            * a link to the results tsv file. Only contains MAE results for the alternative allele
+            * a link to the results tsv file.
+            * Only contains significant MAE results based on ``config.yaml`` cutoffs for the alternative allele
 * Quality Control
     * QC Overview
         * For each mae group QC checks for DNA/RNA matching
 * Analyze Individual Results
     * An example analaysis that can be run using the Rds objects linked in the files subsection
-    * performed on the first mae sample
\ No newline at end of file
+    * performed on the first mae sample 
+    
+Local result files
+##################
+Additionally the ``mae`` module creates the following files:
+* Output/processed_results/mae/{drop_group}/MAE_results_all_v29.tsv.gz``
+    * this file is the tsv results of all heterozygous variants regardless of significance
+* Output/processed_results/mae/{drop_group}/MAE_results_v29.tsv``
+    * this is the file linked in the html document and described above
+* Output/processed_results/mae/{drop_group}/MAE_results_v29_rare.tsv``
+    * this file is the subsetted tsv of ``MAE_results_v29.tsv`` with only the variants that pass the rare cutoffs
+        * if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``
+        * inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff
\ No newline at end of file
diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
index d6d8007a..5a092da4 100644
--- a/docs/source/prepare.rst
+++ b/docs/source/prepare.rst
@@ -220,16 +220,16 @@ Using ``exportCounts`` generates the necessary files in the appropriate
 ``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory
 
 ``SPLICE_COUNTS_DIR`` should contain the following:  
-- k_j_counts.tsv.gz  
-- k_theta_counts.tsv.gz  
-- n_psi3_counts.tsv.gz  
-- n_psi5_counts.tsv.gz  
-- n_theta_counts.tsv.gz  
+* k_j_counts.tsv.gz  
+* k_theta_counts.tsv.gz  
+* n_psi3_counts.tsv.gz  
+* n_psi5_counts.tsv.gz  
+* n_theta_counts.tsv.gz  
 
 Publicly available DROP external counts
 #######################################
 You can find different sets of publicly available external counts to add to your
-analysis on our github page  <https://github.com/gagneurlab/drop/#datasets>
+analysis on our `github page <https://github.com/gagneurlab/drop/#datasets>`_
 
 If you want to contribute with your own count matrices, please contact us: yepez at in.tum.de (yepez@in.tum.de)
 

From d298002dfdfe7038f657200bf8f05a87d6f906d6 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 15:02:52 +0200
Subject: [PATCH 44/65] typo

---
 docs/source/output.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index eb94af4e..3db5faf7 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -46,7 +46,7 @@ tab at the top of the screen. Following that the Overview tab contains links to
 
 Local result files
 ##################
-Additionally the ``aberrantExpression`` module creates the file Output/processed_results/aberrant_expression/{annotation}/outrider/{drop_group}/OUTRIDER_results_all.Rds`` this file is the Rds object containing the entire OUTRIDER results table regardless of significance.
+Additionally the ``aberrantExpression`` module creates the file ``Output/processed_results/aberrant_expression/{annotation}/outrider/{drop_group}/OUTRIDER_results_all.Rds`` this file is the Rds object containing the entire OUTRIDER results table regardless of significance.
 
 Aberrant Splicing
 +++++++++++++++++
@@ -116,11 +116,12 @@ tab at the top of the screen. Following that the Overview tab contains links to
 Local result files
 ##################
 Additionally the ``mae`` module creates the following files:
-* Output/processed_results/mae/{drop_group}/MAE_results_all_v29.tsv.gz``
+
+* ``Output/processed_results/mae/{drop_group}/MAE_results_all_v29.tsv.gz``
     * this file is the tsv results of all heterozygous variants regardless of significance
-* Output/processed_results/mae/{drop_group}/MAE_results_v29.tsv``
+* ``Output/processed_results/mae/{drop_group}/MAE_results_v29.tsv``
     * this is the file linked in the html document and described above
-* Output/processed_results/mae/{drop_group}/MAE_results_v29_rare.tsv``
+* ``Output/processed_results/mae/{drop_group}/MAE_results_v29_rare.tsv``
     * this file is the subsetted tsv of ``MAE_results_v29.tsv`` with only the variants that pass the rare cutoffs
         * if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``
         * inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff
\ No newline at end of file

From b25a9d45af696bc02d503daadf5b0bc27c03c662 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 15:35:37 +0200
Subject: [PATCH 45/65] fix cutoffs and plotting

---
 .../Scripts/MonoallelicExpression/Overview.R    | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 6bf57769..03093953 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -91,16 +91,25 @@ qc_matrix_links <- build_link_list(
 #' ## Analyze Individual Results
 # Read the first results table
 res_sample <- readRDS(snakemake@input$results_obj[[1]])
+print(unique(res_sample$ID))
 
 #+echo=F
 library(tMAE)
 
 if(is.na(res_sample$rare)){
-  g1 <- plotMA4MAE(res_sample)
-  g2 <- plotAllelicCounts(res_sample)
+  g1 <- plotMA4MAE(res_sample,
+				   padjCutoff = snakemake@config$mae$padjCutoff,
+				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
+  g2 <- plotAllelicCounts(res_sample,
+				   padjCutoff = snakemake@config$mae$padjCutoff,
+				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
 } else {
-  g1 <- plotMA4MAE(res_sample, rare_column = 'rare')
-  g2 <- plotAllelicCounts(res_sample, rare_column = 'rare')
+  g1 <- plotMA4MAE(res_sample, rare_column = 'rare',
+				   padjCutoff = snakemake@config$mae$padjCutoff,
+				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
+  g2 <- plotAllelicCounts(res_sample, rare_column = 'rare',
+				   padjCutoff = snakemake@config$mae$padjCutoff,
+				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
 }
 
 #' ### MA plot: fold change vs RNA coverage

From aa9dd7b0b856618ed3ed828b36752978eca86112 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 16:07:21 +0200
Subject: [PATCH 46/65] MAE results test

---
 tests/pipeline/test_MAE.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/pipeline/test_MAE.py b/tests/pipeline/test_MAE.py
index d014790f..f5ea3e76 100644
--- a/tests/pipeline/test_MAE.py
+++ b/tests/pipeline/test_MAE.py
@@ -32,7 +32,7 @@ def test_counts(self, demo_dir):
         assert "[1] 235" in r.stdout
 
     @pytest.mark.usefixtures("pipeline_run")
-    def test_results(self, demo_dir):
+    def test_all_results(self, demo_dir):
         results_file = "Output/processed_results/mae/mae/MAE_results_all_v29.tsv.gz"
         r_cmd = """
                 library(data.table)
@@ -41,3 +41,14 @@ def test_results(self, demo_dir):
                 """.format(results_file)
         r = runR(r_cmd, demo_dir)
         assert "[1] 253" in r.stdout
+
+    @pytest.mark.usefixtures("pipeline_run")
+    def test_sig_results(self, demo_dir):
+        results_file = "Output/processed_results/mae/mae/MAE_results_v29.tsv"
+        r_cmd = """
+                library(data.table)
+                res <- fread("{}")
+                print(nrow(res))
+                """.format(results_file)
+        r = runR(r_cmd, demo_dir)
+        assert "[1] 3" in r.stdout

From 0141dc81f42b54cc38f11c4d224c1abcae665b8e Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Thu, 7 Apr 2022 16:15:32 +0200
Subject: [PATCH 47/65] update test to match demo config

---
 tests/config/test_MAE.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/config/test_MAE.py b/tests/config/test_MAE.py
index 54a2d8e6..c1f480e9 100644
--- a/tests/config/test_MAE.py
+++ b/tests/config/test_MAE.py
@@ -7,8 +7,8 @@ def test_config(self,dropConfig,demo_dir):
             'groups': ['mae'],
             'qcGroups': ['mae'],
             'gatkIgnoreHeaderCheck': True,
-            'padjCutoff': 0.05,
-            'allelicRatioCutoff': 0.8,
+            'padjCutoff': 0.5,
+            'allelicRatioCutoff': 0.7,
             'maxAF': 0.001,
             'addAF': False,
             'maxVarFreqCohort': 1,

From c100b745a8aa6850b73e744993aec3f85a714a84 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Fri, 8 Apr 2022 13:24:28 +0200
Subject: [PATCH 48/65] allow for legacy sample annotation

---
 drop/config/SampleAnnotation.py               | 32 ++++++++++++-------
 drop/config/submodules/AberrantSplicing.py    |  8 ++++-
 .../Counting/Summary.R                        |  6 ++--
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 10f56667..0b9b9091 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -40,24 +40,27 @@ def parse(self, sep='\t'):
         clean columns and set types
         """
         data_types = {
-            "RNA_ID": str, "DNA_ID": str, "DROP_GROUP": str, "GENE_ANNOTATION": str,
-            "PAIRED_END": bool, "COUNT_MODE": str, "COUNT_OVERLAPS": bool, "STRAND": str, "GENOME": str
+            "RNA_ID": str, "DNA_ID": str, "DROP_GROUP": str, 
+            "PAIRED_END": bool, "COUNT_MODE": str, "COUNT_OVERLAPS": bool, "STRAND": str, 
+            "GENE_COUNTS_FILE": str, "SPLICE_COUNTS_DIR": str, "GENE_ANNOTATION": str, "GENOME": str
         }
+        optional_columns = {"GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR", "GENE_ANNOTATION", "GENOME"}
+
         sa = pd.read_csv(self.file, sep=sep, index_col=False)
         missing_cols = [x for x in self.SAMPLE_ANNOTATION_COLUMNS if x not in sa.columns.values]
         if len(missing_cols) > 0:
-            if "GENOME" in missing_cols:
-                # deal with missing columns in data types, remove it to fix checks later
-                del data_types["GENOME"]
-                self.SAMPLE_ANNOTATION_COLUMNS.remove("GENOME")
-                missing_cols.remove("GENOME")
-
             if "GENE_ANNOTATION" in missing_cols and "ANNOTATION" in sa.columns.values:
                 logger.info(
                     "WARNING: GENE_ANNOTATION must be a column in the sample annotation table, ANNOTATION is the old column name and will be deprecated in the future\n")
                 sa["GENE_ANNOTATION"] = sa.pop("ANNOTATION")
                 missing_cols.remove("GENE_ANNOTATION")
 
+            for toDel_optional in (set(missing_cols) & optional_columns):
+                # deal with missing columns in data types, remove it to fix checks later
+                del data_types[toDel_optional]
+                self.SAMPLE_ANNOTATION_COLUMNS.remove(toDel_optional)
+                missing_cols.remove(toDel_optional)
+
             if len(missing_cols) > 0:
                 raise ValueError(f"Incorrect columns in sample annotation file. Missing:\n{missing_cols}")
 
@@ -106,7 +109,7 @@ def createSampleFileMapping(self):
             raise FileNotFoundError(message)
         elif len(existing) < file_mapping.shape[0]:
             missing = set(file_mapping["FILE_PATH"]) - set(existing)
-            logger.info(f"WARNING: {len(missing)} files missing in samples annotation. Ignoring...")
+            logger.info(f"WARNING: {missing} files missing in samples annotation. Ignoring...")
             logger.debug(f"Missing files: {missing}")
             file_mapping = file_mapping[file_mapping["FILE_PATH"].isin(existing)]
 
@@ -170,10 +173,15 @@ def subsetSampleAnnotation(self, column, values, subset=None):
             if not sa_cols <= set(subset.columns):  # check if mandatory cols not contained
                 raise ValueError(f"Subset columns not the same as {sa_cols}\ngot: {subset.columns}")
 
-        # check if column is valid
-        if column not in sa_cols:
+        # check if values is None. Do nothing
+        if values is None:
+            return subset
+        # check if column is valid. Raise Error
+        elif column not in sa_cols:
             raise KeyError(f"Column '{column}' not present in sample annotation.")
-        return utils.subsetBy(subset, column, values)
+        # subset column for values
+        else: 
+            return utils.subsetBy(subset, column, values)
 
     def subsetFileMapping(self, file_type=None, sample_id=None):
         """
diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index 3ae001ac..1193b5a9 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -81,11 +81,17 @@ def getExternalCounts(self, group: str, fileType: str = "k_j_counts"):
         :param fileType: name of the file without extension which is to be returned
         :return: list of directories or files
         """
+
+        # if sample annotation does not contain the column SPLICE_COUNTS_DIR return no external counts
+        if("SPLICE_COUNTS_DIR" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
+            return []
+
         ids = self.sampleAnnotation.getIDsByGroup(group, assay="SPLICE_COUNT")
         extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation=None, group=group, 
                 file_type="SPLICE_COUNTS_DIR", asSet=False)
         if fileType is not None:
-            extCountFiles = np.asarray(extCountFiles)[pd.isna(extCountFiles) == False].tolist()
+            extCountFiles = np.asarray(extCountFiles)
+            extCountFiles = extCountFiles[extCountFiles != "nan"].tolist()
             extCountFiles = [x + "/" + fileType + ".tsv.gz" for x in extCountFiles]
         return extCountFiles
     
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index 3a6250e2..a5c64d2f 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -44,7 +44,7 @@ if(has_external){
 #' Local (fromBam): `r sum(!fdsMerge@colData$isExternal)`  
 #' External: `r sum(fdsMerge@colData$isExternal)`  
 #' 
-#' ### Using external counts
+#' **Using external counts**  
 #' External counts introduce some complexity into the problem of counting junctions
 #' because it is ambiguous whether or not a junction is not counted (because there are no reads)
 #' compared to filtered and not present due to legal/personal sharing reasons. As a result,
@@ -52,11 +52,11 @@ if(has_external){
 #' the same in both remain. As a result it is likely that the number of junctions will decrease after a merge.
 #' 
 #' 
-#' ### Number of introns (psi5 or psi3) before filtering:  
+#' ### Number of introns (psi5 or psi3) before and after merging:  
 #' Local (fromBam): `r length(rowRanges(fdsLocal, type = "psi5"))`  
 #' Merged : `r length(rowRanges(fdsMerge, type = "psi5"))`  
 #' 
-#' ### Number of splice sites (theta) before filtering: 
+#' ### Number of splice sites (theta) before and after merging: 
 #' Local (fromBam): `r length(rowRanges(fdsLocal, type = "theta"))`  
 #' Merged: `r length(rowRanges(fdsMerge, type = "theta"))`  
 #' 

From 9e9d90920edfd459cf2bbc0f2ade9737237be453 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Fri, 8 Apr 2022 15:54:27 +0200
Subject: [PATCH 49/65] improve legacy handling

---
 drop/config/SampleAnnotation.py              | 13 ++++++-------
 drop/config/submodules/AberrantExpression.py |  5 +++++
 drop/config/submodules/AberrantSplicing.py   |  5 ++---
 drop/utils.py                                |  2 +-
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
index 0b9b9091..4ea6dbe3 100644
--- a/drop/config/SampleAnnotation.py
+++ b/drop/config/SampleAnnotation.py
@@ -42,7 +42,6 @@ def parse(self, sep='\t'):
         data_types = {
             "RNA_ID": str, "DNA_ID": str, "DROP_GROUP": str, 
             "PAIRED_END": bool, "COUNT_MODE": str, "COUNT_OVERLAPS": bool, "STRAND": str, 
-            "GENE_COUNTS_FILE": str, "SPLICE_COUNTS_DIR": str, "GENE_ANNOTATION": str, "GENOME": str
         }
         optional_columns = {"GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR", "GENE_ANNOTATION", "GENOME"}
 
@@ -57,7 +56,6 @@ def parse(self, sep='\t'):
 
             for toDel_optional in (set(missing_cols) & optional_columns):
                 # deal with missing columns in data types, remove it to fix checks later
-                del data_types[toDel_optional]
                 self.SAMPLE_ANNOTATION_COLUMNS.remove(toDel_optional)
                 missing_cols.remove(toDel_optional)
 
@@ -109,7 +107,7 @@ def createSampleFileMapping(self):
             raise FileNotFoundError(message)
         elif len(existing) < file_mapping.shape[0]:
             missing = set(file_mapping["FILE_PATH"]) - set(existing)
-            logger.info(f"WARNING: {missing} files missing in samples annotation. Ignoring...")
+            logger.info(f"WARNING: {len(missing)} files missing in samples annotation. Ignoring...")
             logger.debug(f"Missing files: {missing}")
             file_mapping = file_mapping[file_mapping["FILE_PATH"].isin(existing)]
 
@@ -173,14 +171,15 @@ def subsetSampleAnnotation(self, column, values, subset=None):
             if not sa_cols <= set(subset.columns):  # check if mandatory cols not contained
                 raise ValueError(f"Subset columns not the same as {sa_cols}\ngot: {subset.columns}")
 
-        # check if values is None. Do nothing
+
+        # if you don't want to subset
         if values is None:
             return subset
-        # check if column is valid. Raise Error
+        # check if column is valid
         elif column not in sa_cols:
             raise KeyError(f"Column '{column}' not present in sample annotation.")
-        # subset column for values
-        else: 
+        #subset column for matching values
+        else:
             return utils.subsetBy(subset, column, values)
 
     def subsetFileMapping(self, file_type=None, sample_id=None):
diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py
index 6db90506..cf7ad7ab 100644
--- a/drop/config/submodules/AberrantExpression.py
+++ b/drop/config/submodules/AberrantExpression.py
@@ -47,6 +47,11 @@ def getCountFiles(self, annotation, group):
         :param group: DROP group name from wildcard
         :return: list of files
         """
+
+        # if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts
+        if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
+            return []
+
         bam_IDs = self.sampleAnnotation.getIDsByGroup(group, assay="RNA")
         file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds"
         count_files = expand(str(file_stump), sampleID=bam_IDs)
diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index 1193b5a9..fc980ed4 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -82,7 +82,7 @@ def getExternalCounts(self, group: str, fileType: str = "k_j_counts"):
         :return: list of directories or files
         """
 
-        # if sample annotation does not contain the column SPLICE_COUNTS_DIR return no external counts
+        # if sample annotation table does not contain SPLICE_COUNTS_DIR column. return no external counts
         if("SPLICE_COUNTS_DIR" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
             return []
 
@@ -90,8 +90,7 @@ def getExternalCounts(self, group: str, fileType: str = "k_j_counts"):
         extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation=None, group=group, 
                 file_type="SPLICE_COUNTS_DIR", asSet=False)
         if fileType is not None:
-            extCountFiles = np.asarray(extCountFiles)
-            extCountFiles = extCountFiles[extCountFiles != "nan"].tolist()
+            extCountFiles = np.asarray(extCountFiles)[pd.isna(extCountFiles) == False].tolist()
             extCountFiles = [x + "/" + fileType + ".tsv.gz" for x in extCountFiles]
         return extCountFiles
     
diff --git a/drop/utils.py b/drop/utils.py
index 7cde9321..fae88237 100644
--- a/drop/utils.py
+++ b/drop/utils.py
@@ -81,7 +81,7 @@ def subsetBy(df, column, values):
     if not isinstance(values, str) :
         inner_regex = "(" + "|".join(values) + ")"
     
-    return  df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)")]
+    return  df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)", na = False)]
     
 def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):
     """

From 0eb78fc63111b36d07739ef4bf29e927ec10ac31 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Mon, 11 Apr 2022 10:29:39 +0200
Subject: [PATCH 50/65] update FRASER version requirement

---
 drop/requirementsR.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt
index 6cde622e..65e4152f 100644
--- a/drop/requirementsR.txt
+++ b/drop/requirementsR.txt
@@ -1,7 +1,7 @@
 package	version
 devtools
 gagneurlab/OUTRIDER	1.6.1
-c-mertes/FRASER	1.2.2
+c-mertes/FRASER	1.6.1
 gagneurlab/tMAE	1.0.4
 VariantAnnotation	
 rmarkdown	

From ef650200765a6e690ef34cc1bf19a3485b2c339f Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Mon, 11 Apr 2022 11:42:42 +0200
Subject: [PATCH 51/65] fix column typo

---
 drop/modules/aberrant-expression-pipeline/Counting/Summary.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index 1bd807e6..90eb8abb 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -193,5 +193,5 @@ plotExpressedGenes(ods) +
 if(has_external){
     DT::datatable(expressed_genes[order(Rank)],rownames = F)
 } else{
-    DT::datatable(expressed_genes[order(Rank),-"is External"],rownames = F)
+    DT::datatable(expressed_genes[order(Rank),-"Is External"],rownames = F)
 }

From 0973a530bd793c3baa8a56ef0c79d27fcefe1605 Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Mon, 11 Apr 2022 16:52:30 +0200
Subject: [PATCH 52/65] update plots to match config

---
 drop/installRPackages.R                          |  2 ++
 .../Scripts/AberrantExpression/Overview.R        |  8 ++++++--
 .../template/Scripts/AberrantSplicing/Overview.R |  4 +++-
 .../Scripts/MonoallelicExpression/Overview.R     | 16 ++++++++--------
 4 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/drop/installRPackages.R b/drop/installRPackages.R
index 33455546..cab1e63f 100644
--- a/drop/installRPackages.R
+++ b/drop/installRPackages.R
@@ -1,3 +1,5 @@
+options(timeout = 600) 
+
 options(repos=structure(c(CRAN="https://cloud.r-project.org")), warn = -1)
 
 if (!requireNamespace('BiocManager', quietly = TRUE)) {
diff --git a/drop/template/Scripts/AberrantExpression/Overview.R b/drop/template/Scripts/AberrantExpression/Overview.R
index a3c6f167..2b25b548 100644
--- a/drop/template/Scripts/AberrantExpression/Overview.R
+++ b/drop/template/Scripts/AberrantExpression/Overview.R
@@ -90,10 +90,14 @@ sample <- res[1, sampleID]
 #' ### Volcano plot
 #' setting basePlot = FALSE creates an interactive plot
 #' that allows finding the gene(s) of interest
-OUTRIDER::plotVolcano(ods, sample, basePlot = TRUE)
+OUTRIDER::plotVolcano(ods, sample, basePlot = TRUE,
+                      zScoreCutoff = snakemake@config$aberrantExpression$zScoreCutoff,
+                      padjCutoff = snakemake@config$aberrantExpression$padjCutoff)
 
 #' ### Gene expression plot (normalized counts)
-OUTRIDER::plotExpressionRank(ods, gene, basePlot = TRUE)
+OUTRIDER::plotExpressionRank(ods, gene, basePlot = TRUE,
+                      zScoreCutoff = snakemake@config$aberrantExpression$zScoreCutoff,
+                      padjCutoff = snakemake@config$aberrantExpression$padjCutoff)
 
 #' ### Expected vs observed counts
 OUTRIDER::plotExpectedVsObservedCounts(ods, gene, basePlot = TRUE)
diff --git a/drop/template/Scripts/AberrantSplicing/Overview.R b/drop/template/Scripts/AberrantSplicing/Overview.R
index db90bd6c..ef8a3bed 100644
--- a/drop/template/Scripts/AberrantSplicing/Overview.R
+++ b/drop/template/Scripts/AberrantSplicing/Overview.R
@@ -87,7 +87,9 @@ siteIndex <- 4
 
 #' ### Volcano plot
 # set basePlot to FALSE to create an interactive plot
-FRASER::plotVolcano(fds, sample, type = 'psi3', basePlot = TRUE)
+FRASER::plotVolcano(fds, sample, type = 'psi3', basePlot = TRUE,
+                    deltaPsiCutoff = snakemake@config$aberrantSplicing$deltaPsiCutoff,
+                    padjCutoff = snakemake@config$aberrantSplicing$padjCutoff)
 
 #' ### Expression plot
 FRASER::plotExpression(fds, type = 'psi3', site = siteIndex, basePlot = TRUE)
diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R
index 03093953..29ee9212 100644
--- a/drop/template/Scripts/MonoallelicExpression/Overview.R
+++ b/drop/template/Scripts/MonoallelicExpression/Overview.R
@@ -98,18 +98,18 @@ library(tMAE)
 
 if(is.na(res_sample$rare)){
   g1 <- plotMA4MAE(res_sample,
-				   padjCutoff = snakemake@config$mae$padjCutoff,
-				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
+                   padjCutoff = snakemake@config$mae$padjCutoff,
+                   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
   g2 <- plotAllelicCounts(res_sample,
-				   padjCutoff = snakemake@config$mae$padjCutoff,
-				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
+                   padjCutoff = snakemake@config$mae$padjCutoff,
+                   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
 } else {
   g1 <- plotMA4MAE(res_sample, rare_column = 'rare',
-				   padjCutoff = snakemake@config$mae$padjCutoff,
-				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
+                   padjCutoff = snakemake@config$mae$padjCutoff,
+                   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
   g2 <- plotAllelicCounts(res_sample, rare_column = 'rare',
-				   padjCutoff = snakemake@config$mae$padjCutoff,
-				   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
+                   padjCutoff = snakemake@config$mae$padjCutoff,
+                   allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff )
 }
 
 #' ### MA plot: fold change vs RNA coverage

From c363c11b991dfecfa17dfa5cab2a1a0fdd9e04fb Mon Sep 17 00:00:00 2001
From: Smith Nicholas <smith@in.tum.de>
Date: Mon, 11 Apr 2022 17:37:56 +0200
Subject: [PATCH 53/65] update

---
 drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
index d392b134..4048d6ba 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
@@ -77,6 +77,7 @@ for(type in psiTypes){
     annotation_col = NA,
     annotation_row = NA,
     sampleCluster = NA,
+    minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi,
     plotMeanPsi=FALSE,
     plotCov = FALSE,
     annotation_legend = TRUE
@@ -93,6 +94,7 @@ for(type in psiTypes){
     annotation_col = NA,
     annotation_row = NA,
     sampleCluster = NA,
+    minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi,
     plotMeanPsi=FALSE,
     plotCov = FALSE,
     annotation_legend = TRUE

From 59c4d2aeba7a4cc93f5d967fca7395dcd6b3f74e Mon Sep 17 00:00:00 2001
From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Date: Tue, 12 Apr 2022 14:18:07 +0200
Subject: [PATCH 54/65] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 783c25d8..2a1b0772 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,9 @@ snakemake aberrantExpression --cores 10
 
 If you use DROP in research, please cite our [manuscript](https://www.nature.com/articles/s41596-020-00462-5).
 
-Furthermore, if you use the aberrant expression module, also cite [OUTRIDER](https://doi.org/10.1016/j.ajhg.2018.10.025), and if you use the aberrant splicing module, also cite [FRASER](https://www.nature.com/articles/s41467-020-20573-7).
+Furthermore, if you use the aberrant expression module, also cite [OUTRIDER](https://doi.org/10.1016/j.ajhg.2018.10.025); if you use the aberrant splicing module, also cite [FRASER](https://www.nature.com/articles/s41467-020-20573-7); and if you use the MAE module, also cite the [Kremer, Bader et al. study](https://www.nature.com/articles/ncomms15824) and [DESeq2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8).
+
+For the complete set of tools used by DROP (e.g. for counting), see the [manuscript](https://www.nature.com/articles/s41596-020-00462-5).
 
 ## Datasets
 The following publicly-available datasets of gene counts can be used as controls.

From 4c83f2de6d3c085ff8189b47dbba537f0eb23644 Mon Sep 17 00:00:00 2001
From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Date: Tue, 12 Apr 2022 15:00:22 +0200
Subject: [PATCH 55/65] Clarifications added to possible QC values

---
 .../mae-pipeline/QC/DNA_RNA_matrix_plot.R     | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
index ec2df3db..83366d83 100644
--- a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
+++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
@@ -28,7 +28,6 @@ qc_mat <- readRDS(snakemake@input$mat_qc)
 # hist(qc_mat, xlab = '% of overlapping variants from DNA and RNA', main = '')
 melt_mat <- as.data.table(reshape2::melt(qc_mat))
 
-#' Logarithmic scale of the y axis provides a better visualization
 identityCutoff <- .85
 
 ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.05, center = .025) + 
@@ -38,16 +37,32 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.
   expand_limits(x=c(0,1)) +
   geom_vline(xintercept=identityCutoff, linetype='dashed', color = 'firebrick')
 
+
+
+
+
 #' ## Identify matching samples
 
 #' Number of samples: `r nrow(qc_mat)`
 #' 
 #' Number of samples that match with another: `r length(qc_mat[qc_mat > identityCutoff])`
 #'
-#' Median of matching samples value: `r round(median(qc_mat[qc_mat > identityCutoff]), 2)`
+#' Median of proportion of matching variants in matching samples: `r round(median(qc_mat[qc_mat > identityCutoff]), 2)`
 #'
-#' Median of not matching samples value: `r round(median(qc_mat[qc_mat < identityCutoff]), 2)`
+#' Median of proportion of matching variants in not matching samples: `r round(median(qc_mat[qc_mat < identityCutoff]), 2)`
 #'
+#' **Considerations:**
+#' In our experience, the median of the proportion of matching variants in matching samples is around 0.95,
+#' and the median of the proportion of matching variants in not matching samples is around 0.58.
+#' Sometimes we do see some values between 0.7 - 0.85. That could mean that the DNA-RNA combination is 
+#' not from the same person, but from a relative. It could also be due to a technical error. For those cases, 
+#' check the following:
+#' 
+#' * RNA sequencing depth (low sequencing depth can cause variants to be missed in the RNA)
+#' * Number of variants (too many variants called due to sequencing errors)
+#' * Ratio of heterozygous/homozygous variants (usually too many called variants means too many heterozygous ones)
+#' * Is the sample a relative of the other?
+#' 
 
 sa <- fread(snakemake@config$sampleAnnotation)[, .(DNA_ID, RNA_ID)]
 sa[, ANNOTATED_MATCH := TRUE]

From 420d31cca2a7aa462eba222964d68dda1cc5fc63 Mon Sep 17 00:00:00 2001
From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Date: Tue, 12 Apr 2022 15:00:53 +0200
Subject: [PATCH 56/65] Update DNA_RNA_matrix_plot.R

---
 drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
index 83366d83..25543292 100644
--- a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
+++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
@@ -38,9 +38,6 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.
   geom_vline(xintercept=identityCutoff, linetype='dashed', color = 'firebrick')
 
 
-
-
-
 #' ## Identify matching samples
 
 #' Number of samples: `r nrow(qc_mat)`

From 2322f5f1427c3ce2e1aa2ecc57ced6dde856eba1 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 13 Apr 2022 15:56:27 +0200
Subject: [PATCH 57/65] code review formatting fixes

---
 docs/source/prepare.rst                       |  1 +
 .../Counting/Summary.R                        | 40 ++++++++++---------
 .../OUTRIDER/Summary.R                        | 12 +++---
 .../Counting/03_filter_expression_FraseR.R    |  2 +
 .../Counting/Summary.R                        | 37 +++++++++--------
 .../FRASER/Summary.R                          |  4 +-
 drop/modules/mae-pipeline/MAE/Results.R       | 13 ++++--
 7 files changed, 63 insertions(+), 46 deletions(-)

diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
index 5a092da4..c624fca7 100644
--- a/docs/source/prepare.rst
+++ b/docs/source/prepare.rst
@@ -220,6 +220,7 @@ Using ``exportCounts`` generates the necessary files in the appropriate
 ``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory
 
 ``SPLICE_COUNTS_DIR`` should contain the following:  
+
 * k_j_counts.tsv.gz  
 * k_theta_counts.tsv.gz  
 * n_psi3_counts.tsv.gz  
diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index 90eb8abb..917729ad 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -35,21 +35,26 @@ ods <- readRDS(snakemake@input$ods)
 
 has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE))
 if(has_external){
-    ods@colData$isExternal <- ods@colData$GENE_COUNTS_FILE != ""
+    ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "")
 }else{
-    ods@colData$isExternal <- FALSE
+    ods@colData$isExternal <- as.factor(FALSE)
 }
 
+# save ods with isExternal column
+saveRDS(ods,snakemake@input$ods)
+
 cnts_mtx_local <- counts(ods, normalized = F)[,!ods@colData$isExternal]
 cnts_mtx <- counts(ods, normalized = F)
 
 #' ## Number of samples:  
-#' Local (fromBam): `r sum(!ods@colData$isExternal)`  
-#' External: `r sum(ods@colData$isExternal)`  
+#' Local: `r sum(!as.logical(ods@colData$isExternal))`  
+#' External: `r sum(as.logical(ods@colData$isExternal))`  
 #' 
 #' # Count Quality Control
 #' 
-#' Compare number of records vs. read counts
+#' Compare number of records vs. read counts  
+#' `The Obtained Read Count Ratio` plot does not include external counts
+#' because there are no raw reads to be counted.
 #' 
 bam_coverage <- fread(snakemake@input$bam_cov)
 bam_coverage[, sampleID := as.character(sampleID)]
@@ -116,8 +121,8 @@ plot_grid(p_sf, p_sf_cov)
 #' **local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts  
 #' **all**: A pre-filtered summary of counts using only the merged local (from BAM) and external counts  
 #' **passed_FPKM**: Passes the user defined FPKM cutoff in at least 5% of genes  
-#' **min_1**: minimum of 1 read expressed in 5% of genes  
-#' **min_10**: minimum of 10 reads expressed in 5% of genes  
+#' **min 1 read**: minimum of 1 read expressed in 5% of genes  
+#' **min 10 reads**: minimum of 10 reads expressed in 5% of genes  
 
 quant <- .95
 
@@ -125,27 +130,27 @@ if(has_external){
     filter_mtx <- list(
       local = cnts_mtx_local,
       all = cnts_mtx,
-      passed_FPKM = cnts_mtx[rowData(ods)$passedFilter,],
-      min_1 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
-      min_10 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
+      `passed FPKM` = cnts_mtx[rowData(ods)$passedFilter,],
+      `min 1 read` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
+      `min 10 reads` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
     )
     filter_dt <- lapply(names(filter_mtx), function(filter_name) {
       mtx <- filter_mtx[[filter_name]]
       data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name)
     }) %>% rbindlist
-    filter_dt[, filter := factor(filter, levels = c('local', 'all', 'passed_FPKM', 'min_1', 'min_10'))]
+    filter_dt[, filter := factor(filter, levels = c('local', 'all', 'passed FPKM', 'min 1 read', 'min 10 reads'))]
 } else{
     filter_mtx <- list(
       all = cnts_mtx,
-      passed_FPKM = cnts_mtx[rowData(ods)$passedFilter,],
-      min_1 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
-      min_10 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
+      `passed FPKM` = cnts_mtx[rowData(ods)$passedFilter,],
+      `min 1 read` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ],
+      `min 10 reads` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ]
     )
     filter_dt <- lapply(names(filter_mtx), function(filter_name) {
       mtx <- filter_mtx[[filter_name]]
       data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name)
     }) %>% rbindlist
-    filter_dt[, filter := factor(filter, levels = c('all', 'passed_FPKM', 'min_1', 'min_10'))]
+    filter_dt[, filter := factor(filter, levels = c('all', 'passed FPKM', 'min 1 read', 'min 10 reads'))]
 }
 
 binwidth <- .2
@@ -174,11 +179,10 @@ p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) +
 plot_grid(p_hist, p_dens)
 
 #' ### Expressed Genes
-exp_genes_cols <- c(`Expressed\ngenes` = "expressedGenes", 
+exp_genes_cols <- c(Rank = "expressedGenesRank",`Expressed\ngenes` = "expressedGenes", 
                     `Union of\nexpressed genes` = "unionExpressedGenes", 
                     `Intersection of\nexpressed genes` = "intersectionExpressedGenes", 
-                    `Genes passed\nfiltering` = "passedFilterGenes", Rank = "expressedGenesRank",
-                    `Is External` = "isExternal")
+                    `Genes passed\nfiltering` = "passedFilterGenes", `Is External` = "isExternal")
 
 expressed_genes <- as.data.table(colData(ods)[,exp_genes_cols])
 colnames(expressed_genes) <- names(exp_genes_cols)
diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
index f18ad3dc..8ba71c71 100644
--- a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
@@ -63,18 +63,18 @@ plotAberrantPerSample(ods, main = dataset_title,
 
 #' ### Batch correction
 #+ countCorHeatmap, fig.height=8, fig.width=8
-plotCountCorHeatmap(ods, normalized = FALSE, 
+plotCountCorHeatmap(ods, normalized = FALSE, colGroups = "isExternal",
                     main = paste0('Raw Counts (', dataset_title, ')'))
-plotCountCorHeatmap(ods, normalized = TRUE, 
+plotCountCorHeatmap(ods, normalized = TRUE, ,colGroups = "isExternal",
                     main = paste0('Normalized Counts (', dataset_title, ')'))
 
 
 #' ### Expression by gene per sample
 #+ geneSampleHeatmap, fig.height=12, fig.width=6
-plotCountGeneSampleHeatmap(ods, normalized = FALSE, nGenes = 50,
+plotCountGeneSampleHeatmap(ods, normalized = FALSE, nGenes = 50, colGroups = "isExternal",
                            main = paste0('Raw Counts (', dataset_title, ')'),
                            bcvQuantile = .95, show_names = 'row')
-plotCountGeneSampleHeatmap(ods, normalized = TRUE, nGenes = 50,
+plotCountGeneSampleHeatmap(ods, normalized = TRUE, nGenes = 50, colGroups = "isExternal",
                            main = paste0('Normalized Counts (',dataset_title,')'),
                            bcvQuantile = .95, show_names = 'row')
 
@@ -134,8 +134,8 @@ fwrite(res, file, sep = '\t', quote = F)
 #+ echo=FALSE, results='asis'
 cat(paste0("<a href='./", basename(file), "'>Download OUTRIDER results table</a>"))
 
-res[, pValue := format(pValue, scientific = T, digits = 2)]
-res[, padjust := format(padjust, scientific = T, digits = 2)]
+res[, pValue := format(pValue, scientific = T, digits = 3)]
+res[, padjust := format(padjust, scientific = T, digits = 3)]
 
 DT::datatable(
   head(res, 1000),
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index 69ee5a61..2a734101 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -63,12 +63,14 @@ if(length(exCountIDs) > 0){
         # junctions from the external counts.
         fds <- mergeExternalData(fds=fds, countFiles=ctsFiles,
                 sampleIDs=exSampleIDs, annotation=exAnno)
+        fds@colData$isExternal <- as.factor(!is.na(fds@colData$SPLICE_COUNTS_DIR))
     }
 } else {
     message("symLink fraser dir")
     file.symlink(paste0(workingDirIn, "savedObjects/","raw-", dataset),
                  paste0(workingDirOut, "savedObjects/","raw-", dataset))
     
+    fds@colData$isExternal <- as.factor(FALSE)
     workingDir(fds) <- workingDirOut
     name(fds) <- paste0("raw-", dataset)
 }
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index a5c64d2f..4a089697 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -35,14 +35,16 @@ fdsMerge <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
 
 has_external <- !(all(is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)) || is.null(fdsMerge@colData$SPLICE_COUNTS_DIR))
 if(has_external){
-    fdsMerge@colData$isExternal <- !is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)
+    fdsMerge@colData$isExternal <- as.factor(!is.na(fdsMerge@colData$SPLICE_COUNTS_DIR))
 }else{
-    fdsMerge@colData$isExternal <- FALSE
+    fdsMerge@colData$isExternal <- as.factor(FALSE)
 }
+devNull <- saveFraserDataSet(fdsMerge,dir=workingDir, name=paste0("raw-", dataset))
+
 
 #' ## Number of samples:   
-#' Local (fromBam): `r sum(!fdsMerge@colData$isExternal)`  
-#' External: `r sum(fdsMerge@colData$isExternal)`  
+#' Local (fromBam): `r sum(!as.logical(fdsMerge@colData$isExternal))`  
+#' External: `r sum(as.logical(fdsMerge@colData$isExternal))`  
 #' 
 #' **Using external counts**  
 #' External counts introduce some complexity into the problem of counting junctions
@@ -63,23 +65,24 @@ if(has_external){
 
 #' ### Comparison of local and external counts  
 if(has_external){
-externalCountIDs <- colData(fdsMerge)[colData(fdsMerge)[,"isExternal"],"sampleID"]
-localCountIDs <- colData(fdsMerge)[!colData(fdsMerge)[,"isExternal"],"sampleID"]
+    externalCountIDs <- colData(fdsMerge)[as.logical(colData(fdsMerge)[,"isExternal"]),"sampleID"]
+    localCountIDs <- colData(fdsMerge)[!as.logical(colData(fdsMerge)[,"isExternal"]),"sampleID"]
 
-cts <- K(fdsMerge,"psi5")
-ctsLocal<- cts[,localCountIDs]
-ctsExt<- cts[,externalCountIDs]
+    cts <- K(fdsMerge,"psi5")
+    ctsLocal<- cts[,localCountIDs]
+    ctsExt<- cts[,externalCountIDs]
 
-rowlgmLocal <- rowMeans(log(ctsLocal + 1))
-rowlgmExt <- rowMeans(log(ctsExt + 1))
+    rowMeanLocal <- rowMeans(ctsLocal)
+    rowMeanExt <- rowMeans(ctsExt)
 
-dt <- data.table("Local log mean counts" = rowlgmLocal,
-                 "External log mean counts" = rowlgmExt)
+    dt <- data.table("Local log mean counts" = rowMeanLocal,
+                 "External log mean counts" = rowMeanExt)
                  
-ggplot(dt,aes(x = `Local log mean counts`, y= `External log mean counts`)) +
-   geom_point() + theme_cowplot(font_size = 16) +
-   geom_abline(slope = 1, intercept =0) +
-   scale_color_brewer(palette="Dark2") 
+    ggplot(dt,aes(x = `Local log mean counts`, y= `External log mean counts`)) +
+       geom_hex() + theme_cowplot(font_size = 16) +
+	   theme_bw() + scale_x_log10() + scale_y_log10() + 
+       geom_abline(slope = 1, intercept =0) +
+       scale_color_brewer(palette="Dark2") 
 }else{
 	print("No external counts, comparison is ommitted")
 }
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
index 4048d6ba..d4ddb231 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
@@ -74,7 +74,7 @@ for(type in psiTypes){
     topJ = topJ,
     plotType = "sampleCorrelation",
     normalized = FALSE,
-    annotation_col = NA,
+    annotation_col = "isExternal",
     annotation_row = NA,
     sampleCluster = NA,
     minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi,
@@ -91,7 +91,7 @@ for(type in psiTypes){
     topJ = topJ,
     plotType = "sampleCorrelation",
     normalized = TRUE,
-    annotation_col = NA,
+    annotation_col = "isExternal",
     annotation_row = NA,
     sampleCluster = NA,
     minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi,
diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R
index 51527afe..1607902e 100644
--- a/drop/modules/mae-pipeline/MAE/Results.R
+++ b/drop/modules/mae-pipeline/MAE/Results.R
@@ -149,14 +149,21 @@ ggplot(melt_dt, aes(variable, value)) + geom_boxplot() +
     annotation_logticks(sides = "l")
 
 #'
-#' ## Variant Frequency within Cohort Histogram
+#' ## Variant Frequency within Cohort
 ggplot(unique(res[,cohort_freq,by =.(gene_name, contig, position)]),aes(x = cohort_freq)) + geom_histogram( binwidth = 0.02)  +
-  geom_vline(xintercept = maxCohortFreq, col = "red") +
-  xlab("Variant frequency in cohort") + ylab("Count")
+  geom_vline(xintercept = maxCohortFreq, col = "red",linetype="dashed") + theme_bw(base_size = 14) +
+  xlim(0,1) + xlab("Variant frequency in cohort") + ylab("Variants")
 
 #' Median of each category
 DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable])
 
+
+# round numbers
+if(nrow(res) > 0){
+  res[, pValue := signif(pValue, 3)]
+  res[, padjust := signif(padjust, 3)]
+  res[, log2FC := signif(deltaPsi, 3)]
+}
 #' 
 #' ## MAE Results table
 DT::datatable(

From 46df8271ca10364542a2c7cc24d4d8b33fccaef0 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 13 Apr 2022 16:02:37 +0200
Subject: [PATCH 58/65] update docs

---
 docs/source/output.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 3db5faf7..211444fe 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -72,11 +72,13 @@ tab at the top of the screen. Following that the Overview tab contains links to
         * For each of these files you can follow the `FRASER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
     * tsv files
         * For each aberrant splicing group
-            * results.tsv 
-                * this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
             * results_per_junction.tsv 
                 * this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the junction level. 
 
+Local result files
+##################
+Additionally the ``aberrantSplicing`` module creates the following file ``Output/processed_results/aberrant_splicing/results/{annotation}/fraser/{drop_group}/results.tsv``. 
+This tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
 
 Mono-allelic Expression
 +++++++++++++++++++++++

From 4264ff006e7675450e2610a34ef3a74d215fd6d4 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 13 Apr 2022 17:35:00 +0200
Subject: [PATCH 59/65] html outputs

---
 .../aberrant-expression-pipeline/Counting/Summary.R | 13 ++-----------
 .../Counting/filterCounts.R                         |  8 ++++++++
 drop/modules/mae-pipeline/MAE/Results.R             |  8 ++++----
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index 917729ad..63cd3a5c 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -33,17 +33,8 @@ suppressPackageStartupMessages({
 
 ods <- readRDS(snakemake@input$ods)
 
-has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE))
-if(has_external){
-    ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "")
-}else{
-    ods@colData$isExternal <- as.factor(FALSE)
-}
-
-# save ods with isExternal column
-saveRDS(ods,snakemake@input$ods)
-
-cnts_mtx_local <- counts(ods, normalized = F)[,!ods@colData$isExternal]
+has_external <- any(as.logical(colData(ods)$isExternal))
+cnts_mtx_local <- counts(ods, normalized = F)[,!as.logical(ods@colData$isExternal)]
 cnts_mtx <- counts(ods, normalized = F)
 
 #' ## Number of samples:  
diff --git a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R
index c1a58e70..82405251 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R
@@ -36,5 +36,13 @@ ods <- filterExpression(ods, gtfFile=txdb, filter=FALSE,
 # add column for genes with at least 1 gene
 rowData(ods)$counted1sample = rowSums(assay(ods)) > 0
 
+has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE))
+if(has_external){
+    ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "")
+}else{
+    ods@colData$isExternal <- as.factor(FALSE)
+}
+
+
 # Save the ods before filtering to preserve the original number of genes
 saveRDS(ods, snakemake@output$ods)
diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R
index 1607902e..0d0ba8c9 100644
--- a/drop/modules/mae-pipeline/MAE/Results.R
+++ b/drop/modules/mae-pipeline/MAE/Results.R
@@ -152,7 +152,7 @@ ggplot(melt_dt, aes(variable, value)) + geom_boxplot() +
 #' ## Variant Frequency within Cohort
 ggplot(unique(res[,cohort_freq,by =.(gene_name, contig, position)]),aes(x = cohort_freq)) + geom_histogram( binwidth = 0.02)  +
   geom_vline(xintercept = maxCohortFreq, col = "red",linetype="dashed") + theme_bw(base_size = 14) +
-  xlim(0,1) + xlab("Variant frequency in cohort") + ylab("Variants")
+  xlim(0,1.5) + xlab("Variant frequency in cohort") + ylab("Variants")
 
 #' Median of each category
 DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable])
@@ -160,9 +160,9 @@ DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable])
 
 # round numbers
 if(nrow(res) > 0){
-  res[, pValue := signif(pValue, 3)]
-  res[, padjust := signif(padjust, 3)]
-  res[, log2FC := signif(deltaPsi, 3)]
+  res[, pvalue := signif(pvalue, 3)]
+  res[, padj := signif(padj, 3)]
+  res[, log2FC := signif(log2FC, 3)]
 }
 #' 
 #' ## MAE Results table

From 9daef0e64b83cb1a09056ee4d9d539028445cf61 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Wed, 13 Apr 2022 17:54:07 +0200
Subject: [PATCH 60/65] MAE plot xlim

---
 drop/modules/mae-pipeline/MAE/Results.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R
index 0d0ba8c9..a2c095ab 100644
--- a/drop/modules/mae-pipeline/MAE/Results.R
+++ b/drop/modules/mae-pipeline/MAE/Results.R
@@ -152,7 +152,7 @@ ggplot(melt_dt, aes(variable, value)) + geom_boxplot() +
 #' ## Variant Frequency within Cohort
 ggplot(unique(res[,cohort_freq,by =.(gene_name, contig, position)]),aes(x = cohort_freq)) + geom_histogram( binwidth = 0.02)  +
   geom_vline(xintercept = maxCohortFreq, col = "red",linetype="dashed") + theme_bw(base_size = 14) +
-  xlim(0,1.5) + xlab("Variant frequency in cohort") + ylab("Variants")
+  xlim(0,NA) + xlab("Variant frequency in cohort") + ylab("Variants")
 
 #' Median of each category
 DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable])

From 5917caacfb6a969430be2523affcfd6ee4c50ef4 Mon Sep 17 00:00:00 2001
From: Nick <smithnickh@gmail.com>
Date: Fri, 22 Apr 2022 16:14:32 +0200
Subject: [PATCH 61/65] code-review fixes

---
 docs/source/output.rst                        | 118 ++++++++----------
 drop/config/submodules/AberrantSplicing.py    |   4 +-
 .../Counting/Summary.R                        |   4 +-
 .../OUTRIDER/Summary.R                        |  21 ++--
 .../Counting/01_0_countRNA_init.R             |   8 +-
 .../01_1_countRNA_splitReads_samplewise.R     |   8 +-
 .../Counting/01_2_countRNA_splitReads_merge.R |  14 +--
 .../01_3_countRNA_nonSplitReads_samplewise.R  |   8 +-
 .../01_4_countRNA_nonSplitReads_merge.R       |  12 +-
 .../Counting/01_5_countRNA_collect.R          |  12 +-
 .../02_psi_value_calculation_FraseR.R         |   8 +-
 .../Counting/03_filter_expression_FraseR.R    |  35 +++---
 .../Counting/Summary.R                        |  28 ++---
 .../Counting/exportCounts.R                   |   2 +-
 .../FRASER/04_fit_hyperparameters_FraseR.R    |   6 +-
 .../FRASER/05_fit_autoencoder_FraseR.R        |   6 +-
 .../FRASER/06_calculation_stats_AE_FraseR.R   |   6 +-
 .../FRASER/07_extract_results_FraseR.R        |   8 +-
 .../FRASER/Summary.R                          |  11 +-
 drop/modules/mae-pipeline/MAE/Results.R       |  29 +++--
 .../Scripts/AberrantSplicing/Overview.R       |   2 +-
 tests/config/test_AS.py                       |   4 +-
 tests/pipeline/test_AS.py                     |   2 +-
 23 files changed, 170 insertions(+), 186 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 211444fe..67dae4e3 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -3,8 +3,8 @@ Results and Output of DROP
 
 DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression,
 aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide
-easy to read and interpret html files and output files. This section is dedicated to explaining the relevant
-results files. We will use the results of the ``demo`` to explain the files generated.::
+easy to read and interpret HTML files and output files. This section explains the relevant
+results files. The paths of the output files correspond to those of the demo, which can be run with the following code snippet::
 
     #install drop
     mamba create -n drop_env -c conda-forge -c bioconda drop
@@ -19,101 +19,85 @@ results files. We will use the results of the ``demo`` to explain the files gene
 Aberrant Expression
 +++++++++++++++++++
 
-html file
+HTML file
 #########
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantExpression`` 
-tab at the top of the screen. Following that the Overview tab contains links to the:  
+tab at the top of the screen. The Overview tab contains links to the:  
 
-* Counting Summaries 
+* Counts Summaries 
     * For each aberrant expression group
-        * split of local vs external sample counts
+        * number of local and external samples
         * QC relating to reads and size factors for each sample
-        * histograms relating to mean count distribution with different conditions
-        * information about the expressed genes within each sample and as a dataset
-* Outrider Summaries
-    * For each aberrant expression group
-        * the number of aberrantly expressed gene per sample
-        * how batch correction is done and the resulting lack of batch effects
-        * which samples contain outliers
-        * results table
-* Files
-    * OUTRIDER files for each aberrant expression group
-        * For each of these files you can follow the `OUTRIDER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_. 
-    * tsv files
-        * For each aberrant expression group
-            * results.tsv
-                * this tsv file contains only the significant genes and samples that meet the cutoffs defined in the ``config.yaml`` for ``padjCutoff`` and ``zScoreCutoff``
+        * histograms showing the mean count distribution with different conditions
+        * expressed genes within each sample and as a dataset
+* OUTRIDER Summaries for each aberrant expression group
+    * aberrantly expressed genes per sample
+    * correlation between samples before and after the autoencoder
+    * biological coefficient of variation plot
+    * aberrant samples
+    * results table
+* Files for each aberrant expression group
+    * OUTRIDER datasets 
+        * Follow the `OUTRIDER vignette <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_ for individual OUTRIDER object file (ods) analysis.
+    * Results tables
+        * ``results.tsv``: this tsv file contains only the significant genes and samples that meet the cutoffs defined in the ``config.yaml`` for ``padjCutoff`` and ``zScoreCutoff``
 
 Local result files
 ##################
-Additionally the ``aberrantExpression`` module creates the file ``Output/processed_results/aberrant_expression/{annotation}/outrider/{drop_group}/OUTRIDER_results_all.Rds`` this file is the Rds object containing the entire OUTRIDER results table regardless of significance.
+Additionally the ``aberrantExpression`` module creates the file ``Output/processed_results/aberrant_expression/{annotation}/outrider/{drop_group}/OUTRIDER_results_all.Rds``. This file contains the entire OUTRIDER results table regardless of significance.
 
 Aberrant Splicing
 +++++++++++++++++
 
-html file
+HTML file
 ##########
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantSplicing`` 
-tab at the top of the screen. Following that the Overview tab contains links to the:  
+tab at the top of the screen. The Overview tab contains links to the:  
 
-* Counting Summaries 
-    * For each aberrant splicing group
-        * split of local (from internal BAM files) vs external sample counts
-        * split of local vs merged with external sample splicing/intron counts
-        * comparison of local and external log mean counts
-        * histograms relating to junction expression before and after filtering and variability
-* FRASER Summaries
-    * For each aberrant splicing group
-        * the number of samples, introns, and splice sites 
-        * how batch correction is done and the resulting lack of batch effects
-        * result table
-* Files
-    * FRASER files for each aberrant splicing group
-        * For each of these files you can follow the `FRASER vignette for individual analysis <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_. 
-    * tsv files
-        * For each aberrant splicing group
-            * results_per_junction.tsv 
-                * this tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the junction level. 
+* Counting Summaries for each aberrant splicing group
+    * number of local and external samples
+    * number of introns/splice sites before and after merging
+    * comparison of local and external mean counts
+    * histograms showing the junction expression before and after filtering and variability
+* FRASER Summaries for each aberrant splicing group
+    * the number of samples, introns, and splice sites 
+    * how batch correction is done and the resulting lack of batch effects
+    * result table
+* Files for each aberrant splicing group
+    * FRASER datasets (fds)
+        * Follow the `FRASER vignette <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_ for individual FRASER object file (fds) analysis.
+    * Results tables
+        * ``results_per_junction.tsv``: this tsv file contains only significant junctions that meet the cutoffs defined in the config file. They are aggregated at the junction level.
 
 Local result files
 ##################
-Additionally the ``aberrantSplicing`` module creates the following file ``Output/processed_results/aberrant_splicing/results/{annotation}/fraser/{drop_group}/results.tsv``. 
-This tsv file contains only significant junctions that meet the cutoffs defined in the ``config.yaml`` they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
+Additionally the ``aberrantSplicing`` module creates the following file ``Output/processed_results/aberrant_splicing/results/{annotation}/fraser/{drop_group}/results.tsv``.
+This tsv file contains only significant junctions that meet the cutoffs defined in the config file; they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
 
 Mono-allelic Expression
 +++++++++++++++++++++++
 
-html file
+HTML file
 ##########
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``MonoallelicExpression`` 
-tab at the top of the screen. Following that the Overview tab contains links to the:  
+tab at the top of the screen. The Overview tab contains links to the:  
 
-* Results
-    * For each mae group
-        * the number of samples, unique genes, and aberrant events
-        * a cascade plot that shows additional filters
-            * MAE for REF: the monoallelic expression favors the reference allele 
-            * MAE for ALT: the monoallelic expression favors the alternative allele 
-            * rare: 
-                * if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``
-                * additionally it must meet the inner-cohort frequency ``maxVarFreqCohort`` cutoff
-        * histogram of inner cohort frequency
-        * summary of cascade plots and results table
-* Files
+* Results for each mae group
+    * the number of samples, unique genes, and aberrant events
+    * a cascade plot that shows additional filters
+    * histogram of inner cohort frequency
+    * summary of cascade plots and results table
+* Files for each mae group
     * Allelic counts
         * a directory containing the allelic counts of heterozygous variants
     * Results data tables of each sample (.Rds)
         * Rds objects containing the full results table regardless of MAE status
     * Significant MAE results tables
-        * For each mae group
-            * a link to the results tsv file.
-            * Only contains significant MAE results based on ``config.yaml`` cutoffs for the alternative allele
+        * a link to the results tsv file.
+        * Only contains significant MAE results for the alternative allele that pass the config file cutoffs
 * Quality Control
     * QC Overview
         * For each mae group QC checks for DNA/RNA matching
-* Analyze Individual Results
-    * An example analaysis that can be run using the Rds objects linked in the files subsection
-    * performed on the first mae sample 
     
 Local result files
 ##################
@@ -122,8 +106,6 @@ Additionally the ``mae`` module creates the following files:
 * ``Output/processed_results/mae/{drop_group}/MAE_results_all_v29.tsv.gz``
     * this file is the tsv results of all heterozygous variants regardless of significance
 * ``Output/processed_results/mae/{drop_group}/MAE_results_v29.tsv``
-    * this is the file linked in the html document and described above
+    * this is the file linked in the HTML document and described above
 * ``Output/processed_results/mae/{drop_group}/MAE_results_v29_rare.tsv``
-    * this file is the subsetted tsv of ``MAE_results_v29.tsv`` with only the variants that pass the rare cutoffs
-        * if ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``
-        * inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff
\ No newline at end of file
+    * this file is the subsetted tsv of ``MAE_results_v29.tsv`` with only the variants that pass the rare cutoffs. If ``add_AF`` is set to true in ``config.yaml``, variants must meet the allele frequency cutoff set by ``max_AF``. Additionally, the inner-cohort frequency must meet the ``maxVarFreqCohort`` cutoff
diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py
index fc980ed4..228ba775 100644
--- a/drop/config/submodules/AberrantSplicing.py
+++ b/drop/config/submodules/AberrantSplicing.py
@@ -55,7 +55,7 @@ def getSplitCountFiles(self, dataset):
         :return: list of files
         """
         ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA")
-        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "fromBam" / "cache" / f"raw-{dataset}" / \
+        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-local-{dataset}" / \
                      "sample_tmp" / "splitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
@@ -67,7 +67,7 @@ def getNonSplitCountFiles(self, dataset):
         :return: list of files
         """
         ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA")
-        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "fromBam" / "cache" / f"raw-{dataset}" / \
+        file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-local-{dataset}" / \
                      "sample_tmp" / "nonSplitCounts"
         done_files = str(file_stump / "sample_{sample_id}.done")
         return expand(done_files, sample_id=ids)
diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
index 63cd3a5c..240e1682 100644
--- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -44,7 +44,7 @@ cnts_mtx <- counts(ods, normalized = F)
 #' # Count Quality Control
 #' 
 #' Compare number of records vs. read counts  
-#' `The Obtained Read Count Ratio` plot does not include external counts
+#' The `Obtained Read Count Ratio` plot does not include external counts
 #' because there are no raw reads to be counted.
 #' 
 bam_coverage <- fread(snakemake@input$bam_cov)
@@ -111,7 +111,7 @@ plot_grid(p_sf, p_sf_cov)
 #' # Filtering
 #' **local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts  
 #' **all**: A pre-filtered summary of counts using only the merged local (from BAM) and external counts  
-#' **passed_FPKM**: Passes the user defined FPKM cutoff in at least 5% of genes  
+#' **passed FPKM**: Passes the user defined FPKM cutoff in at least 5% of genes  
 #' **min 1 read**: minimum of 1 read expressed in 5% of genes  
 #' **min 10 reads**: minimum of 10 reads expressed in 5% of genes  
 
diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
index 8ba71c71..36fd13de 100644
--- a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
+++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
@@ -52,7 +52,7 @@ plotEncDimSearch(ods) +
   labs(title = dataset_title) +
   theme_cowplot() +
   background_grid() +
-  scale_color_brewer(palette = "Set1")
+  scale_color_brewer(palette="Dark2")
 
 
 #' ### Aberrantly expressed genes per sample
@@ -63,18 +63,18 @@ plotAberrantPerSample(ods, main = dataset_title,
 
 #' ### Batch correction
 #+ countCorHeatmap, fig.height=8, fig.width=8
-plotCountCorHeatmap(ods, normalized = FALSE, colGroups = "isExternal",
+plotCountCorHeatmap(ods, normalized = FALSE, colGroups = "isExternal", colColSet = "Dark2",
                     main = paste0('Raw Counts (', dataset_title, ')'))
-plotCountCorHeatmap(ods, normalized = TRUE, ,colGroups = "isExternal",
+plotCountCorHeatmap(ods, normalized = TRUE, ,colGroups = "isExternal", colColSet = "Dark2",
                     main = paste0('Normalized Counts (', dataset_title, ')'))
 
 
 #' ### Expression by gene per sample
-#+ geneSampleHeatmap, fig.height=12, fig.width=6
-plotCountGeneSampleHeatmap(ods, normalized = FALSE, nGenes = 50, colGroups = "isExternal",
+#+ geneSampleHeatmap, fig.height=12, fig.width=8
+plotCountGeneSampleHeatmap(ods, normalized = FALSE, nGenes = 50, colGroups = "isExternal", colColSet = "Dark2",
                            main = paste0('Raw Counts (', dataset_title, ')'),
                            bcvQuantile = .95, show_names = 'row')
-plotCountGeneSampleHeatmap(ods, normalized = TRUE, nGenes = 50, colGroups = "isExternal",
+plotCountGeneSampleHeatmap(ods, normalized = TRUE, nGenes = 50, colGroups = "isExternal", colColSet = "Dark2",
                            main = paste0('Normalized Counts (',dataset_title,')'),
                            bcvQuantile = .95, show_names = 'row')
 
@@ -110,13 +110,14 @@ ggplot(bcv_dt, aes(when, BCV)) +
 #' ## Results
 res <- fread(snakemake@input$results)
 
-#' Samples with at least one outlier gene: `r res[, uniqueN(sampleID)]`
-#'
+#' Samples with at least one outlier gene: `r res[, uniqueN(sampleID)]`  
+
 #' ### Aberrant samples
+#' An aberrant sample is one that has more than 0.1% of the total genes called as outliers.
 if (nrow(res) > 0) {
-  ab_table <- res[AberrantBySample > nrow(ods)/1000, .N, by = .(sampleID)] %>% unique
+  ab_table <- res[AberrantBySample > nrow(ods)/1000, .("Outlier genes" = .N), by = .(sampleID)] %>% unique
   if (nrow(ab_table) > 0) {
-    setorder(ab_table, N) 
+    setorder(ab_table, "Outlier genes") 
     DT::datatable(ab_table)
   } else {
     print("no aberrant samples")
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R
index c081c8d4..41e2ae5f 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R
@@ -6,15 +6,15 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_0_init.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  input:
 #'    - colData: '`sm cfg.getProcessedDataDir() + 
 #'                    "/aberrant_splicing/annotations/{dataset}.tsv"`'
 #'  output:
 #'   - fdsobj:  '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/fds-object.RDS"`'
+#'                   "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/fds-object.RDS"`'
 #'   - done_fds: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/fds.done" `'
+#'                "/aberrant_splicing/datasets/cache/raw-local-{dataset}/fds.done" `'
 #'  type: script
 #'---
 
@@ -31,7 +31,7 @@ col_data <- fread(colDataFile)
 
 fds <- FraserDataSet(colData = col_data,
                      workingDir = workingDir,
-                     name       = paste0("raw-", dataset))
+                     name       = paste0("raw-local-", dataset))
 
 # Add paired end and strand specificity to the fds
 pairedEnd(fds) <- colData(fds)$PAIRED_END
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
index ed63f8e2..36eeb798 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
@@ -6,13 +6,13 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "splitReads" / "{sample_id}.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  input:
 #'   - done_fds: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/fds.done"`'
+#'                "/aberrant_splicing/datasets/cache/raw-local-{dataset}/fds.done"`'
 #'  output:
 #'   - done_sample_splitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}"
+#'                "/aberrant_splicing/datasets/cache/raw-local-{dataset}"
 #'                +"/sample_tmp/splitCounts/sample_{sample_id}.done"`'
 #'  threads: 3
 #'  type: script
@@ -28,7 +28,7 @@ params <- snakemake@config$aberrantSplicing
 genomeAssembly <- snakemake@config$genomeAssembly
 
 # Read FRASER object
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 
 # Get sample id from wildcard
 sample_id <- snakemake@wildcards[["sample_id"]]
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
index 3f56d792..4fb08ce7 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R
@@ -6,19 +6,19 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_2_splitReadsMerge.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
 #'  threads: 20
 #'  input:
 #'   - sample_counts: '`sm lambda w: cfg.AS.getSplitCountFiles(w.dataset)`'
 #'  output:
 #'   - countsJ: '`sm cfg.getProcessedDataDir() +
-#'                          "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/rawCountsJ.h5"`'
+#'                          "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/rawCountsJ.h5"`'
 #'   - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_splitCounts.rds"`'
 #'   - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_NonSplitCounts.rds"`'
 #'   - spliceSites: '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
+#'                   "/aberrant_splicing/datasets/cache/raw-local-{dataset}/spliceSites_splitCounts.rds"`'
 #'  type: script
 #'---
 
@@ -36,11 +36,11 @@ register(MulticoreParam(snakemake@threads))
 setAutoBPPARAM(MulticoreParam(snakemake@threads))
 
 # Read FRASER object
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 
 # If samples are recounted, remove the merged ones
 splitCountsDir <- file.path(workingDir, "savedObjects", 
-                            paste0("raw-", dataset), 'splitCounts')
+                            paste0("raw-local-", dataset), 'splitCounts')
 if(params$recount == TRUE & dir.exists(splitCountsDir)){
   unlink(splitCountsDir, recursive = TRUE)
 }
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R
index 8f290094..ce983be5 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R
@@ -6,13 +6,13 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "nonsplitReads" / "{sample_id}.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
 #'  input:
 #'   - spliceSites: '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
+#'                   "/aberrant_splicing/datasets/cache/raw-local-{dataset}/spliceSites_splitCounts.rds"`'
 #'  output:
 #'   - done_sample_nonSplitCounts : '`sm cfg.getProcessedDataDir() + 
-#'                   "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' 
+#'                   "/aberrant_splicing/datasets/cache/raw-local-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' 
 #'  threads: 3
 #'  type: script
 #'---
@@ -26,7 +26,7 @@ workingDir <- snakemake@params$workingDir
 params <- snakemake@config$aberrantSplicing
 
 # Read FRASER object
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 
 # Get sample id from wildcard
 sample_id <- snakemake@wildcards[["sample_id"]]
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
index bffecc51..85c330b6 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R
@@ -6,17 +6,17 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_4_nonSplitReadsMerge.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
 #'  threads: 20
 #'  input:
 #'   - sample_counts:  '`sm lambda w: cfg.AS.getNonSplitCountFiles(w.dataset)`'
 #'   - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_NonSplitCounts.rds"`'
 #'  output:
 ###   - countsSS: '`sm cfg.getProcessedDataDir() +
-###                   "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/rawCountsSS.h5"`'
+###                   "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/rawCountsSS.h5"`'
 #'   - done:     '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/merge_theta.done"`'
+#'                "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/merge_theta.done"`'
 #'  type: script
 #'---
 
@@ -32,14 +32,14 @@ register(MulticoreParam(snakemake@threads))
 setAutoBPPARAM(MulticoreParam(snakemake@threads))
 
 # Read FRASER object
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 
 # Read splice site coordinates from RDS
 splitCounts_gRanges <- readRDS(snakemake@input$gRangesNonSplitCounts)
 
 # If samples are recounted, remove the merged ones
 nonSplitCountsDir <- file.path(workingDir, "savedObjects", 
-                            paste0("raw-", dataset), 'nonSplitCounts')
+                            paste0("raw-local-", dataset), 'nonSplitCounts')
 if(params$recount == TRUE & dir.exists(nonSplitCountsDir)){
   unlink(nonSplitCountsDir, recursive = TRUE)
 }
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
index 35c8fdf8..7724b478 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R
@@ -6,17 +6,17 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_5_collect.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`'
 #'  input:
 #'    - countsSSdone: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/merge_theta.done"`'
+#'                          "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/merge_theta.done"`'
 #'    - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/gRanges_splitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_splitCounts.rds"`'
 #'    - spliceSites: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/cache/raw-{dataset}/spliceSites_splitCounts.rds"`'
+#'                          "/aberrant_splicing/datasets/cache/raw-local-{dataset}/spliceSites_splitCounts.rds"`'
 #'  output:
 #'   - counting_done: '`sm cfg.getProcessedDataDir() + 
-#'                          "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/counting.done" `'
+#'                          "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/counting.done" `'
 #'  type: script
 #'---
 
@@ -28,7 +28,7 @@ workingDir <- snakemake@params$workingDir
 saveDir    <- dirname(snakemake@input$countsSSdone)
 
 # Read FRASER object
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 splitCounts_gRanges <- readRDS(snakemake@input$gRangesSplitCounts)
 spliceSiteCoords <- readRDS(snakemake@input$spliceSites)
 
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R
index aafa766c..3f35f6aa 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R
@@ -6,14 +6,14 @@
 #'   - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "02_PSIcalc.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  threads: 30
 #'  input:
 #'   - counting_done: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/counting.done" `'
+#'                "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/counting.done" `'
 #'  output:
 #'  - theta:     '`sm cfg.getProcessedDataDir() +
-#'                    "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/theta.h5"`'
+#'                    "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`'
 #'  type: script
 #'--- 
 
@@ -28,7 +28,7 @@ register(MulticoreParam(snakemake@threads))
 # Limit number of threads for DelayedArray operations
 setAutoBPPARAM(MulticoreParam(snakemake@threads))
 
-fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 
 # Calculating PSI values
 fds <- calculatePSIValues(fds)
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
index 2a734101..a9355b6b 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -6,18 +6,17 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "03_filter.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDirIn: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam/"`'
-#'   - workingDirOut: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'   - exCountIDs: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="SPLICE_COUNT")`'
 #'  input:
-#'   - theta:  '`sm cfg.getProcessedDataDir()+
-#'                  "/aberrant_splicing/datasets/fromBam/savedObjects/raw-{dataset}/theta.h5"`'
+#'   - theta:  '`sm cfg.getProcessedDataDir() +
+#'                  "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`'
 #'   - exCounts: '`sm lambda w: cfg.AS.getExternalCounts(w.dataset, "k_j_counts")`'
 #'  output:
 #'   - fds: '`sm cfg.getProcessedDataDir() +
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/fds-object.RDS"`'
-#'   - done: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
+#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`'
+#'   - done: '`sm cfg.getProcessedDataDir() +
+#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `'
 #'  threads: 3
 #'  type: script
 #'---
@@ -29,8 +28,7 @@ opts_chunk$set(fig.width=12, fig.height=8)
 
 # input
 dataset    <- snakemake@wildcards$dataset
-workingDirIn <- snakemake@params$workingDirIn
-workingDirOut <- snakemake@params$workingDirOut
+workingDir <- snakemake@params$workingDir
 params     <- snakemake@config$aberrantSplicing
 exCountIDs <- snakemake@params$exCountIDs
 exCountFiles <- snakemake@input$exCounts
@@ -38,7 +36,7 @@ sample_anno_file <- snakemake@config$sampleAnnotation
 minExpressionInOneSample <- params$minExpressionInOneSample
 minDeltaPsi <- params$minDeltaPsi
 
-fds <- loadFraserDataSet(dir=workingDirIn, name=paste0("raw-", dataset))
+fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 
 register(MulticoreParam(snakemake@threads))
 # Limit number of threads for DelayedArray operations
@@ -47,8 +45,7 @@ setAutoBPPARAM(MulticoreParam(snakemake@threads))
 # Add external data if provided by dataset
 if(length(exCountIDs) > 0){
     message("create new merged fraser object")
-    workingDir(fds) <- workingDirOut
-    fds <- saveFraserDataSet(fds,dir = workingDirOut, name=paste0("raw-", dataset))
+    fds <- saveFraserDataSet(fds,dir = workingDir, name=paste0("raw-", dataset))
 
     for(resource in unique(exCountFiles)){
         exSampleIDs <- exCountIDs[exCountFiles == resource]
@@ -67,25 +64,21 @@ if(length(exCountIDs) > 0){
     }
 } else {
     message("symLink fraser dir")
-    file.symlink(paste0(workingDirIn, "savedObjects/","raw-", dataset),
-                 paste0(workingDirOut, "savedObjects/","raw-", dataset))
+    file.symlink(paste0(workingDir, "savedObjects/","raw-local-", dataset),
+                 paste0(workingDir, "savedObjects/","raw-", dataset))
     
     fds@colData$isExternal <- as.factor(FALSE)
-    workingDir(fds) <- workingDirOut
+    workingDir(fds) <- workingDir
     name(fds) <- paste0("raw-", dataset)
 }
 
 # filter for expression and write it out to disc.
-# 
-# TODO:   This will brake a rerun of step 01_5_countRNA_collect.R as it writes 
-#         out the rawCountsJ and rawCountsSS file including the external samples. 
-# 
 fds <- filterExpressionAndVariability(fds, 
         minExpressionInOneSample = minExpressionInOneSample,
         minDeltaPsi = minDeltaPsi,
         filter=FALSE)
 
-devNull <- saveFraserDataSet(fds,dir = workingDirOut)
+devNull <- saveFraserDataSet(fds,dir = workingDir)
 
 # Keep junctions that pass filter
 name(fds) <- dataset
@@ -97,5 +90,5 @@ if (params$filter == TRUE) {
 
 seqlevels(fds) <- seqlevelsInUse(fds)
 colData(fds)$sampleID <- as.character(colData(fds)$sampleID)
-fds <- saveFraserDataSet(fds,dir = workingDirOut)
+fds <- saveFraserDataSet(fds,dir = workingDir)
 file.create(snakemake@output$done)
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
index 4a089697..b5d19e34 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R
@@ -6,11 +6,10 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "CountSummary.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
-#'   - workingDirLocal: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/fromBam/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  input:
 #'   - filter: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
+#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `'
 #'  output:
 #'   - wBhtml: '`sm config["htmlOutputPath"] + 
 #'                  "/AberrantSplicing/{dataset}_countSummary.html"`'
@@ -28,9 +27,8 @@ suppressPackageStartupMessages({
 #+ input
 dataset    <- snakemake@wildcards$dataset
 workingDir <- snakemake@params$workingDir
-workingDirLocal <- snakemake@params$workingDirLocal
 
-fdsLocal <- loadFraserDataSet(dir=workingDirLocal, name=paste0("raw-", dataset))
+fdsLocal <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset))
 fdsMerge <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
 
 has_external <- !(all(is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)) || is.null(fdsMerge@colData$SPLICE_COUNTS_DIR))
@@ -43,23 +41,23 @@ devNull <- saveFraserDataSet(fdsMerge,dir=workingDir, name=paste0("raw-", datase
 
 
 #' ## Number of samples:   
-#' Local (fromBam): `r sum(!as.logical(fdsMerge@colData$isExternal))`  
+#' Local: `r sum(!as.logical(fdsMerge@colData$isExternal))`  
 #' External: `r sum(as.logical(fdsMerge@colData$isExternal))`  
 #' 
 #' **Using external counts**  
 #' External counts introduce some complexity into the problem of counting junctions
-#' because it is ambiguous whether or not a junction is not counted (because there are no reads)
+#' because it is unknown whether or not a junction is not counted (because there are no reads)
 #' compared to filtered and not present due to legal/personal sharing reasons. As a result,
-#' after merging the local (fromBam) counts and the external counts, only the junctions that are exactly
-#' the same in both remain. As a result it is likely that the number of junctions will decrease after a merge.
+#' after merging the local (counted from BAM files) counts and the external counts, only the junctions that are 
+#' present in both remain. As a result it is likely that the number of junctions will decrease after merging.
 #' 
 #' 
 #' ### Number of introns (psi5 or psi3) before and after merging:  
-#' Local (fromBam): `r length(rowRanges(fdsLocal, type = "psi5"))`  
-#' Merged : `r length(rowRanges(fdsMerge, type = "psi5"))`  
+#' Local: `r length(rowRanges(fdsLocal, type = "psi5"))`  
+#' Merged: `r length(rowRanges(fdsMerge, type = "psi5"))`  
 #' 
 #' ### Number of splice sites (theta) before and after merging: 
-#' Local (fromBam): `r length(rowRanges(fdsLocal, type = "theta"))`  
+#' Local: `r length(rowRanges(fdsLocal, type = "theta"))`  
 #' Merged: `r length(rowRanges(fdsMerge, type = "theta"))`  
 #' 
 
@@ -75,10 +73,10 @@ if(has_external){
     rowMeanLocal <- rowMeans(ctsLocal)
     rowMeanExt <- rowMeans(ctsExt)
 
-    dt <- data.table("Local log mean counts" = rowMeanLocal,
-                 "External log mean counts" = rowMeanExt)
+    dt <- data.table("Mean counts of local samples" = rowMeanLocal,
+                     "Mean counts of external samples" = rowMeanExt)
                  
-    ggplot(dt,aes(x = `Local log mean counts`, y= `External log mean counts`)) +
+    ggplot(dt,aes(x = `Mean counts of local samples`, y= `Mean counts of external samples`)) +
        geom_hex() + theme_cowplot(font_size = 16) +
 	   theme_bw() + scale_x_log10() + scale_y_log10() + 
        geom_abline(slope = 1, intercept =0) +
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
index 4c5f0d98..052a9c2e 100644
--- a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
+++ b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
@@ -9,7 +9,7 @@
 #'  input:
 #'   - annotation: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`'
 #'   - fds_theta: '`sm cfg.getProcessedDataDir() + 
-#'                    "/aberrant_splicing/datasets/merged/savedObjects/raw-{dataset}/theta.h5"`'
+#'                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
 #'  output:
 #'    - k_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/k_{metric}_counts.tsv.gz", metric=["j", "theta"])`'
 #'    - n_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/n_{metric}_counts.tsv.gz", metric=["psi5", "psi3", "theta"])`'
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R
index 573902c3..1a99b259 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/04_fit_hyperparameters_FraseR.R
@@ -6,14 +6,14 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "04_hyper.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  threads: 12
 #'  input:
 #'   - filter: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/filter.done" `'
+#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `'
 #'  output:
 #'   - hyper: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/hyper.done" `'
+#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R
index bd6ad7d0..f5e760c8 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/05_fit_autoencoder_FraseR.R
@@ -6,14 +6,14 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "05_fit.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  threads: 20
 #'  input:
 #'   - hyper: '`sm cfg.getProcessedDataDir() + 
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/hyper.done" `'
+#'                "/aberrant_splicing/datasets/savedObjects/{dataset}/hyper.done" `'
 #'  output:
 #'   - fdsout: '`sm cfg.getProcessedDataDir() + 
-#'                  "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/predictedMeans_theta.h5"`'
+#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/predictedMeans_theta.h5"`'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R
index 3046ac4c..84b222cb 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/06_calculation_stats_AE_FraseR.R
@@ -6,15 +6,15 @@
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "06_stats.Rds")`'
 #'  params:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  threads: 20
 #'  input:
 #'   - fdsin:  '`sm cfg.getProcessedDataDir() + 
-#'                  "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/" +
+#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/" +
 #'                  "predictedMeans_theta.h5"`'
 #'  output:
 #'   - fdsout: '`sm cfg.getProcessedDataDir() + 
-#'                  "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/" +
+#'                  "/aberrant_splicing/datasets/savedObjects/{dataset}/" +
 #'                  "padjBetaBinomial_theta.h5"`'
 #'  type: script
 #'---
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
index 4b9dace7..a4323d79 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
@@ -5,8 +5,8 @@
 #'  log:
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}--{annotation}" / "07_results.Rds")`'
 #'  params:
-#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/merged/"`'
-#'   - outputDir: '`sm cfg.getProcessedResultsDir() + "/aberrant_splicing/datasets/merged/"`'
+#'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
+#'   - outputDir: '`sm cfg.getProcessedResultsDir() + "/aberrant_splicing/datasets/"`'
 #'   - padjCutoff: '`sm cfg.AS.get("padjCutoff")`'
 #'   - zScoreCutoff: '`sm cfg.AS.get("zScoreCutoff")`'
 #'   - deltaPsiCutoff: '`sm cfg.AS.get("deltaPsiCutoff")`'
@@ -16,7 +16,7 @@
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - add_HPO_cols: '`sm str(projectDir / ".drop" / "helpers" / "add_HPO_cols.R")`'
 #'   - fdsin: '`sm cfg.getProcessedDataDir() +
-#'                 "/aberrant_splicing/datasets/merged/savedObjects/{dataset}/" +
+#'                 "/aberrant_splicing/datasets/savedObjects/{dataset}/" +
 #'                 "padjBetaBinomial_theta.h5"`'
 #'   - txdb: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`'
 #'   - gene_name_mapping: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/gene_name_mapping_{annotation}.tsv"`'
@@ -26,7 +26,7 @@
 #'   - resultTableGene: '`sm cfg.getProcessedResultsDir() +
 #'                          "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results.tsv"`'
 #'   - fds: '`sm cfg.getProcessedResultsDir() +
-#'                 "/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
+#'                 "/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
 #'  type: script
 #'---
 
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
index d4ddb231..91343626 100644
--- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
+++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R
@@ -8,7 +8,7 @@
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'  input:
 #'   - fdsin: '`sm cfg.getProcessedResultsDir() + 
-#'                 "/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
+#'                 "/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/fds-object.RDS"`'
 #'   - results: '`sm cfg.getProcessedResultsDir() + 
 #'                   "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results.tsv"`'
 #'  output:
@@ -24,6 +24,7 @@ source(snakemake@params$setup, echo=FALSE)
 
 suppressPackageStartupMessages({
   library(cowplot)
+  library("RColorBrewer")
 })
 
 #+ input
@@ -35,6 +36,7 @@ deltaPsi_cutoff <- snakemake@config$aberrantSplicing$deltaPsiCutoff
 
 
 fds <- loadFraserDataSet(file=snakemake@input$fdsin)
+hasExternal <- length(levels(colData(fds)$isExternal)) > 1  # TRUE only when isExternal has more than one level
 
 #' Number of samples: `r nrow(colData(fds))`
 #' 
@@ -65,6 +67,7 @@ plotAberrantPerSample(fds, padjCutoff = padj_cutoff, zScoreCutoff = zScore_cutof
 #' ## Batch Correlation: samples x samples
 topN <- 30000
 topJ <- 10000
+anno_color_scheme <- brewer.pal(n = 3, name = 'Dark2')[1:2]
 for(type in psiTypes){
   before <- plotCountCorHeatmap(
     object=fds,
@@ -80,7 +83,8 @@ for(type in psiTypes){
     minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi,
     plotMeanPsi=FALSE,
     plotCov = FALSE,
-    annotation_legend = TRUE
+    annotation_legend = TRUE,
+	annotation_colors = list(isExternal = c("FALSE" = anno_color_scheme[1],"TRUE" =  anno_color_scheme[2]))
   )
   before
   after <- plotCountCorHeatmap(
@@ -97,7 +101,8 @@ for(type in psiTypes){
     minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi,
     plotMeanPsi=FALSE,
     plotCov = FALSE,
-    annotation_legend = TRUE
+    annotation_legend = TRUE,
+	annotation_colors = list(isExternal = c("FALSE" = anno_color_scheme[1],"TRUE" =  anno_color_scheme[2]))
   )
   after
 }
diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R
index a2c095ab..d11aa255 100644
--- a/drop/modules/mae-pipeline/MAE/Results.R
+++ b/drop/modules/mae-pipeline/MAE/Results.R
@@ -120,20 +120,16 @@ fwrite(res[MAE_ALT == TRUE & rare == TRUE], snakemake@output$res_signif_rare,
 
 # Add columns for plot
 res[, N := .N, by = ID]
-res[,c("N_MAE","N_MAE_REF","N_MAE_ALT","N_MAE_REF_RARE","N_MAE_ALT_RARE") := 0,by = ID]
-res[MAE == TRUE, N_MAE := .N, by = ID]
-res[MAE == TRUE & MAE_ALT == FALSE, N_MAE_REF := .N, by = ID]
-res[MAE_ALT == TRUE, N_MAE_ALT := .N, by = ID]
-res[MAE == TRUE & MAE_ALT == FALSE & rare == TRUE, N_MAE_REF_RARE := .N, by = ID]
-res[MAE_ALT == TRUE & rare == TRUE, N_MAE_ALT_RARE := .N, by = ID]
+plot_res <- res[, .(N = .N,  # per-ID variant counts for each cumulative filter of the cascade plot
+             N_MAE = sum(MAE == TRUE),
+             N_MAE_REF = sum(MAE == TRUE & MAE_ALT == FALSE),
+             N_MAE_ALT = sum(MAE_ALT == TRUE),
+             N_MAE_REF_RARE = sum(MAE == TRUE & MAE_ALT == FALSE & rare == TRUE),
+             N_MAE_ALT_RARE = sum(MAE_ALT == TRUE & rare == TRUE)  # TRUE/FALSE spelled out: T and F are reassignable
+             ), by = ID]
 
-rd <- unique(res[,.(ID, N, N_MAE, N_MAE_REF, N_MAE_ALT, N_MAE_REF_RARE, N_MAE_ALT_RARE)])
 
-# rd contains duplicate entries for each ID. IE when MAE==F N_MAE for ID1 is both .N and 0
-# summarize these duplicates by taking the maximum of each column for each ID
-rd <- rd %>% group_by(ID) %>% summarize_all(max) %>% as.data.table()
-
-melt_dt <- melt(rd, id.vars = 'ID')
+melt_dt <- melt(plot_res, id.vars = 'ID')
 melt_dt[variable == 'N', variable := '>10 counts']
 melt_dt[variable == 'N_MAE', variable := '+MAE']
 melt_dt[variable == 'N_MAE_REF', variable := '+MAE for\nREF']
@@ -143,6 +139,15 @@ melt_dt[variable == 'N_MAE_ALT_RARE', variable := '+MAE for ALT\n& rare']
 
 #' 
 #' ## Cascade plot 
+#' A cascade plot that shows the effect of progressively added filters:  
+#'   - >10 counts: only variants supported by more than 10 counts
+#'   - +MAE: variants that additionally show mono-allelic expression
+#'   - +MAE for REF: the mono-allelic expression favors the reference allele
+#'   - +MAE for ALT: the mono-allelic expression favors the alternative allele
+#'   - rare: 
+#'     - if `add_AF` is set to true in the config file, the allele frequency must not exceed the config value `max_AF`
+#'     - the inner-cohort frequency must meet the `maxVarFreqCohort` cutoff
+
 ggplot(melt_dt, aes(variable, value)) + geom_boxplot() +
   scale_y_log10(limits = c(1,NA)) + theme_bw(base_size = 14) +
   labs(y = 'Heterozygous SNVs per patient', x = '') + 
diff --git a/drop/template/Scripts/AberrantSplicing/Overview.R b/drop/template/Scripts/AberrantSplicing/Overview.R
index ef8a3bed..f00a7913 100644
--- a/drop/template/Scripts/AberrantSplicing/Overview.R
+++ b/drop/template/Scripts/AberrantSplicing/Overview.R
@@ -11,7 +11,7 @@
 #'  input:
 #'    - functions: '`sm cfg.workDir / "Scripts/html_functions.R"`'
 #'    - fds_files: '`sm expand(cfg.getProcessedResultsDir() +
-#'                "/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/" +
+#'                "/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/" +
 #'                "fds-object.RDS", dataset=cfg.AS.groups, annotation=cfg.genome.getGeneVersions())`'
 #'    - result_tables: '`sm expand(cfg.getProcessedResultsDir() +
 #'                    "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results_per_junction.tsv",
diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py
index 1e0559ae..5ab6d1e2 100644
--- a/tests/config/test_AS.py
+++ b/tests/config/test_AS.py
@@ -19,7 +19,7 @@ def test_config(self, dropConfig,demo_dir):
         assert dict_.items() <= dropConfig.AS.dict_.items()
 
     def test_getSplitCountFiles(self, demo_dir, dropConfig):
-        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/fromBam/cache/raw-fraser/sample_tmp/" \
+        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-local-fraser/sample_tmp/" \
                      "splitCounts"
         ids = [
             'HG00096.1.M_111124_6_trunc', 'HG00103.4.M_120208_3_trunc', 'HG00111.2.M_111215_4_trunc',
@@ -35,7 +35,7 @@ def test_getSplitCountFiles(self, demo_dir, dropConfig):
         assert counts_files_true == counts_files_test
 
     def test_getNonSplitCountFiles(self, demo_dir, dropConfig):
-        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/fromBam/cache/raw-fraser/sample_tmp/" \
+        counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-local-fraser/sample_tmp/" \
                      "nonSplitCounts"
         ids = [
             'HG00096.1.M_111124_6_trunc', 'HG00103.4.M_120208_3_trunc', 'HG00111.2.M_111215_4_trunc',
diff --git a/tests/pipeline/test_AS.py b/tests/pipeline/test_AS.py
index c62166b4..becd0e30 100644
--- a/tests/pipeline/test_AS.py
+++ b/tests/pipeline/test_AS.py
@@ -38,7 +38,7 @@ def pipeline_run(self, demo_dir):
     def test_counts(self, demo_dir):
         annotation = "v29"
         dataset = "fraser"
-        cnt_file = f"Output/processed_results/aberrant_splicing/datasets/merged/savedObjects/{dataset}--{annotation}/fds-object.RDS"
+        cnt_file = f"Output/processed_results/aberrant_splicing/datasets/savedObjects/{dataset}--{annotation}/fds-object.RDS"
         r_cmd = """
             library(FRASER)
             fds <- loadFraserDataSet(file="{}")

From 2d220f8157079ceb45bb79f5c4c74545fb3f2b00 Mon Sep 17 00:00:00 2001
From: nickhsmith <smithnickh@gmail.com>
Date: Fri, 22 Apr 2022 16:23:34 +0200
Subject: [PATCH 62/65] Update output.rst

---
 docs/source/output.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 67dae4e3..041a55b2 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -103,9 +103,9 @@ Local result files
 ##################
 Additionally the ``mae`` module creates the following files:
 
-* ``Output/processed_results/mae/{drop_group}/MAE_results_all_v29.tsv.gz``
+* ``Output/processed_results/mae/{drop_group}/MAE_results_all_{annotation}.tsv.gz``
     * this file is the tsv results of all heterozygous variants regardless of significance
-* ``Output/processed_results/mae/{drop_group}/MAE_results_v29.tsv``
+* ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}.tsv``
     * this is the file linked in the HTML document and described above
-* ``Output/processed_results/mae/{drop_group}/MAE_results_v29_rare.tsv``
-    * this file is the subsetted tsv of ``MAE_results_v29.tsv`` with only the variants that pass the rare cutoffs. If ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally, the inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff
+* ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}_rare.tsv``
+    * this file is the subsetted tsv of ``MAE_results_{annotation}.tsv`` with only the variants that pass the rare cutoffs. If ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally, the inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff

From ea1b26d6ec1a594274e4d785777feb14a55b52c2 Mon Sep 17 00:00:00 2001
From: nickhsmith <smithnickh@gmail.com>
Date: Fri, 22 Apr 2022 16:24:56 +0200
Subject: [PATCH 63/65] Update output.rst

---
 docs/source/output.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index 041a55b2..da007b77 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -40,7 +40,7 @@ tab at the top of the screen. The Overview tab contains links to the:
     * OUTRIDER datasets 
         * Follow the `OUTRIDER vignette <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_ for individual OUTRIDER object file (ods) analysis.
     * Results tables
-        * ``results.tsv`` this tsv file contains only the significant genes and samples that meet the cutoffs defined in the ``config.yaml`` for ``padjCutoff`` and ``zScoreCutoff``
+        * ``results.tsv`` this tsv file contains only the significant genes and samples that meet the cutoffs defined in the config file for ``padjCutoff`` and ``zScoreCutoff``
 
 Local result files
 ##################
@@ -108,4 +108,4 @@ Additionally the ``mae`` module creates the following files:
 * ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}.tsv``
     * this is the file linked in the HTML document and described above
 * ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}_rare.tsv``
-    * this file is the subsetted tsv of ``MAE_results_{annotation}.tsv`` with only the variants that pass the rare cutoffs. If ``add_AF`` is set to true in ``config.yaml`` must meet minimum AF set by ``max_AF``. Additionally, the inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff
+    * this file is the subsetted tsv of ``MAE_results_{annotation}.tsv`` with only the variants that pass the rare cutoffs. If ``add_AF`` is set to true in config file must meet minimum AF set by ``max_AF``. Additionally, the inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff

From d15a7021de98fd91f568ff17e4f2f78b6f2f75d3 Mon Sep 17 00:00:00 2001
From: nickhsmith <smithnickh@gmail.com>
Date: Fri, 22 Apr 2022 16:25:49 +0200
Subject: [PATCH 64/65] Update output.rst

---
 docs/source/output.rst | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index da007b77..fae9ac62 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -24,12 +24,11 @@ HTML file
 Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantExpression`` 
 tab at the top of the screen. The Overview tab contains links to the:  
 
-* Counts Summaries 
-    * For each aberrant expression group
-        * number of local and external samples
-        * QC relating to reads and size factors for each sample
-        * histograms showing the mean count distribution with different conditions
-        * expressed genes within each sample and as a dataset
+* Counts Summaries for each aberrant expression group
+    * number of local and external samples
+    * QC relating to reads and size factors for each sample
+    * histograms showing the mean count distribution with different conditions
+    * expressed genes within each sample and as a dataset
 * Outrider Summaries for each aberrant expression group
     * aberrantly expressed genes per sample
     * correlation between samples before and after the autoencoder

From 1a628cb233fbee7f3079969d741a002a2f332dd8 Mon Sep 17 00:00:00 2001
From: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Date: Fri, 22 Apr 2022 16:57:14 +0200
Subject: [PATCH 65/65] Update output.rst

---
 docs/source/output.rst | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/docs/source/output.rst b/docs/source/output.rst
index fae9ac62..c4b0e2a6 100644
--- a/docs/source/output.rst
+++ b/docs/source/output.rst
@@ -3,8 +3,8 @@ Results and Output of DROP
 
 DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression,
 aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide
-easy to read and interpret HTML files and output files. This section explains the relevant
-results files. The paths of the output files correspond to the ones from the demo (that can be run with the following code snippet)::
+easy-to-read HTML files and output files. This section explains the results files. The paths of the output
+files correspond to the ones from the demo (that can be run with the following code snippet)::
 
     #install drop
     mamba create -n drop_env -c conda-forge -c bioconda drop
@@ -26,20 +26,20 @@ tab at the top of the screen. The Overview tab contains links to the:
 
 * Counts Summaries for each aberrant expression group
     * number of local and external samples
-    * QC relating to reads and size factors for each sample
+    * mapped reads and size factors for each sample
     * histograms showing the mean count distribution with different conditions
     * expressed genes within each sample and as a dataset
 * Outrider Summaries for each aberrant expression group
     * aberrantly expressed genes per sample
     * correlation between samples before and after the autoencoder
-    * biological coefficient of variation plot
+    * biological coefficient of variation
     * aberrant samples
     * results table
 * Files for each aberrant expression group
     * OUTRIDER datasets 
         * Follow the `OUTRIDER vignette <https://www.bioconductor.org/packages/devel/bioc/vignettes/OUTRIDER/inst/doc/OUTRIDER.pdf>`_ for individual OUTRIDER object file (ods) analysis.
     * Results tables
-        * ``results.tsv`` this tsv file contains only the significant genes and samples that meet the cutoffs defined in the config file for ``padjCutoff`` and ``zScoreCutoff``
+        * ``results.tsv`` this text file contains only the significant genes and samples that meet the cutoffs defined in the config file for ``padjCutoff`` and ``zScoreCutoff``
 
 Local result files
 ##################
@@ -60,18 +60,18 @@ tab at the top of the screen. The Overview tab contains links to the:
     * histograms showing the junction expression before and after filtering and variability
 * FRASER Summaries for each aberrant splicing group
     * the number of samples, introns, and splice sites 
-    * how batch correction is done and the resulting lack of batch effects
-    * result table
+    * correlation between samples before and after the autoencoder
+    * results table
 * Files for each aberrant splicing group
     * FRASER datasets (fds)
         * Follow the `FRASER vignette <https://www.bioconductor.org/packages/devel/bioc/vignettes/FRASER/inst/doc/FRASER.pdf>`_ for individual FRASER object file (fds) analysis.
     * Results tables
-        * ``results_per_junction.tsv`` this tsv file contains only significant junctions that meet the cutoffs defined in the config file they are aggregated at the junction level. 
+        * ``results_per_junction.tsv`` this text file contains only significant junctions that meet the cutoffs defined in the config file. 
 
 Local result files
 ##################
 Additionally the ``aberrantSplicing`` module creates the following file ``Output/processed_results/aberrant_splicing/results/{annotation}/fraser/{drop_group}/results.tsv``.
-This tsv file contains only significant junctions that meet the cutoffs defined in the config file, they are aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
+This text file contains only significant junctions that meet the cutoffs defined in the config file, aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction.
 
 Mono-allelic Expression
 +++++++++++++++++++++++
@@ -82,17 +82,17 @@ Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``M
 tab at the top of the screen. The Overview tab contains links to the:  
 
 * Results for each mae group
-    * the number of samples, unique genes, and aberrant events
+    * number of samples, genes, and mono-allelically expressed heterozygous SNVs
     * a cascade plot that shows additional filters
     * histogram of inner cohort frequency
-    * summary of cascade plots and results table
+    * summary of the cascade plot and results table
 * Files for each mae group
     * Allelic counts
         * a directory containing the allelic counts of heterozygous variants
     * Results data tables of each sample (.Rds)
         * Rds objects containing the full results table regardless of MAE status
     * Significant MAE results tables
-        * a link to the results tsv file.
+        * a link to the results file
         * Only contains significant MAE for the alternative allele results and results that pass the config file cutoffs
 * Quality Control
     * QC Overview
@@ -103,8 +103,8 @@ Local result files
 Additionally the ``mae`` module creates the following files:
 
 * ``Output/processed_results/mae/{drop_group}/MAE_results_all_{annotation}.tsv.gz``
-    * this file is the tsv results of all heterozygous variants regardless of significance
+    * this file contains the MAE results of all heterozygous SNVs regardless of significance
 * ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}.tsv``
     * this is the file linked in the HTML document and described above
 * ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}_rare.tsv``
-    * this file is the subsetted tsv of ``MAE_results_{annotation}.tsv`` with only the variants that pass the rare cutoffs. If ``add_AF`` is set to true in config file must meet minimum AF set by ``max_AF``. Additionally, the inner-cohort frequency must meet ``maxVarFreqCohort`` cutoff
+    * this file is a subset of ``MAE_results_{annotation}.tsv`` with only the variants that pass the allele frequency cutoffs. If ``add_AF`` is set to ``true`` in the config file, the allele frequency must not exceed ``max_AF``. Additionally, the inner-cohort frequency must meet the ``maxVarFreqCohort`` cutoff