diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 00437a34..9be16b75 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -1,6 +1,6 @@ name: Build -on: [push] +on: [push,pull_request] jobs: build-linux: diff --git a/README.md b/README.md index f6ae7689..63f1acbb 100644 --- a/README.md +++ b/README.md @@ -3,26 +3,16 @@ [![Version](https://img.shields.io/github/v/release/gagneurlab/drop?include_prereleases)](https://github.com/gagneurlab/drop/releases) [![Version](https://readthedocs.org/projects/gagneurlab-drop/badge/?version=latest)](https://gagneurlab-drop.readthedocs.io/en/latest) -The detection of RNA Outliers Pipeline (DROP) is an integrative workflow to detect aberrant expression, aberrant splicing, and mono-allelic expression from raw sequencing files. +The detection of RNA Outliers Pipeline (DROP) is an integrative workflow to detect aberrant expression, aberrant splicing, and mono-allelic expression from raw sequencing data. -The manuscript is available in [Nature Protocols](https://www.nature.com/articles/s41596-020-00462-5). [SharedIt link.](https://rdcu.be/cdMmF) - -drop logo +The manuscript is available in [Nature Protocols](https://www.nature.com/articles/s41596-020-00462-5). +The website containing the different reports of the Geuvadis demo dataset described in the paper can be found [here](https://cmm.in.tum.de/public/paper/drop_analysis/webDir/drop_analysis_index.html). -## What's new +This [video](https://www.youtube.com/watch?v=XvgjiFQClhM&t=2761s) introduces the tools used in DROP and their application to rare disease diagnostics. -Versions 1.3.3, 1.3.2 and 1.3.1 fix some bugs. -Version 1.3.0 introduces the option to use FRASER 2.0 which is an improved version of FRASER that uses the Intron Jaccard Index metric instead of percent spliced in and splicing efficiency to quantify and later call aberrant splicing. To run FRASER 2.0, modify the `FRASER_version` parameter in the aberrantSplicing dictionary in the config file and adapt the `quantileForFiltering` and `deltaPsiCutoff` parameters. See the [config template](https://github.com/gagneurlab/drop/blob/master/drop/template/config.yaml) for more details. When switching between FRASER versions, we recommend running DROP in a -separate folder for each version. Moreover, DROP now allows users to provide lists of genes to focus on and do the multiple testing correction instead of the usual transcriptome-wide approach. Refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#limiting-fdr-correction-to-subsets-of-genes-of-interest). - -`Snakemake v.7.8` introduced some changes in which changes in parameters can cause rules to be re-executed. More info [here](https://github.com/snakemake/snakemake/issues/1694). This affects DROP and causes certain rules in the AS and QC modules to be triggered even if they were already completed and there were no changes in the sample annotation or scripts. The workaround is to run DROP by adding the parameter `--rerun-triggers mtime`, e.g. `snakemake -n --rerun-triggers mtime` or `snakemake --cores 10 --rerun-triggers mtime`. We will investigate the rules in DROP to fix this. - -Version 1.2.3 simplifies the plots in the AE Summary Script. In addition, there's a new heatmap in the sampleQC Summary that allows to better identify DNA-RNA mismatches. - -As of version 1.2.1 DROP has a new module that performs RNA-seq variant calling. The input are BAM files and the output either a single-sample or a multi-sample VCF file (option specified by the user) annotated with allele frequencies from gnomAD (if specified by the user). The sample annotation table does not need to be changed, but several new parameters in the config file have to be added and tuned. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#rna-variant-calling-dictionary). +drop logo -Also, as of version 1.2.1 the integration of external split and non-split counts to detect aberrant splicing is now possible. Simply specify in a new column in the sample annotation the directory containing the counts. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#external-count-examples). ## Quickstart DROP is available on [bioconda](https://anaconda.org/bioconda/drop). @@ -31,9 +21,9 @@ We recommend using a dedicated conda environment (`drop_env` in this example). I mamba create -n drop_env -c conda-forge -c bioconda drop --override-channels ``` -In the case of mamba/conda troubles we recommend using the fixed `DROP_.yaml` installation file we make available on our [public server](https://www.cmm.in.tum.de/public/paper/drop_analysis/). Install the current version and use the full path in the following command to install the conda environment `drop_env` +In the case of mamba/conda troubles, we recommend using the fixed `DROP_.yaml` installation file we make available on our [public server](https://www.cmm.in.tum.de/public/paper/drop_analysis/). Install the current version and use the full path in the following command to install the conda environment `drop_env` ``` -mamba env create -f DROP_1.3.3.yaml +mamba env create -f DROP_1.3.4.yaml ``` Test installation with demo project @@ -55,6 +45,20 @@ Expected runtime: 25 min For more information on different installation options, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/installation.html) +## What's new + +Due to snakemake updates affecting wBuild and the way we installed FRASER, installing DROP 1.3.3 no longer works. Version 1.3.4 simply fixes the FRASER version to ensure reproducibility and fixes certain scripts affected by the snakemake update. Running the pipeline with the version 1.3.4 should provide the same outlier results as 1.3.3. + +Version 1.3.0 introduces the option to use FRASER 2.0 which is an improved version of FRASER that uses the Intron Jaccard Index metric instead of percent spliced in and splicing efficiency to quantify and later call aberrant splicing. To run FRASER 2.0, modify the `FRASER_version` parameter in the aberrantSplicing dictionary in the config file and adapt the `quantileForFiltering` and `deltaPsiCutoff` parameters. See the [config template](https://github.com/gagneurlab/drop/blob/master/drop/template/config.yaml) for more details. When switching between FRASER versions, we recommend running DROP in a +separate folder for each version. Moreover, DROP now allows users to provide lists of genes to focus on and do the multiple testing correction instead of the usual transcriptome-wide approach. Refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#limiting-fdr-correction-to-subsets-of-genes-of-interest). + +`Snakemake v.7.8` introduced some changes in which changes in parameters can cause rules to be re-executed. More info [here](https://github.com/snakemake/snakemake/issues/1694). This affects DROP and causes certain rules in the AS and QC modules to be triggered even if they were already completed and there were no changes in the sample annotation or scripts. The workaround is to run DROP by adding the parameter `--rerun-triggers mtime`, e.g. `snakemake -n --rerun-triggers mtime` or `snakemake --cores 10 --rerun-triggers mtime`. We will investigate the rules in DROP to fix this. + +As of version 1.2.1 DROP has a new module that performs RNA-seq variant calling. The input are BAM files and the output either a single-sample or a multi-sample VCF file (option specified by the user) annotated with allele frequencies from gnomAD (if specified by the user). The sample annotation table does not need to be changed, but several new parameters in the config file have to be added and tuned. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#rna-variant-calling-dictionary). + +Also, as of version 1.2.1 the integration of external split and non-split counts to detect aberrant splicing is now possible. Simply specify in a new column in the sample annotation the directory containing the counts. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#external-count-examples). + + ## Set up a custom project Install the drop module according to [installation](#installation) and initialize the project in a custom project directory. ### Prepare the input data @@ -100,7 +104,10 @@ If you want to contribute with your own count matrices, please contact us: yepez If you use DROP in research, please cite our [manuscript](https://www.nature.com/articles/s41596-020-00462-5). -Furthermore, if you use the aberrant expression module, also cite [OUTRIDER](https://doi.org/10.1016/j.ajhg.2018.10.025); if you use the aberrant splicing module, also cite [FRASER](https://www.nature.com/articles/s41467-020-20573-7); and if you use the MAE module, also cite the [Kremer, Bader et al study](https://www.nature.com/articles/ncomms15824) and [DESeq2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8). +Furthermore, if you use: +* the aberrant expression module, also cite [OUTRIDER](https://doi.org/10.1016/j.ajhg.2018.10.025) +* the aberrant splicing module, also cite [FRASER](https://www.nature.com/articles/s41467-020-20573-7) or [FRASER2](https://www.sciencedirect.com/science/article/pii/S0002929723003671?dgcid=coauthor), depending on the version that you use +* the MAE module, also cite the [Kremer, Bader et al study](https://www.nature.com/articles/ncomms15824) and [DESeq2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8). For the complete set of tools used by DROP (e.g. for counting), see the [manuscript](https://www.nature.com/articles/s41596-020-00462-5). diff --git a/docs/source/conf.py b/docs/source/conf.py index 63e4ce56..749be82b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ author = 'Michaela Müller' # The full version, including alpha/beta/rc tags -release_ = '1.3.3' +release_ = '1.3.4' diff --git a/docs/source/help.rst b/docs/source/help.rst index 7c322d2c..5783ef33 100644 --- a/docs/source/help.rst +++ b/docs/source/help.rst @@ -8,11 +8,11 @@ Common errors The ``MAE:mae_allelicCounts`` step is susceptible to fail if: -1. the chromosomes styles of the reference genome and the BAM files do not match +1. The chromosomes styles of the reference genome and the BAM files do not match Solution: Identify the chromosomes style of the BAM file. Obtain an appropriate reference genome file and specify it in the config file. -2. the BAM file does not have the correct ``Read Groups`` documentation both the header and reads. You can often identify if the BAM file has any problems by using the command ``gatk ValidateSamFile -I path/to/bam_file.bam`` +2. The BAM file does not have the correct ``Read Groups`` documentation for both the header and reads. You can often identify if the BAM file has any problems by using the command ``gatk ValidateSamFile -I path/to/bam_file.bam`` Solution: To fix this is often dependent on the individual case, but some combination of the following tools is quite helpful: diff --git a/docs/source/index.rst b/docs/source/index.rst index 62e67717..8c7d3ca2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,7 +15,7 @@ Then, DROP can be executed in multiple ways (:doc:`pipeline`). pipeline output license - troubleshooting + help Quickstart ----------- diff --git a/docs/source/installation.rst b/docs/source/installation.rst index fea0d531..ad120274 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -20,7 +20,7 @@ Install the latest version and use the full path in the following command to ins .. code-block:: bash - mamba env create -f DROP_1.3.3.yaml + mamba env create -f DROP_1.3.4.yaml Installation time: ~ 10min diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst index 156edcf3..0bb67663 100644 --- a/docs/source/prepare.rst +++ b/docs/source/prepare.rst @@ -314,9 +314,9 @@ Limiting FDR correction to subsets of genes of interest ------------------------------------ In addition to returning transcriptome-wide results, DROP provides the option to limit the FDR correction to user-provided genes of interest in the -``aberrantExpression`` and ``aberrantSplicing`` modules. These could e.g. be all +``aberrantExpression`` and ``aberrantSplicing`` modules. These could, for example, be all OMIM genes. It is also possible to provide sample-specific genes such as all -genes with a rare splice region variant for each sample. +genes with a rare splice-region variant for each sample. To use this feature, a YAML file containing the set(s) of genes to test (per sample or for all samples) needs to be specified in the ``genesToTest`` parameter of the ``aberrantExpression`` and ``aberrantSplicing`` modules in the config file. @@ -330,11 +330,14 @@ Creating the YAML file specifying subsets of genes to test The file containing the list of genes (HGNC symbols) to be tested must be a YAML file, where the variable names specify the name of each set of tested genes. In the output of DROP, this name will be used to identify the set in the results table. Each set -can either be a list of genes, in which case the set will be tested for all samples. Alternatively -(and additionally), sample-specific sets can be created by giving the RNA_ID of the sample +can either be: i) a list of genes, in which case the set will be tested for all samples, or ii) +sample-specific sets that can be created by giving the RNA_ID of the sample for which the set should be used as the name (see example below). This YAML file can be created in R using ``yaml::write_yaml(subsetList, filepath)``, where ``subsetList`` is a named list of named lists containing the sets of genes to test. +The gene names must match those from the provided gtf file. We currently do not support Ensembl ids as input. +The table with extracted gene names from the gtf file is located under: +``root/processed_data/preprocess/{geneAnnotation}/gene_name_mapping_{geneAnnotation}.tsv``. In the following example, the name of the global set of genes is ``Genes_to_test_on_all_samples`` and the name of the sample-specific set is ``Genes_with_rare_splice_variants``: diff --git a/drop/__init__.py b/drop/__init__.py index 4a5f75c9..95a02cb1 100644 --- a/drop/__init__.py +++ b/drop/__init__.py @@ -4,5 +4,5 @@ from . import utils from . import demo -__version__ = "1.3.3" +__version__ = "1.3.4" diff --git a/drop/cli.py b/drop/cli.py index f3778598..0795ea87 100644 --- a/drop/cli.py +++ b/drop/cli.py @@ -17,7 +17,7 @@ @click.group() @click_log.simple_verbosity_option(logger) -@click.version_option('1.3.3',prog_name='drop') +@click.version_option('1.3.4',prog_name='drop') def main(): diff --git a/drop/installRPackages.R b/drop/installRPackages.R index 42148f58..6635ce61 100644 --- a/drop/installRPackages.R +++ b/drop/installRPackages.R @@ -23,18 +23,20 @@ if (file.exists(args[1])){ package=gsub("=.*", "", unlist(args)), version=gsub(".*=", "", unlist(args)), ref="") - packages[package == version, version:=NA] + packages[package == version, c("min_version", "max_version") := ""] } installed <- as.data.table(installed.packages()) for (pckg_name in packages$package) { package_dt <- packages[package == pckg_name] pckg_name <- gsub(".*/", "", pckg_name) - version <- package_dt$version + min_version <- package_dt$min_version + max_version <- package_dt$max_version branch <- package_dt$ref - if (!pckg_name %in% installed$Package || (!is.na(version) && compareVersion( - installed[Package == pckg_name, Version], version) < 0)) { + if (!pckg_name %in% installed$Package || (min_version != "" && (compareVersion( + installed[Package == pckg_name, Version], min_version) < 0 || compareVersion( + installed[Package == pckg_name, Version], max_version) > 0))) { package <- package_dt$package message(paste("install", package)) diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R index ea198be4..d8f8a9fd 100644 --- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R @@ -48,13 +48,14 @@ cnts_mtx <- counts(ods, normalized = F) #' Consider removing samples with too low or too high size factors. #' bam_coverage <- fread(snakemake@input$bam_cov) -bam_coverage[, sampleID := as.character(sampleID)] +bam_coverage[, RNA_ID := as.character(sampleID)] +bam_coverage[, sampleID := NULL] setnames(bam_coverage, 'record_count', 'total_count') -coverage_dt <- merge(bam_coverage, - data.table(sampleID = colnames(ods), +coverage_dt <- merge(data.table(RNA_ID = colnames(ods), read_count = colSums(cnts_mtx), isExternal = ods@colData$isExternal), - by = "sampleID", sort = FALSE) + bam_coverage, + by = "RNA_ID", sort = FALSE) # read counts coverage_dt[, count_rank := rank(read_count)] @@ -94,7 +95,7 @@ p_sf setnames(coverage_dt, old = c('total_count', 'read_count', 'size_factors'), new = c('Reads Mapped', 'Reads Counted', 'Size Factors')) -DT::datatable(coverage_dt[, .(sampleID, `Reads Mapped`, `Reads Counted`, `Size Factors`)][order(`Reads Mapped`)], +DT::datatable(coverage_dt[, .(RNA_ID, `Reads Mapped`, `Reads Counted`, `Size Factors`)][order(`Reads Mapped`)], caption = 'Reads summary statistics') #' # Filtering diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R index 3c3dec95..ee173f64 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R @@ -54,6 +54,10 @@ splitCountRanges <- rowRanges(splitCounts) # Annotate granges from the split counts splitCountRanges <- FRASER:::annotateSpliceSite(splitCountRanges) saveRDS(splitCountRanges, snakemake@output$gRangesSplitCounts) +# additionally save as tsv.gz (for easier AbSplice input) +fwrite(as.data.table(splitCountRanges), + gsub(".Rds", ".tsv.gz", snakemake@output$gRangesSplitCounts, + ignore.case=TRUE)) # Create ranges for non split counts # Subset by minExpression diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/08_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/08_extract_results_FraseR.R index 739c8839..87dba6ca 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/08_extract_results_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/08_extract_results_FraseR.R @@ -75,7 +75,7 @@ if(nrow(res_junc_dt) > 0){ res_junc_dt <- merge(res_junc_dt, as.data.table(colData(fds)), by = "sampleID") res_junc_dt[, c("bamFile", "pairedEnd", "STRAND", "RNA_BAM_FILE", "DNA_VCF_FILE", "COUNT_MODE", "COUNT_OVERLAPS") := NULL] } else{ - warning("The aberrant splicing pipeline gave 0 results for the ", dataset, " dataset.") + warning("The aberrant splicing pipeline gave 0 intron-level results for the ", dataset, " dataset.") } # Extract full results by gene @@ -102,7 +102,7 @@ res_genes_dt <- res_genes_dt[do.call(pmin, c(res_genes_dt[,padj_cols, with=FALSE abs(deltaPsi) >= snakemake@params$deltaPsiCutoff & totalCounts >= 5,] -if(length(res_gene) > 0){ +if(nrow(res_genes_dt) > 0){ res_genes_dt <- merge(res_genes_dt, as.data.table(colData(fds)), by = "sampleID") res_genes_dt[, c("bamFile", "pairedEnd", "STRAND", "RNA_BAM_FILE", "DNA_VCF_FILE", "COUNT_MODE", "COUNT_OVERLAPS") := NULL] @@ -115,34 +115,35 @@ if(length(res_gene) > 0){ } } } else{ - res_genes_dt <- data.table() warning("The aberrant splicing pipeline gave 0 gene-level results for the ", dataset, " dataset.") } # Annotate results with spliceEventType and blacklist region overlap -# load reference annotation -library(AnnotationDbi) txdb <- loadDb(snakemake@input$txdb) # annotate the type of splice event and UTR overlap -res_junc_dt <- annotatePotentialImpact(result=res_junc_dt, txdb=txdb, fds=fds) -res_genes_dt <- annotatePotentialImpact(result=res_genes_dt, txdb=txdb, fds=fds) - -# set genome assembly version to load correct blacklist region BED file (hg19 or hg38) -assemblyVersion <- snakemake@config$genomeAssembly -if(grepl("grch37", assemblyVersion, ignore.case=TRUE)){ - assemblyVersion <- "hg19" +if(nrow(res_junc_dt) > 0){ + res_junc_dt <- annotatePotentialImpact(result=res_junc_dt, txdb=txdb, fds=fds) } -if(grepl("grch38", assemblyVersion, ignore.case=TRUE)){ - assemblyVersion <- "hg38" +if(nrow(res_genes_dt) > 0){ + res_genes_dt <- annotatePotentialImpact(result=res_genes_dt, txdb=txdb, fds=fds) } +# set genome assembly version to load correct blacklist region BED file (hg19 or hg38) +assemblyVersion <- snakemake@config$genomeAssembly +if(grepl("grch37", assemblyVersion, ignore.case=TRUE)) assemblyVersion <- "hg19" +if(grepl("grch38", assemblyVersion, ignore.case=TRUE)) assemblyVersion <- "hg38" + # annotate overlap with blacklist regions if(assemblyVersion %in% c("hg19", "hg38")){ - res_junc_dt <- flagBlacklistRegions(result=res_junc_dt, + if(nrow(res_junc_dt) > 0){ + res_junc_dt <- flagBlacklistRegions(result=res_junc_dt, assemblyVersion=assemblyVersion) - res_genes_dt <- flagBlacklistRegions(result=res_genes_dt, + } + if(nrow(res_genes_dt) > 0){ + res_genes_dt <- flagBlacklistRegions(result=res_genes_dt, assemblyVersion=assemblyVersion) + } } else{ message(date(), ": cannot annotate blacklist regions as no blacklist region\n", "BED file is available for genome assembly version ", assemblyVersion, diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R index 4897e3b4..1be74698 100644 --- a/drop/modules/mae-pipeline/MAE/Results.R +++ b/drop/modules/mae-pipeline/MAE/Results.R @@ -107,7 +107,7 @@ res[, MAE_ALT := MAE == TRUE & altRatio >= allelicRatioCutoff] #' #' Number of samples with significant MAE for alternative events: `r uniqueN(res[MAE_ALT == TRUE, ID])` -#+echo=F +#+ echo=F # Save full results zipped res[, altRatio := round(altRatio, 3)] diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R index 34298a11..97bc612e 100644 --- a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R +++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R @@ -12,7 +12,7 @@ #' type: noindex #'--- -#+echo=F +#+ echo=F saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ @@ -39,6 +39,8 @@ sa[, ANNOTATED_MATCH := TRUE] qc_mat <- readRDS(snakemake@input$mat_qc) melt_mat <- as.data.table(reshape2::melt(qc_mat)) colnames(melt_mat)[1:2] <- c('DNA_ID', 'RNA_ID') +melt_mat[, RNA_ID := as.character(RNA_ID)] +melt_mat[, DNA_ID := as.character(DNA_ID)] ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.05, center = .025) + theme_bw(base_size = 14) + @@ -70,9 +72,10 @@ qc_dt[, PREDICTED_MATCH := value > identityCutoff] check_matches <- function(annot_col, pred_col){ if(sum(pred_col) == 0) return('no match') if(identical(annot_col,pred_col)) return('match') - if(sum(annot_col)==1 & sum(pred_col)==1) return('matches other') - if(sum(annot_col)>1 & sum(pred_col)==1) return('matches less') - if(sum(annot_col)==1 & sum(pred_col)>1) return('matches more') + if(all(rowSums(cbind(annot_col, pred_col)) < 2)) return('matches other') # the pred was never the same as the annot + if(sum(annot_col) > sum(pred_col)) return('matches less') + if(sum(annot_col) < sum(pred_col)) return('matches more') + else return('matches other') } # check DNA and RNA matches (not necessarily the same) @@ -128,6 +131,8 @@ if(nrow(qc_mat) > 1 || ncol(qc_mat) > 1){ #' * Is the sample a relative of the other? #' +melt_mat[, value := round(value, 3)] + #' ### Samples that were annotated to match but do not false_matches <- merge(sa, melt_mat, by = c('DNA_ID', 'RNA_ID'), sort = FALSE, all.x = TRUE) @@ -136,5 +141,6 @@ DT::datatable(false_matches[value < identityCutoff]) #' ### Samples that were not annotated to match but actually do false_mismatches <- merge(melt_mat, sa, by = c('DNA_ID', 'RNA_ID'), sort = FALSE, all.x = TRUE) -DT::datatable(false_mismatches[is.na(ANNOTATED_MATCH) & value > identityCutoff]) +false_mismatches[is.na(ANNOTATED_MATCH), ANNOTATED_MATCH := FALSE] +DT::datatable(false_mismatches[ANNOTATED_MATCH == F & value > identityCutoff]) diff --git a/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R index 16ad18e5..17ab6998 100644 --- a/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R +++ b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R @@ -96,7 +96,7 @@ lp <- bplapply(1:N, function(i){ mat <- do.call(rbind, lp) row.names(mat) <- dna_samples colnames(mat) <- rna_samples -mat <- mat[sa[rows_in_group, DNA_ID], sa[rows_in_group, RNA_ID],drop=FALSE] +# mat <- mat[sa[rows_in_group, DNA_ID], sa[rows_in_group, RNA_ID],drop=FALSE] saveRDS(mat, snakemake@output$mat_qc) diff --git a/drop/modules/mae-pipeline/Snakefile b/drop/modules/mae-pipeline/Snakefile index 18505a5f..22ed3fdd 100644 --- a/drop/modules/mae-pipeline/Snakefile +++ b/drop/modules/mae-pipeline/Snakefile @@ -20,19 +20,6 @@ rule mae_dependency: rule sampleQC: input: cfg.getHtmlFromScript(MAE_WORKDIR / "QC" / "Datasets.R") -rule create_dict: - input: cfg.genome.getFastaList() - output: cfg.genome.getDictList() - shell: - """ - for fasta in {input} - do - if [ ! -f "${{fasta%.*}}.dict" ]; then - gatk CreateSequenceDictionary --REFERENCE $fasta - fi - done - """ - ## MAE rule mae_createSNVs: diff --git a/drop/modules/rvc-pipeline/GATK_BASH/changeHeader.sh b/drop/modules/rvc-pipeline/GATK_BASH/changeHeader.sh index 3e97d2db..d61b11c7 100755 --- a/drop/modules/rvc-pipeline/GATK_BASH/changeHeader.sh +++ b/drop/modules/rvc-pipeline/GATK_BASH/changeHeader.sh @@ -35,8 +35,8 @@ while read header ; do echo "Internal Header $SM_internalHeader matches $sample" |tee $log echo "Internal Header is designated: $SM_internalHeader" |tee -a $log echo "SampleID is $sample" |tee -a $log - ln -f $input_bam $output_bam - ln -f $input_bai $output_bai + ln -s -f $input_bam $output_bam + ln -s -f $input_bai $output_bai echo "Done Linking files" samtools view -H $input_bam > $output_newHeader else diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt index eb65d037..6461207f 100644 --- a/drop/requirementsR.txt +++ b/drop/requirementsR.txt @@ -1,8 +1,8 @@ -package version ref +package min_version max_version ref devtools -gagneurlab/OUTRIDER 1.17.2 HEAD -gagneurlab/FRASER 1.99.1 HEAD -gagneurlab/tMAE 1.0.4 HEAD +gagneurlab/OUTRIDER 1.20.1 1.20.1 1.20.1 +gagneurlab/FRASER 1.99.3 1.99.3 d6a422c +gagneurlab/tMAE 1.0.4 1.0.4 1.0.4 VariantAnnotation rmarkdown knitr diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R index b1ffe222..85d2c1c6 100644 --- a/drop/template/Scripts/MonoallelicExpression/Overview.R +++ b/drop/template/Scripts/MonoallelicExpression/Overview.R @@ -86,7 +86,7 @@ qc_links <- sapply(qc_groups, function(v) build_link_list( #' #' ## Analyze Individual Results -#+echo=FALSE +#+ echo=FALSE # Read the first results table res_sample <- readRDS(snakemake@input$results_sample[[1]]) sample <- unique(res_sample$ID) @@ -95,7 +95,7 @@ library(tMAE) library(ggplot2) rare_column <- 'rare' if(any(is.na(res_sample$rare))) rare_column <- NULL -#+echo=TRUE +#+ echo=TRUE #' ### MA plot: fold change vs RNA coverage plotMA4MAE(res_sample, rare_column = rare_column, diff --git a/drop/template/Scripts/Pipeline/SampleAnnotation.R b/drop/template/Scripts/Pipeline/SampleAnnotation.R index 7963bdc0..eea616ac 100644 --- a/drop/template/Scripts/Pipeline/SampleAnnotation.R +++ b/drop/template/Scripts/Pipeline/SampleAnnotation.R @@ -16,7 +16,7 @@ #' code_download: TRUE #'--- -#+echo=F +#+ echo=F saveRDS(snakemake, snakemake@log$snakemake) suppressPackageStartupMessages({ @@ -76,7 +76,7 @@ unique(sa[,.(RNA_ID, DROP_GROUP)])$DROP_GROUP %>% strsplit(',') %>% unlist %>% table %>% barplot(xlab = 'DROP groups', ylab = 'Number of samples') # Obtain genes that overlap with HPO terms -#+echo=F +#+ echo=F if(!is.null(sa$HPO_TERMS) & !all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){ sa2 <- sa[, .SD[1], by = RNA_ID] diff --git a/drop/template/Snakefile b/drop/template/Snakefile index 7ff1bd2c..74ed703e 100644 --- a/drop/template/Snakefile +++ b/drop/template/Snakefile @@ -72,28 +72,35 @@ rule dependencyGraph: rule publish_local: shell: "rsync -Ort {config[htmlOutputPath]} {config[webDir]}" -rule prepFasta_fa: - priority: 1 - input: - "{path_to_ref}.fa" - output: - dict = "{path_to_ref}.dict" , - fai = "{path_to_ref}.fa.fai" +rule index_fa: + input: "{path_to_ref}.fa" + output: "{path_to_ref}.fa.fai" shell: """ - samtools faidx {input} -o {output.fai} - gatk CreateSequenceDictionary -R {input} -O {output.dict} + samtools faidx {input} -o {output} """ -rule prepFasta_fasta: - priority: 1 - input: - "{path_to_ref}.fasta" - output: - dict = "{path_to_ref}.dict" , - fai = "{path_to_ref}.fasta.fai" +rule dict_fa: + input: "{path_to_ref}.fa" + output: "{path_to_ref}.dict" shell: """ - samtools faidx {input} -o {output.fai} - gatk CreateSequenceDictionary -R {input} -O {output.dict} + gatk CreateSequenceDictionary --REFERENCE {input} --OUTPUT {output} + """ + +rule index_fasta: + input: "{path_to_ref}.fasta" + output: "{path_to_ref}.fasta.fai" + shell: """ + samtools faidx {input} -o {output} + """ + +rule dict_fasta: + input: "{path_to_ref}.fasta" + output: "{path_to_ref}.dict" + shell: + """ + gatk CreateSequenceDictionary --REFERENCE {input} --OUTPUT {output} + """ + diff --git a/environment.yml b/environment.yml index 53be3e1c..44e1a3a6 100644 --- a/environment.yml +++ b/environment.yml @@ -3,9 +3,10 @@ channels: - conda-forge - bioconda dependencies: - - python==3.8 + - python>=3.8 - pip - drop + - snakemake>=5,<8 - flake8 - bioconductor-bsgenome.hsapiens.ucsc.hg19 diff --git a/setup.cfg b/setup.cfg index 09af7983..ae813b4e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.3.3 +current_version = 1.3.4 commit = True [bumpversion:file:setup.py] diff --git a/setup.py b/setup.py index 65b687c0..9499792c 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setuptools.setup( name="drop", - version="1.3.3", + version="1.3.4", author="Vicente A. Yépez, Michaela Müller, Nicholas H. Smith, Daniela Klaproth-Andrade, Luise Schuller, Ines Scheller, Christian Mertes , Julien Gagneur ", author_email="yepez@in.tum.de",