Skip to content

Commit

Permalink
Mae filter (#241)
Browse files Browse the repository at this point in the history
* Update to version 1.1.0 (#229)

Major changes included in this update is:
* adapt to new snakemake version (#208)
* Speedup CI (use micromamba, #218)
* Add HTML cutoff for results (#217)
* Separate modules (#209)
* Update documentation (#228)

On top of this many small bug-fixes are included:
* #175 #192 #198 #231
* Simplify result reporting


Co-authored-by: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Co-authored-by: Vicente <yepez@in.tum.de>
Co-authored-by: Michaela Mueller <51025211+mumichae@users.noreply.github.com>
Co-authored-by: Michaela Mueller <mumichae@in.tum.de>
Co-authored-by: Smith Nicholas <smith@in.tum.de>
Co-authored-by: nickhsmith <smithnickh@gmail.com>
Co-authored-by: Christian Mertes <mertes@in.tum.de>

* add splitting to MAE filterSNVs.sh

* fix broken link and maxVarFreqCohort

* fix broken link and maxVarFreqCohort

* Update test_MAE.py

* code review fixes

Co-authored-by: Michaela Mueller <51025211+mumichae@users.noreply.github.com>
Co-authored-by: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Co-authored-by: Vicente <yepez@in.tum.de>
Co-authored-by: Michaela Mueller <mumichae@in.tum.de>
Co-authored-by: Smith Nicholas <smith@in.tum.de>
Co-authored-by: Christian Mertes <mertes@in.tum.de>
  • Loading branch information
7 people authored Aug 11, 2021
1 parent 2b4bf5d commit 8df2d1e
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 7 deletions.
7 changes: 5 additions & 2 deletions drop/config/submodules/MonoallelicExpression.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(
super().__init__(config, sampleAnnotation, processedDataDir, processedResultsDir, workDir)
self.CONFIG_KEYS = [
"groups", "genome", "qcVcf", "qcGroups", "gatkIgnoreHeaderCheck", "padjCutoff",
"allelicRatioCutoff", "maxAF", "gnomAD"
"allelicRatioCutoff", "maxAF", "gnomAD","maxVarFreqCohort"
]
self.name = "MonoallelicExpression"
# if self.run is false return without doing any config/sa checks for completeness
Expand Down Expand Up @@ -173,7 +173,10 @@ def setGenomeDict(self, genomeFiles):
# look up for a sampleID genomeFiles{ncbi -> path} and sampleGenomes {sampleID -> ncbi}
def getGenomePath(self, sampleID):
try:
return self.genomeFiles[self.sampleGenomes[sampleID]]
if len(self.genomeFiles) == 1:
return list(self.genomeFiles.values())[0]
else:
return self.genomeFiles[self.sampleGenomes[sampleID]]
except KeyError:
raise KeyError(
f"The Config file has defined specific key,value for genome path "
Expand Down
2 changes: 1 addition & 1 deletion drop/demo/config_relative.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ mae:
allelicRatioCutoff: 0.8
addAF: false
maxAF: .001
maxVarFreqCohort: .04
maxVarFreqCohort: 1
# VCF-BAM matching
qcVcf: Data/qc_vcf_1000G.vcf.gz
qcGroups:
Expand Down
2 changes: 1 addition & 1 deletion drop/modules/mae-pipeline/MAE/Datasets.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ devNull <- sapply(datasets, function(name){
cat(paste0(
"<h1>Dataset: ", name, "</h1>",
"<p>",
"</br>", "<a href='MAE/", name, "--", version, "_results.html' >MAE results</a>",
"</br>", "<a href='MonoallelicExpression/", name, "--", version, "_results.html' >MAE results</a>",
"</br>", "</p>"
))
})
Expand Down
24 changes: 22 additions & 2 deletions drop/modules/mae-pipeline/MAE/Results.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#' params:
#' - allelicRatioCutoff: '`sm cfg.MAE.get("allelicRatioCutoff")`'
#' - padjCutoff: '`sm cfg.MAE.get("padjCutoff")`'
#' - maxCohortFreq: '`sm cfg.MAE.get("maxVarFreqCohort")`'
#' input:
#' - mae_res: '`sm lambda w: expand(cfg.getProcessedResultsDir() +
#' "/mae/samples/{id}_res.Rds", id=cfg.MAE.getMaeByGroup({w.dataset}))`'
Expand All @@ -18,6 +19,8 @@
#' "/mae/{dataset}/MAE_results_all_{annotation}.tsv.gz"`'
#' - res_signif: '`sm cfg.getProcessedResultsDir() +
#' "/mae/{dataset}/MAE_results_{annotation}.tsv"`'
#' - res_signif_rare: '`sm cfg.getProcessedResultsDir() +
#' "/mae/{dataset}/MAE_results_{annotation}_rare.tsv"`'
#' - wBhtml: '`sm config["htmlOutputPath"] +
#' "/MonoallelicExpression/{dataset}--{annotation}_results.html"`'
#' type: noindex
Expand Down Expand Up @@ -77,12 +80,19 @@ res[, c('aux') := NULL]
# Bring gene_name column front
res <- cbind(res[, .(gene_name)], res[, -"gene_name"])

# Calculate variant frequency within cohort
maxCohortFreq <-snakemake@params$maxCohortFreq
res[, N_var := .N, by = .(gene_name, contig, position)]
res[, cohort_freq := N_var / uniqueN(ID)]

res[,rare := (rare | is.na(rare)) & cohort_freq <= maxCohortFreq]

# Add significance columns
allelicRatioCutoff <- snakemake@params$allelicRatioCutoff
res[, MAE := padj <= snakemake@params$padjCutoff &
(altRatio >= allelicRatioCutoff | altRatio <= (1-allelicRatioCutoff))]
(altRatio >= allelicRatioCutoff | altRatio <= (1-allelicRatioCutoff))
]
res[, MAE_ALT := MAE == TRUE & altRatio >= allelicRatioCutoff]

#'
#' Number of samples: `r uniqueN(res$ID)`
#'
Expand All @@ -100,6 +110,10 @@ fwrite(res, snakemake@output$res_all, sep = '\t',
fwrite(res[MAE_ALT == TRUE], snakemake@output$res_signif,
sep = '\t', row.names = F, quote = F)

# Save significant results
fwrite(res[MAE_ALT == TRUE & rare == TRUE], snakemake@output$res_signif_rare,
sep = '\t', row.names = F, quote = F)


# Add columns for plot
res[, N := .N, by = ID]
Expand All @@ -125,6 +139,12 @@ ggplot(melt_dt, aes(variable, value)) + geom_boxplot() +
labs(y = 'Heterozygous SNVs per patient', x = '') +
annotation_logticks(sides = "l")

#'
#' ## Variant Frequency within Cohort Histogram
ggplot(unique(res[,cohort_freq,by =.(gene_name, contig, position)]),aes(x = cohort_freq)) + geom_histogram( binwidth = 0.02) +
geom_vline(xintercept = maxCohortFreq, col = "red") +
xlab("Variant frequency in cohort") + ylab("Count")

#' Median of each category
DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable])

Expand Down
1 change: 1 addition & 0 deletions drop/modules/mae-pipeline/MAE/filterSNVs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ if [ $vcf_id != 'QC' ]; then sample_flag="-s ${vcf_id}"; fi
$bcftools view $vcf_file | \
grep -vP '^##INFO=' | \
awk -F'\t' 'BEGIN {OFS = FS} { if($1 ~ /^[^#]/){ $8 = "." }; print $0 }' | \
$bcftools norm -m-both | \
$bcftools view ${sample_flag} -m2 -M2 -v snps -O z -o $tmp
$bcftools index -t $tmp

Expand Down
2 changes: 1 addition & 1 deletion tests/config/test_MAE.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_config(self,dropConfig,demo_dir):
'allelicRatioCutoff': 0.8,
'maxAF': 0.001,
'addAF': False,
'maxVarFreqCohort': 0.04,
'maxVarFreqCohort': 1,
'gnomAD': False
}
assert dict_.items() <= dropConfig.MAE.dict_.items()
Expand Down

0 comments on commit 8df2d1e

Please sign in to comment.