diff --git a/drop/config/submodules/MonoallelicExpression.py b/drop/config/submodules/MonoallelicExpression.py index 91ef5e04..5d87fb96 100644 --- a/drop/config/submodules/MonoallelicExpression.py +++ b/drop/config/submodules/MonoallelicExpression.py @@ -17,7 +17,7 @@ def __init__( super().__init__(config, sampleAnnotation, processedDataDir, processedResultsDir, workDir) self.CONFIG_KEYS = [ "groups", "genome", "qcVcf", "qcGroups", "gatkIgnoreHeaderCheck", "padjCutoff", - "allelicRatioCutoff", "maxAF", "gnomAD" + "allelicRatioCutoff", "maxAF", "gnomAD","maxVarFreqCohort" ] self.name = "MonoallelicExpression" # if self.run is false return without doing any config/sa checks for completeness @@ -173,7 +173,10 @@ def setGenomeDict(self, genomeFiles): # look up for a sampleID genomeFiles{ncbi -> path} and sampleGenomes {sampleID -> ncbi} def getGenomePath(self, sampleID): try: - return self.genomeFiles[self.sampleGenomes[sampleID]] + if len(self.genomeFiles) == 1: + return list(self.genomeFiles.values())[0] + else: + return self.genomeFiles[self.sampleGenomes[sampleID]] except KeyError: raise KeyError( f"The Config file has defined specific key,value for genome path " diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml index 4f09bd7d..f79852d0 100755 --- a/drop/demo/config_relative.yaml +++ b/drop/demo/config_relative.yaml @@ -57,7 +57,7 @@ mae: allelicRatioCutoff: 0.8 addAF: false maxAF: .001 - maxVarFreqCohort: .04 + maxVarFreqCohort: 1 # VCF-BAM matching qcVcf: Data/qc_vcf_1000G.vcf.gz qcGroups: diff --git a/drop/modules/mae-pipeline/MAE/Datasets.R b/drop/modules/mae-pipeline/MAE/Datasets.R index 451c1906..a6bb8eb1 100644 --- a/drop/modules/mae-pipeline/MAE/Datasets.R +++ b/drop/modules/mae-pipeline/MAE/Datasets.R @@ -24,7 +24,7 @@ devNull <- sapply(datasets, function(name){ cat(paste0( "

Dataset: ", name, "

", "

", - "
", "MAE results", + "
", "MAE results", "
", "

" )) }) diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R index a7aa67b7..55de53d9 100644 --- a/drop/modules/mae-pipeline/MAE/Results.R +++ b/drop/modules/mae-pipeline/MAE/Results.R @@ -7,6 +7,7 @@ #' params: #' - allelicRatioCutoff: '`sm cfg.MAE.get("allelicRatioCutoff")`' #' - padjCutoff: '`sm cfg.MAE.get("padjCutoff")`' +#' - maxCohortFreq: '`sm cfg.MAE.get("maxVarFreqCohort")`' #' input: #' - mae_res: '`sm lambda w: expand(cfg.getProcessedResultsDir() + #' "/mae/samples/{id}_res.Rds", id=cfg.MAE.getMaeByGroup({w.dataset}))`' @@ -18,6 +19,8 @@ #' "/mae/{dataset}/MAE_results_all_{annotation}.tsv.gz"`' #' - res_signif: '`sm cfg.getProcessedResultsDir() + #' "/mae/{dataset}/MAE_results_{annotation}.tsv"`' +#' - res_signif_rare: '`sm cfg.getProcessedResultsDir() + +#' "/mae/{dataset}/MAE_results_{annotation}_rare.tsv"`' #' - wBhtml: '`sm config["htmlOutputPath"] + #' "/MonoallelicExpression/{dataset}--{annotation}_results.html"`' #' type: noindex @@ -77,12 +80,19 @@ res[, c('aux') := NULL] # Bring gene_name column front res <- cbind(res[, .(gene_name)], res[, -"gene_name"]) +# Calculate variant frequency within cohort +maxCohortFreq <-snakemake@params$maxCohortFreq +res[, N_var := .N, by = .(gene_name, contig, position)] +res[, cohort_freq := N_var / uniqueN(ID)] + +res[,rare := (rare | is.na(rare)) & cohort_freq <= maxCohortFreq] + # Add significance columns allelicRatioCutoff <- snakemake@params$allelicRatioCutoff res[, MAE := padj <= snakemake@params$padjCutoff & - (altRatio >= allelicRatioCutoff | altRatio <= (1-allelicRatioCutoff))] + (altRatio >= allelicRatioCutoff | altRatio <= (1-allelicRatioCutoff)) + ] res[, MAE_ALT := MAE == TRUE & altRatio >= allelicRatioCutoff] - #' #' Number of samples: `r uniqueN(res$ID)` #' @@ -100,6 +110,10 @@ fwrite(res, snakemake@output$res_all, sep = '\t', fwrite(res[MAE_ALT == TRUE], snakemake@output$res_signif, sep = '\t', row.names = F, quote = F) +# Save significant results +fwrite(res[MAE_ALT == TRUE & rare == TRUE], snakemake@output$res_signif_rare, + sep = '\t', row.names = F, quote = F) + # Add columns for plot res[, N := .N, by = ID] @@ -125,6 +139,12 @@ ggplot(melt_dt, aes(variable, value)) + geom_boxplot() + labs(y = 'Heterozygous SNVs per patient', x = '') + annotation_logticks(sides = "l") +#' +#' ## Variant Frequency within Cohort Histogram +ggplot(unique(res[,cohort_freq,by =.(gene_name, contig, position)]),aes(x = cohort_freq)) + geom_histogram( binwidth = 0.02) + + geom_vline(xintercept = maxCohortFreq, col = "red") + + xlab("Variant frequency in cohort") + ylab("Count") + #' Median of each category DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable]) diff --git a/drop/modules/mae-pipeline/MAE/filterSNVs.sh b/drop/modules/mae-pipeline/MAE/filterSNVs.sh index 54386d47..c59449c3 100755 --- a/drop/modules/mae-pipeline/MAE/filterSNVs.sh +++ b/drop/modules/mae-pipeline/MAE/filterSNVs.sh @@ -27,6 +27,7 @@ if [ $vcf_id != 'QC' ]; then sample_flag="-s ${vcf_id}"; fi $bcftools view $vcf_file | \ grep -vP '^##INFO=' | \ awk -F'\t' 'BEGIN {OFS = FS} { if($1 ~ /^[^#]/){ $8 = "." }; print $0 }' | \ + $bcftools norm -m-both | \ $bcftools view ${sample_flag} -m2 -M2 -v snps -O z -o $tmp $bcftools index -t $tmp diff --git a/tests/config/test_MAE.py b/tests/config/test_MAE.py index 54bf1d80..54a2d8e6 100644 --- a/tests/config/test_MAE.py +++ b/tests/config/test_MAE.py @@ -11,7 +11,7 @@ def test_config(self,dropConfig,demo_dir): 'allelicRatioCutoff': 0.8, 'maxAF': 0.001, 'addAF': False, - 'maxVarFreqCohort': 0.04, + 'maxVarFreqCohort': 1, 'gnomAD': False } assert dict_.items() <= dropConfig.MAE.dict_.items()