Skip to content
This repository has been archived by the owner on Nov 19, 2024. It is now read-only.

Commit

Permalink
Merge branch 'master' of github.com:johanneskoester/alpaca
Browse files Browse the repository at this point in the history
  • Loading branch information
johanneskoester committed May 25, 2015
2 parents ec9b634 + c680d5e commit 0d1654b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 30 deletions.
65 changes: 36 additions & 29 deletions analysis/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import time
import json
import math

import pandas as pd
import numpy as np
import pysam
import vcf
Expand Down Expand Up @@ -142,13 +143,13 @@ include:
"https://bitbucket.org/johanneskoester/snakemake-workflows/raw/master/bio/ngs/rules/variant_calling/bcftools.rules"


target_fpr_vs_tpr = expand(
"plots/fpr_vs_tpr/{query_fpr}.{query_tpr}.{layout}.pdf",
target_fpr_vs_tpr = expand(expand(
"plots/fpr_vs_tpr/{{depth}}/{query_fpr}.{query_tpr}.{layout}.pdf",
zip,
query_fpr=config["fpr"]["queries"],
query_tpr=config["tpr"]["queries"],
layout=config["query_plot_layouts"]
)
), depth=[4, 20])


########################### Target rules #######################################
Expand Down Expand Up @@ -210,7 +211,7 @@ rule plot_coverages:

############################### FPR vs TPR #####################################

fpr_vs_tpr_caller = "alpaca/10 alpaca/0.05 gatk freebayes samtools".split()
fpr_vs_tpr_caller = "alpaca/1 alpaca/0.05 gatk freebayes samtools".split()
fpr_vs_tpr_labels = "ALPACA ALPACA(FDR) GATK FreeBayes SAMtools".split()


Expand Down Expand Up @@ -249,13 +250,12 @@ rule fpr_fp:
extract_depth_qual(input[0], output[0], get_samples_from_query(wildcards.query, filter_only=True))


def covered_depths(bams, output, positions=""):
def pileup_depths(bams, output, positions=""):
if positions:
positions = "--positions " + positions
shell(r"echo -e 'count\tdepth' > '{output}'; "
"samtools mpileup {positions} {bams} | "
r"awk -F '\t' '{{depth=0}} {{for (i=4; i<=NF; i+=3) depth+=$i}} {{print depth}}' | "
r"sort -n | uniq -c | awk '{{print $1,$2}}' OFS='\t' >> '{output}'")
shell(r"echo depth > '{output}'; "
"samtools mpileup -q0 -Q0 {positions} {bams} | "
r"awk -F '\t' '{{depth=0}} {{for (i=4; i<=NF; i+=3) depth+=$i}} {{print depth}}' >> '{output}'")


rule fpr_covered_depths:
Expand All @@ -271,28 +271,29 @@ rule fpr_covered_depths:
"fpr/tn/covered_depths/{query}.txt"
resources: benchmark=1
run:
covered_depths(input.bams, output[0])
pileup_depths(input.bams, output[0])


def rate(positive, negative_depths, label, min_depth=10):
def rate(positive, negative_depths, label, min_depth=1):
positive = positive[positive["depth"] >= min_depth]
negative_depths = negative_depths[negative_depths["depth"] >= min_depth]
#positive = positive[positive["qual"] >= 10]
negative_total = (negative_depths["depth"] >= min_depth).sum()

positive_cum = np.bincount(np.rint(positive["qual"]))[::-1].cumsum()[::-1]
negative_total = negative_depths["count"].sum()
positive_cum = np.bincount(np.asarray(np.rint(positive["qual"]), dtype=np.int64))[::-1].cumsum()[::-1]
negative_cum = negative_total - positive_cum

total_cum = positive_cum + negative_cum
rate_cum = positive_cum / total_cum

return pd.DataFrame({"qual": np.arange(len(rate_cum)), label: rate_cum})
first_small = np.argmax(positive_cum < 20)
return pd.DataFrame({"qual": np.arange(len(rate_cum)), label: rate_cum}), first_small


def fpr(fp, covered_depths, min_depth=10):
def fpr(fp, covered_depths, min_depth=4):
return rate(fp, covered_depths, "fpr", min_depth=min_depth)


def tpr(tp, covered_depths, min_depth=10):
def tpr(tp, covered_depths, min_depth=4):
return rate(tp, covered_depths, "tpr", min_depth=min_depth)


Expand Down Expand Up @@ -331,7 +332,7 @@ rule tpr_tp_calls:
"tpr/tp/{caller}/{query}.vcf"
resources: benchmark=1
shell:
"bedtools intersect -header -A -a '{input.vcf}' -b {input.gold} > '{output}'"
"bedtools intersect -header -a '{input.vcf}' -b {input.gold} > '{output}'"


rule tpr_tp:
Expand All @@ -358,7 +359,7 @@ rule tpr_covered_depths:
"tpr/fn/covered_depths/{query}.txt"
resources: benchmark=1
run:
covered_depths(input.bams, output[0], positions=input.gold)
pileup_depths(input.bams, output[0], positions=input.gold)


rule fpr_vs_tpr:
Expand All @@ -368,28 +369,33 @@ rule fpr_vs_tpr:
fn_cov="tpr/fn/covered_depths/{query_tpr}.txt",
tn_cov="fpr/tn/covered_depths/{query_fpr}.txt"
output:
"plots/fpr_vs_tpr/{query_fpr}.{query_tpr}.{layout,[lxy]+}.pdf"
"plots/fpr_vs_tpr/{depth,[0-9]+}/{query_fpr}.{query_tpr}.{layout,[lxy]+}.pdf"
run:
tn_cov = pd.read_table(input.tn_cov)
fn_cov = pd.read_table(input.fn_cov)
depth = int(wildcards.depth)

figure(figsize=(2.8, 2.5))
styles = "- -b -- : -.".split()
styles = "- x -- : -.".split()
for tp, fp, style, label in zip(input.tp, input.fp, styles, fpr_vs_tpr_labels):
tp = pd.read_table(tp)
fp = pd.read_table(fp)
d = pd.merge(fpr(fp, tn_cov), tpr(tp, fn_cov))

plt.plot(d["fpr"], d["tpr"], style, label=label)
_fpr, fpr_small = fpr(fp, tn_cov, depth)
_tpr, tpr_small = tpr(tp, fn_cov, depth)
d = pd.merge(_fpr, _tpr)
if label == "ALPACA(FDR)":
plt.semilogx(d["fpr"][0], d["tpr"][0], style, mew=1)
else:
plt.semilogx(d["fpr"], d["tpr"], style, label=label)

if "x" in wildcards.layout:
plt.xlabel("FPR")
if "y" in wildcards.layout:
plt.ylabel("TPR")
plt.ylim((0, 1))
plt.xlim((0, 1))
#plt.ylim((0, 1))
#plt.xlim((0, 0.0001))
if "l" in wildcards.layout:
plt.legend(loc="lower left", handlelength=2.5)
plt.legend(loc="lower right", handlelength=2.5)
savefig(output[0], bbox_inches="tight")

############################### run time performance ###########################
Expand Down Expand Up @@ -604,7 +610,7 @@ rule alpaca_preprocess:
benchmark:
"benchmarks/alpaca/{sample}.preprocess.json"
shell:
"alpaca preprocess --threads {threads} "
"alpaca preprocess --no-BAQ --threads {threads} "
"{input.ref} {input.bam} > {output} 2> {log}"


Expand Down Expand Up @@ -728,6 +734,7 @@ rule gatk_genotyping:
resources: benchmark=100
shell:
"gatk -T GenotypeGVCFs {params.gvcfs} -nt {threads} -R {input.ref} "
"-stand_call_conf 1 -stand_emit_conf 1 "
"-o '{output}' &> '{log}'"


Expand Down Expand Up @@ -834,7 +841,7 @@ rule samtools:
resources: benchmark=100
shell:
"cut -f1 {input.fai} | parallel -j {threads} "
"'samtools mpileup -t DP -gu -C50 -f {input.ref} "
"'samtools mpileup -t DP -gu -B -C50 -Q0 -q1 -f {input.ref} "
"-r {{}} {input.bams} | "
"bcftools call -m -v - > \"{output}.{{}}.bcf\"'; "
"cut -f1 {input.fai} | "
Expand Down
3 changes: 2 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ pub fn preprocess<P: AsRef<Path> + Sync>(fasta: &P, bams: &[P], threads: usize,
.arg("--fasta-ref").arg(fasta.as_ref())
.arg("--BCF").arg("--uncompressed")
.arg("--output-tags").arg("DP")
.arg("--min-MQ").arg("1") // minimum mapping quality
.arg("--min-MQ").arg("1") // minimum mapping quality (discard ambiguous reads)
.arg("--min-BQ").arg("0") // use all bases (the model cares for bad quality)
.arg("--adjust-MQ").arg("50") // mapping quality downgrade in case of excessive mismatches
.arg("--output").arg(&fifo)
.args(&bams.iter().map(|bam| bam.as_ref()).collect_vec());
Expand Down

0 comments on commit 0d1654b

Please sign in to comment.