Merge branch 'master' of github.com:johanneskoester/alpaca

johanneskoester · May 25, 2015 · 0d1654b · 0d1654b
2 parents ec9b634 + c680d5e
commit 0d1654b
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 30 deletions.
diff --git a/analysis/Snakefile b/analysis/Snakefile
@@ -7,6 +7,7 @@ import time
 import json
 import math
 
+import pandas as pd
 import numpy as np
 import pysam
 import vcf
@@ -142,13 +143,13 @@ include:
     "https://bitbucket.org/johanneskoester/snakemake-workflows/raw/master/bio/ngs/rules/variant_calling/bcftools.rules"
 
 
-target_fpr_vs_tpr =  expand(
-    "plots/fpr_vs_tpr/{query_fpr}.{query_tpr}.{layout}.pdf",
+target_fpr_vs_tpr =  expand(expand(
+    "plots/fpr_vs_tpr/{{depth}}/{query_fpr}.{query_tpr}.{layout}.pdf",
     zip,
     query_fpr=config["fpr"]["queries"],
     query_tpr=config["tpr"]["queries"],
     layout=config["query_plot_layouts"]
-)
+), depth=[4, 20])
 
 
 ########################### Target rules #######################################
@@ -210,7 +211,7 @@ rule plot_coverages:
 
 ############################### FPR vs TPR #####################################
 
-fpr_vs_tpr_caller = "alpaca/10 alpaca/0.05 gatk freebayes samtools".split()
+fpr_vs_tpr_caller = "alpaca/1 alpaca/0.05 gatk freebayes samtools".split()
 fpr_vs_tpr_labels = "ALPACA ALPACA(FDR) GATK FreeBayes SAMtools".split()
 
 
@@ -249,13 +250,12 @@ rule fpr_fp:
         extract_depth_qual(input[0], output[0], get_samples_from_query(wildcards.query, filter_only=True))
 
 
-def covered_depths(bams, output, positions=""):
+def pileup_depths(bams, output, positions=""):
     if positions:
         positions = "--positions " + positions
-    shell(r"echo -e 'count\tdepth' > '{output}'; "
-    "samtools mpileup {positions} {bams} | "
-    r"awk -F '\t' '{{depth=0}} {{for (i=4; i<=NF; i+=3) depth+=$i}} {{print depth}}' | "
-    r"sort -n | uniq -c | awk '{{print $1,$2}}' OFS='\t' >> '{output}'")
+    shell(r"echo depth > '{output}'; "
+    "samtools mpileup -q0 -Q0 {positions} {bams} | "
+    r"awk -F '\t' '{{depth=0}} {{for (i=4; i<=NF; i+=3) depth+=$i}} {{print depth}}' >> '{output}'")
 
 
 rule fpr_covered_depths:
@@ -271,28 +271,29 @@ rule fpr_covered_depths:
         "fpr/tn/covered_depths/{query}.txt"
     resources: benchmark=1
     run:
-        covered_depths(input.bams, output[0])
+        pileup_depths(input.bams, output[0])
 
 
-def rate(positive, negative_depths, label, min_depth=10):
+def rate(positive, negative_depths, label, min_depth=1):
     positive = positive[positive["depth"] >= min_depth]
-    negative_depths = negative_depths[negative_depths["depth"] >= min_depth]
+    #positive = positive[positive["qual"] >= 10]
+    negative_total = (negative_depths["depth"] >= min_depth).sum()
 
-    positive_cum = np.bincount(np.rint(positive["qual"]))[::-1].cumsum()[::-1]
-    negative_total = negative_depths["count"].sum()
+    positive_cum = np.bincount(np.asarray(np.rint(positive["qual"]), dtype=np.int64))[::-1].cumsum()[::-1]
     negative_cum = negative_total - positive_cum
 
     total_cum = positive_cum + negative_cum
     rate_cum = positive_cum / total_cum
 
-    return pd.DataFrame({"qual": np.arange(len(rate_cum)), label: rate_cum})
+    first_small = np.argmax(positive_cum < 20)
+    return pd.DataFrame({"qual": np.arange(len(rate_cum)), label: rate_cum}), first_small
 
 
-def fpr(fp, covered_depths, min_depth=10):
+def fpr(fp, covered_depths, min_depth=4):
     return rate(fp, covered_depths, "fpr", min_depth=min_depth)
 
 
-def tpr(tp, covered_depths, min_depth=10):
+def tpr(tp, covered_depths, min_depth=4):
     return rate(tp, covered_depths, "tpr", min_depth=min_depth)
 
 
@@ -331,7 +332,7 @@ rule tpr_tp_calls:
         "tpr/tp/{caller}/{query}.vcf"
     resources: benchmark=1
     shell:
-        "bedtools intersect -header -A -a '{input.vcf}' -b {input.gold} > '{output}'"
+        "bedtools intersect -header -a '{input.vcf}' -b {input.gold} > '{output}'"
 
 
 rule tpr_tp:
@@ -358,7 +359,7 @@ rule tpr_covered_depths:
         "tpr/fn/covered_depths/{query}.txt"
     resources: benchmark=1
     run:
-        covered_depths(input.bams, output[0], positions=input.gold)
+        pileup_depths(input.bams, output[0], positions=input.gold)
 
 
 rule fpr_vs_tpr:
@@ -368,28 +369,33 @@ rule fpr_vs_tpr:
         fn_cov="tpr/fn/covered_depths/{query_tpr}.txt",
         tn_cov="fpr/tn/covered_depths/{query_fpr}.txt"
     output:
-        "plots/fpr_vs_tpr/{query_fpr}.{query_tpr}.{layout,[lxy]+}.pdf"
+        "plots/fpr_vs_tpr/{depth,[0-9]+}/{query_fpr}.{query_tpr}.{layout,[lxy]+}.pdf"
     run:
         tn_cov = pd.read_table(input.tn_cov)
         fn_cov = pd.read_table(input.fn_cov)
+        depth = int(wildcards.depth)
 
         figure(figsize=(2.8, 2.5))
-        styles = "- -b -- : -.".split()
+        styles = "- x -- : -.".split()
         for tp, fp, style, label in zip(input.tp, input.fp, styles, fpr_vs_tpr_labels):
             tp = pd.read_table(tp)
             fp = pd.read_table(fp)
-            d = pd.merge(fpr(fp, tn_cov), tpr(tp, fn_cov))
-
-            plt.plot(d["fpr"], d["tpr"], style, label=label)
+            _fpr, fpr_small = fpr(fp, tn_cov, depth)
+            _tpr, tpr_small = tpr(tp, fn_cov, depth)
+            d = pd.merge(_fpr, _tpr)
+            if label == "ALPACA(FDR)":
+                plt.semilogx(d["fpr"][0], d["tpr"][0], style, mew=1)
+            else:
+                plt.semilogx(d["fpr"], d["tpr"], style, label=label)
 
         if "x" in wildcards.layout:
             plt.xlabel("FPR")
         if "y" in wildcards.layout:
             plt.ylabel("TPR")
-        plt.ylim((0, 1))
-        plt.xlim((0, 1))
+        #plt.ylim((0, 1))
+        #plt.xlim((0, 0.0001))
         if "l" in wildcards.layout:
-            plt.legend(loc="lower left", handlelength=2.5)
+            plt.legend(loc="lower right", handlelength=2.5)
         savefig(output[0], bbox_inches="tight")
 
 ############################### run time performance ###########################
@@ -604,7 +610,7 @@ rule alpaca_preprocess:
     benchmark:
         "benchmarks/alpaca/{sample}.preprocess.json"
     shell:
-        "alpaca preprocess --threads {threads} "
+        "alpaca preprocess --no-BAQ --threads {threads} "
         "{input.ref} {input.bam} > {output} 2> {log}"
 
 
@@ -728,6 +734,7 @@ rule gatk_genotyping:
     resources: benchmark=100
     shell:
         "gatk -T GenotypeGVCFs {params.gvcfs} -nt {threads} -R {input.ref} "
+        "-stand_call_conf 1 -stand_emit_conf 1 "
         "-o '{output}' &> '{log}'"
 
 
@@ -834,7 +841,7 @@ rule samtools:
     resources: benchmark=100
     shell:
         "cut -f1 {input.fai} | parallel -j {threads} "
-        "'samtools mpileup -t DP -gu -C50 -f {input.ref} "
+        "'samtools mpileup -t DP -gu -B -C50 -Q0 -q1 -f {input.ref} "
         "-r {{}} {input.bams} | "
         "bcftools call -m -v - > \"{output}.{{}}.bcf\"'; "
         "cut -f1 {input.fai} | "

diff --git a/src/cli.rs b/src/cli.rs
@@ -46,7 +46,8 @@ pub fn preprocess<P: AsRef<Path> + Sync>(fasta: &P, bams: &[P], threads: usize,
             .arg("--fasta-ref").arg(fasta.as_ref())
             .arg("--BCF").arg("--uncompressed")
             .arg("--output-tags").arg("DP")
-            .arg("--min-MQ").arg("1")  // minimum mapping quality
+            .arg("--min-MQ").arg("1")  // minimum mapping quality (discard ambiguous reads)
+            .arg("--min-BQ").arg("0")  // use all bases (the model cares for bad quality)
             .arg("--adjust-MQ").arg("50")  // mapping quality downgrade in case of excessive mismatches
             .arg("--output").arg(&fifo)
             .args(&bams.iter().map(|bam| bam.as_ref()).collect_vec());