Skip to content

Commit

Permalink
add --drop
Browse files Browse the repository at this point in the history
  • Loading branch information
brentp committed Sep 19, 2018
1 parent fe2ddc2 commit 29bec5c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ v0.0.4 (dev)
============
+ small bug-fixes
+ duphold now uses about half as much memory
+ add -d/--drop flag which will drop all samples from a VCF except the
one matching the sample in --bam. this simplifies per-sample
parallelization followed by merge.


v0.0.3
======
Expand Down
33 changes: 28 additions & 5 deletions src/duphold.nim
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ proc update*(s:var Stats, d:int, include_zero:bool) {.inline.} =
s.m += (df - s.m) / s.n.float64
s.S += (df - s.m) * (df - mprev)

proc clear*(s:var Stats) =
s.n = 0
s.S = 0
s.m = 0
s.t = 0

proc stddev*(s:Stats): float64 {.inline.} =
sqrt(s.S/s.n.float64)

proc addm*(s:Stats, d:int) {.inline.} =
# update only the t and n.
# not that this method is never used on a struct where the update method is also used.
Expand Down Expand Up @@ -58,7 +67,7 @@ proc count_gc(s:string): float32 =
result += 1
result /= s.len.float32

proc gc_content(fai:Fai, chrom:string, step:int): seq[float32] =
proc gc_content*(fai:Fai, chrom:string, step:int): seq[float32] =
var s = fai.get(chrom).toUpperAscii()
result = newSeq[float32]((s.len/step+1).int)

Expand Down Expand Up @@ -166,7 +175,8 @@ proc check_rapid_depth_change[T](start:int, stop:int, values: var seq[T], w:int=
if changes > 2:
result = 0

proc duphold*[T](variant:Variant, values:var seq[T], sample_i: int, stats:var Stats, gc_stats:var seq[Stats], fai:Fai) =
proc duphold*[T](variant:Variant, values:var seq[T], sample_i: int, stats:var Stats, gc_stats:var seq[Stats], fai:Fai): float64 =
## sets FORMAT fields for sample i in the variant and returns the DHBFC value
var
s = variant.start
e = variant.stop
Expand Down Expand Up @@ -201,6 +211,7 @@ proc duphold*[T](variant:Variant, values:var seq[T], sample_i: int, stats:var St

get_or_empty(variant, "DHBFC", floats)
var gfc = local_stats.m / gc_stat.m
result = gfc
floats[sample_i] = gfc.float32
if variant.format.set("DHBFC", floats) != Status.OK:
quit "error setting DHBFC in VCF"
Expand All @@ -212,8 +223,16 @@ proc duphold*[T](variant:Variant, values:var seq[T], sample_i: int, stats:var St
quit "error setting DHD in VCF"

proc fill_stats*[T](depths: var seq[T], stats:var Stats, gc_stats:var seq[Stats], gc_count:var seq[float32], step:int, target_length:int) =
stats.clear()
for v in depths:
stats.update(v, false)

var dmax = int(stats.m + 4 * sqrt(stats.m))
stats.clear()
for v in depths:
if v < dmax:
stats.update(v, false)

# for each window of length step, gc_count holds the proportion of bases that were G or C


Expand Down Expand Up @@ -243,7 +262,7 @@ iterator duphold*(bam:Bam, vcf:VCF, fai:Fai, sample_i:int, step:int=STEP): Varia

for variant in vcf:
if variant.CHROM == last_chrom:
variant.duphold(depths.values, sample_i, stats, gc_stats, fai)
discard variant.duphold(depths.values, sample_i, stats, gc_stats, fai)

yield variant
continue
Expand All @@ -270,7 +289,7 @@ iterator duphold*(bam:Bam, vcf:VCF, fai:Fai, sample_i:int, step:int=STEP): Varia
gc_count = fai.gc_content(last_chrom, step)
depths.values.fill_stats(stats, gc_stats, gc_count, step, target.length.int)

variant.duphold(depths.values, sample_i, stats, gc_stats, fai)
discard variant.duphold(depths.values, sample_i, stats, gc_stats, fai)
yield variant

proc sample_name(b:Bam): string =
Expand All @@ -294,6 +313,7 @@ Options:
-f --fasta <path> indexed fasta reference.
-t --threads <int> number of decompression threads. [default: 4]
-o --output <string> output VCF/BCF (default is VCF to stdout) [default: -]
-d --drop drop all samples from a multi-sample --vcf *except* the sample in --bam. useful for parallelization by sample followed by merge.
-h --help show help
""")

Expand Down Expand Up @@ -333,7 +353,6 @@ Options:
if vcf.header.add_format("DHD", "1", "Integer", "duphold rapid change in depth at one of the break-points (1 for higher. 0 for no or conflicting changes. -1 for drop, 2 for both break points)") != Status.OK:
quit "unable to add to header"

ovcf.header = vcf.header

open(bam, $args["--bam"], index=true, threads=parseInt($args["--threads"]), fai=($args["--fasta"]))
if bam == nil:
Expand All @@ -344,6 +363,10 @@ Options:
sample_i = vcf.samples.find(bam.sample_name)
if sample_i == -1:
quit "couldn't find sample:" & bam.sample_name & " in vcf which had:" & join(vcf.samples, ",")
if args["--drop"]:
vcf.set_samples(@[bam.sample_name])
sample_i = 0
ovcf.header = vcf.header

if not ovcf.write_header():
quit "couldn't write vcf header"
Expand Down

0 comments on commit 29bec5c

Please sign in to comment.