Skip to content

Commit

Permalink
ancestry: allow globs
Browse files Browse the repository at this point in the history
closes #59
  • Loading branch information
brentp committed Sep 11, 2020
1 parent bd81c4b commit 05e3b8c
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ v0.2.12 (dev)
=============
+ add checkbox to HTML to scale IBS0, IBS2, etc. by the number of sites shared by the samples. This almost always
results in a scaling that is better across (pairs of) samples.
+ ancestry: allow globs for ancestry files (#59)


v0.2.11
Expand Down
3 changes: 3 additions & 0 deletions src/somalierpkg/ancestry.nim
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import strformat
import arraymancer
import ./relate
import ./depthview
import ./common
import argparse
import sets
import arraymancer
Expand Down Expand Up @@ -104,6 +105,8 @@ proc ancestry_main*() =
echo p.help
quit "send argument for extracted files"

opts.extracted.update_with_glob

var (labeled_samples, query_samples) = opts.extracted.split_labeled_samples

var train_mat = newSeq[seq[float32]](labeled_samples.len)
Expand Down
16 changes: 16 additions & 0 deletions src/somalierpkg/common.nim
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import strformat
import algorithm
import hts/private/hts_concat
import hts/fai
import os

const formatVersion* = 2'u8

Expand All @@ -16,6 +17,21 @@ type Site* = object
## counts if swap is true.
flip*:bool


proc update_with_glob*(files: var seq[string], max_globs: int = 10) =
  ## Expand glob patterns in `files` in place.
  ##
  ## Only the first `max_globs` entries are passed through `walkFiles`; every
  ## entry after that is kept verbatim. The default of 10 is an optimization:
  ## there can be thousands of input files and running a glob on each one
  ## seemed like it might be expensive. Callers without that concern can pass
  ## `high(int)` to expand every entry. Values below 0 are treated as 0.
  ##
  ## Among the expanded entries, a literal "++" is copied through unchanged,
  ## and an entry for which `walkFiles` yields nothing contributes nothing to
  ## the result (it is silently dropped).
  let n_expand = max(0, min(files.len, max_globs))
  var expanded = newSeqOfCap[string](256)
  for i in 0..<n_expand:
    if files[i] == "++":
      # NOTE(review): "++" is presumably an argument separator that later
      # parsing relies on -- confirm against the callers. It is never globbed.
      expanded.add(files[i])
      continue
    for path in files[i].walkFiles:
      expanded.add(path)

  # Entries beyond the expansion window are appended untouched.
  if n_expand < files.len:
    files = expanded & files[n_expand..files.high]
  else:
    files = expanded

{.push checks: off, optimization:speed.}
proc toSite(toks: seq[string]): Site =
result = Site()
Expand Down
13 changes: 1 addition & 12 deletions src/somalierpkg/relate.nim
Original file line number Diff line number Diff line change
Expand Up @@ -868,17 +868,6 @@ proc toj(sample_names: seq[string], stats: seq[Stat4], gt_counts: array[5, seq[u
))
result.add("]")

proc update_with_glob(files: var seq[string]) =
  ## Replace the leading entries of `files` with the paths that `walkFiles`
  ## yields for them. Only the first 10 entries are treated as patterns;
  ## anything after that is carried over unchanged. An entry that yields no
  ## path contributes nothing to the result.
  let head_len = min(files.len, 10)
  var matched = newSeqOfCap[string](256)
  for idx in countup(0, head_len - 1):
    for path in walkFiles(files[idx]):
      matched.add(path)

  # keep the un-globbed tail (if any) after the expanded head
  if files.len <= 10:
    files = matched
  else:
    files = matched & files[10..files.high]

proc add_prefixed_samples(groups: var seq[pair], samples: seq[string], prefixes: seq[string]) =
# update groups so that sample == ${prefix}sample
#if len(prefixes) == 0: return
Expand Down Expand Up @@ -1034,7 +1023,7 @@ specified as comma-separated groups per line e.g.:
relGt0p2.mgetOrPut(rel.sample_a, @[]).add(rel.sample_b)
relGt0p2.mgetOrPut(rel.sample_b, @[]).add(rel.sample_a)

let ra = random(1'f32)
let ra = rand(1'f32)
let interesting = expected_relatedness != -1 or rr > 0.05
if (ra > proportion_sampled) and not interesting:
continue
Expand Down

0 comments on commit 05e3b8c

Please sign in to comment.