Skip to content

Commit

Permalink
Merge pull request #162 from chrovis/feature/genotype
Browse files Browse the repository at this point in the history
Add utility functions for manipulating genotypes
  • Loading branch information
athos authored May 20, 2019
2 parents 17060cd + 25a9883 commit f4ab045
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 15 deletions.
6 changes: 3 additions & 3 deletions src/cljam/io/vcf/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
protocols/IRegionReader
(read-in-region [this region]
(protocols/read-in-region this region {}))
(read-in-region [this {:keys [chr start end]} {:keys [depth] :as option}]
(read-in-region [this {:keys [chr start end]} option]
(logging/warn "May cause degradation of performance.")
(filter
(fn [v] (and (if chr (= (:chr v) chr) true)
Expand Down Expand Up @@ -141,7 +141,7 @@
;; ------------------

(defn- parse-data-line
[line header kws]
[line kws]
;; When splitting a string with single-character delimiter,
;; `java.lang.String#split` is slightly faster than `clojure.string/split`.
;; For more details, please refer to https://github.com/chrovis/cljam/pull/29.
Expand All @@ -165,7 +165,7 @@
[^java.io.BufferedReader rdr header kws]
(when-let [line (.readLine rdr)]
(if-not (or (meta-line? line) (header-line? line))
(cons (parse-data-line line header kws)
(cons (parse-data-line line kws)
(lazy-seq (read-data-lines rdr header kws)))
(read-data-lines rdr header kws))))

Expand Down
72 changes: 64 additions & 8 deletions src/cljam/io/vcf/util.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
(ns cljam.io.vcf.util
(:require [clojure.string :as cstr]
[cljam.util :as util]))
(:require [clojure.string :as cstr]))

(definline dot-or-nil?
"Checks if given string is equal to \".\" or nil."
Expand Down Expand Up @@ -90,9 +89,9 @@
[^String gt]
(when-not (dot-or-nil? gt)
(->> gt
(re-seq #"([\||/])?(\d+)")
(map (fn [[_ phase allele]]
[(Integer/parseInt allele)
(re-seq #"([\||/])?(.|\d+)")
(map (fn [[_ phase ^String allele]]
[(when-not (dot-or-nil? allele) (Integer/parseInt allele))
(if phase
(= phase "|")
(neg? (.indexOf ^String gt "/")))])))))
Expand All @@ -101,7 +100,64 @@
"Stringifies genotype map into VCF-style GT string."
[gt-seq]
(when-not (or (nil? gt-seq) (empty? gt-seq))
(apply str (rest (mapcat (fn [[allele phase]] [(if phase "|" "/") allele]) gt-seq)))))
(->> gt-seq
(mapcat
(fn [[allele phase]]
[(if phase "|" "/") (or allele \.)]))
rest
(apply str))))

(defn genotype-seq
"Returns a sequence of genotypes represented as sequences of integers."
([^long ploidy ^long n-alt-alleles]
(genotype-seq ploidy n-alt-alleles []))
([^long ploidy ^long n-alt-alleles s]
(mapcat
(fn [i]
(if (= 1 ploidy)
[(cons i s)]
(genotype-seq (dec ploidy) i (cons i s))))
(range (inc n-alt-alleles)))))

(defn genotype-index
"Returns an index for given genotype."
^long [genotype]
{:pre [(seq genotype)
(every? (complement neg?) genotype)]}
(letfn [(combination ^long [^long n ^long k]
(cond
(< n k) 0
(= k n) 1
(or (= k 1) (= k (dec n))) n
(< (quot n 2) k) (recur n (- n k))
:else (loop [i (inc (- n k)), j 1, r 1]
(if (<= j k)
(recur (inc i) (inc j) (quot (* r i) j))
r))))]
(->> genotype
sort
(map-indexed (fn [^long m ^long k] (combination (+ k m) (inc m))))
(apply +))))

(defn biallelic-genotype
"Converts a multiallelic `genotype` string into a biallelic one. Ignores all
alleles other than the reference allele and the `target-allele`."
[genotype ^long target-allele]
(->> genotype
parse-genotype
(map (fn [[allele phased?]]
[(when allele (if (= target-allele allele) 1 0)) phased?]))
stringify-genotype))

(defn biallelic-coll
"Picks up elements in multiallelic `coll` and make a biallelic one. Ignores
all alleles other than the reference allele and the `target-allele`."
[^long ploidy ^long n-alt-alleles ^long target-allele coll]
(keep-indexed
(fn [i gt]
(when (every? (fn [^long a] (or (= a 0) (= a target-allele))) gt)
(nth coll i)))
(genotype-seq ploidy n-alt-alleles)))

(defn genotype->ints
"Convert genotype to a sequence of integers."
Expand Down Expand Up @@ -140,8 +196,8 @@
(->> formats
(map (fn [k] [k (sample-map k)]))
reverse
(drop-while (fn [[k v]] (or (nil? v) (= [nil] v))))
(map (fn [[k v]]
(drop-while (fn [[_ v]] (or (nil? v) (= [nil] v))))
(map (fn [[_ v]]
(cond
(sequential? v) (cstr/join "," (map (fn [i] (if (nil? i) "." i)) v))
(nil? v) "."
Expand Down
96 changes: 92 additions & 4 deletions test/cljam/io/vcf/util_test.clj
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
(ns cljam.io.vcf.util-test
(:require [clojure.test :refer :all]
[clojure.string :as cstr]
[cljam.test-common :refer :all]
[cljam.io.vcf.util :as vcf-util]))
(:require [clojure.test :refer :all]
[cljam.test-common :refer :all]
[cljam.io.vcf.util :as vcf-util]))

(deftest about-parse-info
(let [parse-info (vcf-util/info-parser [{:id "NS", :number 1, :type "Integer"}
Expand Down Expand Up @@ -74,9 +73,11 @@
"0/0" [[0 false] [0 false]]
"0/1" [[0 false] [1 false]]
"1/1" [[1 false] [1 false]]
"./." [[nil false] [nil false]]
"0|0" [[0 true] [0 true]]
"0|1" [[0 true] [1 true]]
"1|1" [[1 true] [1 true]]
".|." [[nil true] [nil true]]
"0/1/2" [[0 false] [1 false] [2 false]]
"0/1|2" [[0 false] [1 false] [2 true]]))

Expand All @@ -91,12 +92,99 @@
[[0 true] [0 true]] "0|0"
[[0 true] [1 true]] "0|1"
[[1 true] [1 true]] "1|1"
[[nil true] [nil true]] ".|."
[[0 false] [0 false]] "0/0"
[[0 false] [1 false]] "0/1"
[[1 false] [1 false]] "1/1"
[[nil false] [nil false]] "./."
[[0 false] [1 false] [2 false]] "0/1/2"
[[0 false] [1 false] [2 true]] "0/1|2"))

(deftest genotype-seq
(are [?ploidy ?n-alt-alleles ?expected]
(= ?expected (vcf-util/genotype-seq ?ploidy ?n-alt-alleles))
1 1 [[0] [1]]
1 2 [[0] [1] [2]]
2 1 [[0 0] [0 1] [1 1]]
2 2 [[0 0] [0 1] [1 1] [0 2] [1 2] [2 2]]
2 3 [[0 0] [0 1] [1 1] [0 2] [1 2] [2 2] [0 3] [1 3] [2 3] [3 3]]
3 1 [[0 0 0] [0 0 1] [0 1 1] [1 1 1]]
3 2 [[0 0 0] [0 0 1] [0 1 1] [1 1 1] [0 0 2]
[0 1 2] [1 1 2] [0 2 2] [1 2 2] [2 2 2]]))

(deftest genotype-index
(are [?genotype ?expected]
(= ?expected (vcf-util/genotype-index ?genotype))
[0] 0
[1] 1
[0 0] 0
[0 1] 1
[1 1] 2
[0 2] 3
[1 2] 4
[2 2] 5
[3 3] 9
[0 0 0] 0
[1 1 2] 6
[1 2 2] 8))

(deftest about-genotypes
(are [?ploidy ?n-alt-alleles]
(let [x (vcf-util/genotype-seq ?ploidy ?n-alt-alleles)]
(= (range (count x)) (map vcf-util/genotype-index x)))
1 0
1 1
1 2
1 3
2 0
2 1
2 2
2 3
2 4
3 0
3 1
3 2
3 3
3 4
4 0
4 1
4 2
4 3
4 4))

(deftest biallelic-genotype
(are [?genotype ?target-allele ?expected]
(= ?expected (vcf-util/biallelic-genotype ?genotype ?target-allele))
"0" 1 "0"
"1" 1 "1"
"2" 1 "0"
"0/0" 1 "0/0"
"0/1" 1 "0/1"
"0/2" 1 "0/0"
"1/2" 1 "1/0"
"2|2" 1 "0|0"
"0/1" 2 "0/0"
"0|2" 2 "0|1"
"1/2" 2 "0/1"
"0/1/2" 2 "0/0/1"
"2|3|4" 3 "0|1|0"
;; a call cannot be made
nil 0 nil
"./." 1 "./."
;; hemizygous
"./1" 1 "./1"
"./2" 2 "./1"
"./1" 2 "./0"))

(deftest biallelic-coll
(are [?ploidy ?n-alt-alleles ?target-allele ?coll ?expected]
(= ?expected
(vcf-util/biallelic-coll ?ploidy ?n-alt-alleles ?target-allele ?coll))
2 1 1 [10 20 30] [10 20 30]
2 2 1 [10 20 30 40 50 60] [10 20 30]
2 2 2 [10 20 30 40 50 60] [10 40 60]
3 2 2 [1 2 3 4 5 6 7 8 9 10] [1 5 8 10]))

(deftest about-parse-variant-v4_3
(let [parse-variant (vcf-util/variant-parser test-vcf-v4_3-meta-info test-vcf-v4_3-header)]
(are [?variant ?expected]
Expand Down

0 comments on commit f4ab045

Please sign in to comment.