Skip to content

Commit

Permalink
Add support for breakends in VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
alumi committed May 20, 2019
1 parent f4ab045 commit 05543f4
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 1 deletion.
54 changes: 53 additions & 1 deletion src/cljam/io/vcf/util.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.vcf.util
(:require [clojure.string :as cstr]))
(:require [clojure.string :as cstr]
[proton.core :as p]))

(definline dot-or-nil?
"Checks if given string is equal to \".\" or nil."
Expand Down Expand Up @@ -233,3 +234,54 @@
(update :info stringify-info)
(update fmt-kw stringify-format)
(merge (into {} (for [k sample-kws] [k (stringify-sample (v fmt-kw) (v k))])))))))

(def ^:private long-breakend-regexp
;; pre-seq [ or ] chr pos [ or ] post-seq
#"((?:[ACGTN]*|\.))([\[\]])(.+?)(?::(\d+))([\[\]])((?:[ACGTN]*|\.))")

(def ^:private short-breakend-regexp
#"(\.?)([ATGCN]+)(\.?)")

(defn parse-breakend
"Parses an ALT allele string of SVTYPE=BND.
Returns a map with mandatory keys:
- `:bases` bases that replaces the reference place
- `:join` `:before` or `:after`
and optional keys for a mate:
- `:chr` chromosome name of the mate sequence
- `:pos` genomic position of the mate sequence
- `:strand` strand of the mate sequence
Returns `nil` if input `alt` is not a breakend."
[alt]
(if-let [[_ pre-seq [left-bracket] chr pos [right-bracket] post-seq]
(re-matches long-breakend-regexp alt)]
(let [pre (not-empty pre-seq)
post (not-empty post-seq)]
(when (and (= left-bracket right-bracket)
(or pre post)
(not (and pre post)))
(when-let [p (p/as-long pos)]
{:chr chr,
:pos p,
:join (if post :before :after),
:bases (or pre post),
:strand (if (= left-bracket \])
(if post :forward :reverse)
(if post :reverse :forward))})))
(when-let [[_ [pre-dot] bases [post-dot]]
(re-matches short-breakend-regexp alt)]
(when (and (or pre-dot post-dot)
(not (and pre-dot post-dot)))
{:bases bases, :join (if pre-dot :before :after)}))))

(defn stringify-breakend
"Returns a string representation of a breakend."
[{:keys [chr pos strand join] s :bases}]
(when (and (not-empty s) (#{:before :after} join))
(if (and chr pos (#{:forward :reverse} strand))
(let [before? (= join :before)
bracket (if (= strand :forward)
(if before? \] \[)
(if before? \[ \]))]
(str (when-not before? s) bracket chr \: pos bracket (when before? s)))
(str (when (= :before join) \.) s (when (= :after join) \.)))))
95 changes: 95 additions & 0 deletions test/cljam/io/vcf/util_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,98 @@
;; trailing fields can be dropped. Result differs but it's OK.
;; (nth test-vcf-v4_0-variants 10) (nth test-vcf-v4_0-variants-deep 10)
(nth test-vcf-v4_0-variants 11) (nth test-vcf-v4_0-variants-deep 11))))

(deftest parse-breakend
(are [?alt ?expected]
(= ?expected (vcf-util/parse-breakend ?alt))

"]13:123456]T" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "T"}
"]13:123456]AGTNNNNNCAT" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "AGTNNNNNCAT"}
"AGTNNNNNCA[2:321682[" {:chr "2", :pos 321682, :strand :forward,
:join :after, :bases "AGTNNNNNCA"}
"C[<ctg1>:1[" {:chr "<ctg1>", :pos 1, :strand :forward,
:join :after, :bases "C"}
"[13:123457[C" {:chr "13", :pos 123457, :strand :reverse,
:join :before, :bases "C"}
"G]17:198982]" {:chr "17", :pos 198982, :strand :reverse,
:join :after, :bases "G"}
".[13:123457[" {:chr "13", :pos 123457, :strand :forward,
:join :after, :bases "."}
"]2:321681]A" {:chr "2", :pos 321681, :strand :forward,
:join :before, :bases "A"}
"C[1:1[" {:chr "1", :pos 1, :strand :forward,
:join :after, :bases "C"}
"]1:0]A" {:chr "1", :pos 0, :strand :forward,
:join :before, :bases "A"}
"C[<ct[g1:123>:1[" {:chr "<ct[g1:123>", :pos 1, :strand :forward,
:join :after, :bases "C"}

".A" {:join :before, :bases "A"}
".TGCA" {:join :before, :bases "TGCA"}
"G." {:join :after, :bases "G"}
"TCC." {:join :after, :bases "TCC"}

"G" nil
"ATT" nil
"[1:12T" nil
"[1:2[T[" nil
"[1:[T" nil
"[:1[T" nil
"[:[T" nil
"[1:1e[T" nil
"[1:2]T[" nil
"]1:1]" nil
"]1:1[T" nil
"]1:1T" nil
"]12]T" nil ;; possibly not malformed
"N[<[>[>[" nil ;; possibly not malformed
"." nil
"*" nil
".G." nil
"..G" nil
"<>" nil
"C<ctg1>" nil ;; INS
"<ctg1>C" nil ;; INS
"<DUP>" nil
"<INV>" nil))

(deftest stringify-breakend
(are [?expected ?bnd]
(= ?expected (vcf-util/stringify-breakend ?bnd))
"]13:123456]T" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "T"}
"]13:123456]AGTNNNNNCAT" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "AGTNNNNNCAT"}
"AGTNNNNNCA[2:321682[" {:chr "2", :pos 321682, :strand :forward,
:join :after, :bases "AGTNNNNNCA"}
"C[<ctg1>:1[" {:chr "<ctg1>", :pos 1, :strand :forward,
:join :after, :bases "C"}
"[13:123457[C" {:chr "13", :pos 123457, :strand :reverse,
:join :before, :bases "C"}
"G]17:198982]" {:chr "17", :pos 198982, :strand :reverse,
:join :after, :bases "G"}
".[13:123457[" {:chr "13", :pos 123457, :strand :forward,
:join :after, :bases "."}
"]2:321681]A" {:chr "2", :pos 321681, :strand :forward,
:join :before, :bases "A"}
"C[1:1[" {:chr "1", :pos 1, :strand :forward,
:join :after, :bases "C"}
"]1:0]A" {:chr "1", :pos 0, :strand :forward,
:join :before, :bases "A"}
"C[<ct[g1:123>:1[" {:chr "<ct[g1:123>", :pos 1, :strand :forward,
:join :after, :bases "C"}

".A" {:join :before, :bases "A"}
".TGCA" {:join :before, :bases "TGCA"}
"G." {:join :after, :bases "G"}
"TCC." {:join :after, :bases "TCC"}

nil nil
nil {}
nil {:bases "A"}

".A" {:bases "A", :join :before, :chr "1", :pos 1}
".A" {:bases "A", :join :before, :chr "1", :strand :forward}
".A" {:bases "A", :join :before, :pos 1, :strand :forward}))

0 comments on commit 05543f4

Please sign in to comment.