Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for breakends in VCF #163

Merged
merged 4 commits into from
May 22, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion src/cljam/io/vcf/util.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.vcf.util
(:require [clojure.string :as cstr]))
(:require [clojure.string :as cstr]
[proton.core :as p]))

(definline dot-or-nil?
"Checks if given string is equal to \".\" or nil."
Expand Down Expand Up @@ -233,3 +234,54 @@
(update :info stringify-info)
(update fmt-kw stringify-format)
(merge (into {} (for [k sample-kws] [k (stringify-sample (v fmt-kw) (v k))])))))))

(def ^:private long-breakend-regexp
;; pre-seq [ or ] chr pos [ or ] post-seq
#"((?:[ACGTN]*|\.))([\[\]])(.+?)(?::(\d+))([\[\]])((?:[ACGTN]*|\.))")
r6eve marked this conversation as resolved.
Show resolved Hide resolved

(def ^:private short-breakend-regexp
#"(\.?)([ATGCN]+)(\.?)")

(defn parse-breakend
"Parses an ALT allele string of SVTYPE=BND.
Returns a map with mandatory keys:
- `:bases` bases that replaces the reference place
- `:join` `:before` or `:after`
and optional keys for a mate:
- `:chr` chromosome name of the mate sequence
- `:pos` genomic position of the mate sequence
- `:strand` strand of the mate sequence
athos marked this conversation as resolved.
Show resolved Hide resolved
Returns `nil` if input `alt` is not a breakend."
[alt]
(if-let [[_ pre-seq [left-bracket] chr pos [right-bracket] post-seq]
(re-matches long-breakend-regexp alt)]
(let [pre (not-empty pre-seq)
post (not-empty post-seq)]
(when (and (= left-bracket right-bracket)
(or pre post)
(not (and pre post)))
(when-let [p (p/as-long pos)]
{:chr chr,
:pos p,
:join (if post :before :after),
:bases (or pre post),
:strand (if (= left-bracket \])
(if post :forward :reverse)
(if post :reverse :forward))})))
(when-let [[_ [pre-dot] bases [post-dot]]
(re-matches short-breakend-regexp alt)]
(when (and (or pre-dot post-dot)
(not (and pre-dot post-dot)))
{:bases bases, :join (if pre-dot :before :after)}))))

(defn stringify-breakend
"Returns a string representation of a breakend."
athos marked this conversation as resolved.
Show resolved Hide resolved
[{:keys [chr pos strand join] s :bases}]
(when (and (not-empty s) (#{:before :after} join))
(if (and chr pos (#{:forward :reverse} strand))
(let [before? (= join :before)
bracket (if (= strand :forward)
(if before? \] \[)
(if before? \[ \]))]
(str (when-not before? s) bracket chr \: pos bracket (when before? s)))
(str (when (= :before join) \.) s (when (= :after join) \.)))))
95 changes: 95 additions & 0 deletions test/cljam/io/vcf/util_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,98 @@
;; trailing fields can be dropped. Result differs but it's OK.
;; (nth test-vcf-v4_0-variants 10) (nth test-vcf-v4_0-variants-deep 10)
(nth test-vcf-v4_0-variants 11) (nth test-vcf-v4_0-variants-deep 11))))

(deftest parse-breakend
(are [?alt ?expected]
(= ?expected (vcf-util/parse-breakend ?alt))

"]13:123456]T" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "T"}
"]13:123456]AGTNNNNNCAT" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "AGTNNNNNCAT"}
"AGTNNNNNCA[2:321682[" {:chr "2", :pos 321682, :strand :forward,
:join :after, :bases "AGTNNNNNCA"}
"C[<ctg1>:1[" {:chr "<ctg1>", :pos 1, :strand :forward,
:join :after, :bases "C"}
"[13:123457[C" {:chr "13", :pos 123457, :strand :reverse,
:join :before, :bases "C"}
"G]17:198982]" {:chr "17", :pos 198982, :strand :reverse,
:join :after, :bases "G"}
".[13:123457[" {:chr "13", :pos 123457, :strand :forward,
:join :after, :bases "."}
"]2:321681]A" {:chr "2", :pos 321681, :strand :forward,
:join :before, :bases "A"}
"C[1:1[" {:chr "1", :pos 1, :strand :forward,
:join :after, :bases "C"}
"]1:0]A" {:chr "1", :pos 0, :strand :forward,
:join :before, :bases "A"}
"C[<ct[g1:123>:1[" {:chr "<ct[g1:123>", :pos 1, :strand :forward,
:join :after, :bases "C"}

".A" {:join :before, :bases "A"}
".TGCA" {:join :before, :bases "TGCA"}
"G." {:join :after, :bases "G"}
"TCC." {:join :after, :bases "TCC"}

"G" nil
"ATT" nil
"[1:12T" nil
"[1:2[T[" nil
"[1:[T" nil
"[:1[T" nil
"[:[T" nil
"[1:1e[T" nil
"[1:2]T[" nil
"]1:1]" nil
"]1:1[T" nil
"]1:1T" nil
"]12]T" nil ;; possibly not malformed
"N[<[>[>[" nil ;; possibly not malformed
"." nil
"*" nil
".G." nil
"..G" nil
"<>" nil
"C<ctg1>" nil ;; INS
"<ctg1>C" nil ;; INS
"<DUP>" nil
"<INV>" nil))

(deftest stringify-breakend
(are [?expected ?bnd]
(= ?expected (vcf-util/stringify-breakend ?bnd))
"]13:123456]T" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "T"}
"]13:123456]AGTNNNNNCAT" {:chr "13", :pos 123456, :strand :forward,
:join :before, :bases "AGTNNNNNCAT"}
"AGTNNNNNCA[2:321682[" {:chr "2", :pos 321682, :strand :forward,
:join :after, :bases "AGTNNNNNCA"}
"C[<ctg1>:1[" {:chr "<ctg1>", :pos 1, :strand :forward,
:join :after, :bases "C"}
"[13:123457[C" {:chr "13", :pos 123457, :strand :reverse,
:join :before, :bases "C"}
"G]17:198982]" {:chr "17", :pos 198982, :strand :reverse,
:join :after, :bases "G"}
".[13:123457[" {:chr "13", :pos 123457, :strand :forward,
:join :after, :bases "."}
"]2:321681]A" {:chr "2", :pos 321681, :strand :forward,
:join :before, :bases "A"}
"C[1:1[" {:chr "1", :pos 1, :strand :forward,
:join :after, :bases "C"}
"]1:0]A" {:chr "1", :pos 0, :strand :forward,
:join :before, :bases "A"}
"C[<ct[g1:123>:1[" {:chr "<ct[g1:123>", :pos 1, :strand :forward,
:join :after, :bases "C"}

".A" {:join :before, :bases "A"}
".TGCA" {:join :before, :bases "TGCA"}
"G." {:join :after, :bases "G"}
"TCC." {:join :after, :bases "TCC"}

nil nil
nil {}
nil {:bases "A"}

".A" {:bases "A", :join :before, :chr "1", :pos 1}
".A" {:bases "A", :join :before, :chr "1", :strand :forward}
".A" {:bases "A", :join :before, :pos 1, :strand :forward}))