Skip to content

Commit

Permalink
Merge pull request #164 from chrovis/feature/inspect-allele
Browse files Browse the repository at this point in the history
Add a function to inspect an ALT allele of VCF
  • Loading branch information
r6eve authored May 24, 2019
2 parents 7b5d627 + dc63967 commit 55353b7
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 0 deletions.
85 changes: 85 additions & 0 deletions src/cljam/io/vcf/util.clj
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,88 @@
(if before? \[ \]))]
(str (when-not before? s) bracket chr \: pos bracket (when before? s)))
(str (when (= :before join) \.) s (when (= :after join) \.)))))

(defn- inspect-nucleotides-allele
  "Classifies `alt` against `ref` when both are plain nucleotide strings.
  Bases are compared case-insensitively, but every base or substring placed
  in the result is taken from the original, non-upper-cased input.
  Returns a map whose `:type` is one of `:ref`, `:snv`, `:mnv`, `:deletion`,
  `:insertion` or `:complex`. All but `:ref` and `:complex` also carry an
  `:offset` and the affected bases."
  [ref alt]
  (letfn [(shared-prefix-length [xs ys]
            ;; Number of leading positions at which xs and ys agree.
            (count (take-while true? (map = xs ys))))]
    (let [ref-len (count ref)
          alt-len (count alt)
          ref-uc (cstr/upper-case ref)
          alt-uc (cstr/upper-case alt)
          head (long (shared-prefix-length ref-uc alt-uc))
          ;; The common suffix is measured on the reversed strings and then
          ;; clamped so that it never overlaps the common prefix within the
          ;; shorter allele.
          tail (min (long (shared-prefix-length (cstr/reverse ref-uc)
                                                (cstr/reverse alt-uc)))
                    (- (min ref-len alt-len) head))
          matched (+ head tail)
          ;; Exclusive end indices of the differing middle part.
          ref-end (- ref-len tail)
          alt-end (- alt-len tail)]
      (if (= ref-len alt-len)
        ;; Same length: identical, a single substitution, or several.
        (cond
          (= head ref-len)
          {:type :ref}

          (= (inc matched) ref-len)
          {:type :snv,
           :ref (nth ref head),
           :alt (nth alt head),
           :offset head}

          :else
          {:type :mnv,
           :offset head,
           :ref (subs ref head ref-end),
           :alt (subs alt head alt-end)})
        ;; Different length: a pure indel only if prefix + suffix account
        ;; for the whole of the shorter allele.
        (cond
          (= matched alt-len)
          {:type :deletion,
           :offset (dec head),
           :n-bases (- ref-end head),
           :deleted (subs ref head ref-end)}

          (= matched ref-len)
          {:type :insertion,
           :offset (dec head),
           :n-bases (- alt-end head),
           :inserted (subs alt head alt-end)}

          :else
          {:type :complex})))))

(defn inspect-allele
  "Inspects an `alt` allele by comparing it to a `ref` allele string and
  returns a map containing `:type` plus type-specific details.
  `ref` must consist solely of A/C/G/T/N (case-insensitive); otherwise the
  result is `{:type :other}`. A nil or empty `alt` is treated like `.`.
  The value of `:type` is one of the following:
  - `:no-call`            No variant called
  - `:spanning-deletion`  Placeholder to signify an absent sequence
  - `:unspecified`        Unspecified non-ref allele
  - `:ref`                Duplicated allele of REF
  - `:id`                 Symbolic reference
  - `:snv`                Single nucleotide variant
  - `:mnv`                Multiple nucleotide variants
  - `:insertion`          Insertion of a short base sequence
  - `:deletion`           Deletion of a short base sequence
  - `:complete-insertion` Complete insertion of a long sequence
  - `:breakend`           Breakend of a complex rearrangement
  - `:complex`            Complex nucleotide variants other than snv/mnv/indel
  - `:other`              Can't categorize the allele, might be malformed"
  [ref alt]
  (let [uncategorized {:type :other}]
    (if-not (re-matches #"(?i)[ACGTN]+" (or ref ""))
      uncategorized
      (let [allele (or (not-empty alt) ".")
            ;; Ordered dispatch table: the first pattern that matches the
            ;; whole allele wins, and its handler receives the match result.
            rules [[#"\." (fn [_] {:type :no-call})]
                   [#"\*" (fn [_] {:type :spanning-deletion})]
                   [#"(X|<\*>|<X>)" (fn [_] {:type :unspecified})]
                   [#"<(.+)>" (fn [[_ id]] {:type :id, :id id})]
                   [#"(?i)([ACGTN])<(.+)>"
                    (fn [[_ [base] id]]
                      {:type :complete-insertion,
                       :join :after, :base base, :id id})]
                   [#"(?i)<(.+)>([ACGTN])"
                    (fn [[_ id [base]]]
                      {:type :complete-insertion,
                       :join :before, :base base, :id id})]
                   [#"(?i)[ACGTN]+"
                    (fn [_] (inspect-nucleotides-allele ref alt))]]
            try-rule (fn [[pattern handler]]
                       (when-let [hit (re-matches pattern allele)]
                         (handler hit)))]
        (or (some try-rule rules)
            ;; Nothing matched: last resort is the breakend notation.
            (some-> (parse-breakend alt) (assoc :type :breakend))
            uncategorized)))))
82 changes: 82 additions & 0 deletions test/cljam/io/vcf/util_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,85 @@
".A" {:bases "A", :join :before, :chr "1", :pos 1}
".A" {:bases "A", :join :before, :chr "1", :strand :forward}
".A" {:bases "A", :join :before, :pos 1, :strand :forward}))

(deftest inspect-allele
;; Table-driven check of `vcf-util/inspect-allele`: every row below is a
;; REF string, an ALT string and the exact map expected as the result.
(are [?ref ?alt ?expected]
(= ?expected (vcf-util/inspect-allele ?ref ?alt))

;; Missing ALT: "", "." and nil are all expected to yield :no-call.
"A" "" {:type :no-call} ;; malformed
"A" "." {:type :no-call}
"A" nil {:type :no-call}

;; "*" alone is the spanning-deletion placeholder.
"A" "*" {:type :spanning-deletion}

;; The three spellings recognised as an unspecified non-ref allele.
"A" "X" {:type :unspecified}
"A" "<*>" {:type :unspecified}
"A" "<X>" {:type :unspecified}

;; ALT equal to REF; the lower-case rows show the comparison ignores case.
"A" "A" {:type :ref}
"A" "a" {:type :ref}
"AA" "AA" {:type :ref}
"AA" "Aa" {:type :ref}

;; Any other <...> symbol is reported as :id with the bracketed text.
"A" "<DUP>" {:type :id, :id "DUP"}
"A" "<INV>" {:type :id, :id "INV"}
"A" "<NON_REF>" {:type :id, :id "NON_REF"} ;; not :unspecified

;; Single-base substitutions; :offset is the 0-based index within REF and
;; the reported bases keep the case of the input (see the \c row).
"A" "T" {:type :snv, :ref \A, :alt \T, :offset 0}
"TC" "TG" {:type :snv, :ref \C, :alt \G, :offset 1}
"TC" "GC" {:type :snv, :ref \T, :alt \G, :offset 0}
"TCA" "TAA" {:type :snv, :ref \C, :alt \A, :offset 1}
"TCAG" "tcAC" {:type :snv, :ref \G, :alt \C, :offset 3}
"TGTAT" "TGcAT" {:type :snv, :ref \T, :alt \c, :offset 2}

;; Same-length alleles differing in more than one position; :ref/:alt hold
;; the span between the common prefix and the common suffix.
"TG" "Gc" {:type :mnv, :ref "TG", :alt "Gc", :offset 0}
"TCG" "tGA" {:type :mnv, :ref "CG", :alt "GA", :offset 1}
"TGCA" "TCCC" {:type :mnv, :ref "GCA", :alt "CCC", :offset 1} ;; two snvs?
"TGGTA" "TCcAA" {:type :mnv, :ref "GGT", :alt "CcA", :offset 1}

;; Deletions; :offset is the index of the last base of the common prefix.
;; In the rows marked ambiguous the deleted bases could be placed at more
;; than one position; the expected value is the one that follows the
;; longest common prefix.
"TC" "T" {:type :deletion, :n-bases 1, :offset 0, :deleted "C"}
"TTC" "TC" {:type :deletion, :n-bases 1, :offset 0, :deleted "T"}
"GTC" "G" {:type :deletion, :n-bases 2, :offset 0, :deleted "TC"}
"TCG" "TG" {:type :deletion, :n-bases 1, :offset 0, :deleted "C"}
"TGCA" "TGC" {:type :deletion, :n-bases 1, :offset 2, :deleted "A"}
;; ambiguous
"TCGCG" "TCG" {:type :deletion, :n-bases 2, :offset 2, :deleted "CG"}
;; ambiguous
"TAAACCCTAAA" "TAA" {:type :deletion, :n-bases 8,
:offset 2, :deleted "ACCCTAAA"}

;; Insertions, mirroring the deletion rows above (same :offset convention,
;; same handling of ambiguous placements).
"C" "CTAG" {:type :insertion, :n-bases 3, :offset 0, :inserted "TAG"}
"TC" "TTC" {:type :insertion, :n-bases 1, :offset 0, :inserted "T"}
"GTC" "GTCT" {:type :insertion, :n-bases 1, :offset 2, :inserted "T"}
"TCG" "TCAG" {:type :insertion, :n-bases 1, :offset 1, :inserted "A"}
;; ambiguous
"GCG" "GCGCG" {:type :insertion, :n-bases 2, :offset 2, :inserted "CG"}
;; ambiguous
"ATTTTTTTTTTTTT" "ATTTTTTTTTTTTTTT" {:type :insertion, :n-bases 2,
:offset 13, :inserted "TT"}

;; A single base adjacent to a <...> symbol; :join tells on which side of
;; the symbol the base sits, and :base keeps the input case.
"T" "<ctg1>C" {:type :complete-insertion, :join :before,
:base \C, :id "ctg1"}
"T" "c<ctg1>" {:type :complete-insertion, :join :after,
:base \c, :id "ctg1"}

;; Breakend notations; the expected map is a breakend description with
;; :type :breakend added.
"G" "G." {:type :breakend, :join :after, :bases "G"}
"A" ".A" {:type :breakend, :join :before, :bases "A"}
"T" "TCC." {:type :breakend, :join :after, :bases "TCC"}
"A" ".TGCA" {:type :breakend, :join :before, :bases "TGCA"}
"T" "]13:123456]T" {:type :breakend, :join :before,
:strand :forward, :chr "13", :pos 123456, :bases "T"}

;; Inputs that fit no category: unparsable ALT strings, and any REF that is
;; nil, empty or contains characters other than A/C/G/T/N.
"T" "<ctg1>CT" {:type :other} ;; malformed
"T" "<CT" {:type :other} ;; malformed
"T" "C,A" {:type :other}
"." "A" {:type :other} ;; malformed
"" "A" {:type :other} ;; malformed
nil "A" {:type :other} ;; malformed
nil nil {:type :other} ;; malformed
"R" "A" {:type :other} ;; malformed

;; Nucleotide pairs that are neither a pure substitution nor a pure indel.
"TAC" "GC" {:type :complex}
"TG" "TAC" {:type :complex}
"TCA" "GGGG" {:type :complex}
"AAAA" "CAC" {:type :complex}))

0 comments on commit 55353b7

Please sign in to comment.