Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
SymbolRange, cyclic arithmetic, and more informative show method. (#…
Browse files Browse the repository at this point in the history
…190)

* add SymbolRange type

* make symbol arithmetic cyclic

* make show(nucleotide|amino acid) more informative

* fix seq.md
  • Loading branch information
bicycle1885 committed May 20, 2016
1 parent bc2fe43 commit 1688351
Show file tree
Hide file tree
Showing 8 changed files with 274 additions and 49 deletions.
96 changes: 71 additions & 25 deletions docs/src/man/seq.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,16 @@ Symbols are accessible as constants with `DNA_` or `RNA_` prefix:

```jlcon
julia> DNA_A
A
DNA_A
julia> DNA_T
T
DNA_T
julia> RNA_U
U
RNA_U
julia> DNA_Gap
-
DNA_Gap
julia> typeof(DNA_A)
Bio.Seq.DNANucleotide
Expand All @@ -86,7 +86,7 @@ Symbols can be constructed by converting regular characters:

```jlcon
julia> convert(DNANucleotide, 'C')
C
DNA_C
julia> convert(DNANucleotide, 'C') === DNA_C
true
Expand Down Expand Up @@ -134,13 +134,13 @@ Set of amino acid symbols also covers IUPAC amino acid symbols plus a gap symbol
Symbols are accessible as constants with `AA_` prefix:
```jlcon
julia> AA_A
A
AA_A
julia> AA_Q
Q
AA_Q
julia> AA_Term
*
AA_Term
julia> typeof(AA_A)
Bio.Seq.AminoAcid
Expand All @@ -150,7 +150,7 @@ Bio.Seq.AminoAcid
Symbols can be constructed by converting regular characters:
```jlcon
julia> convert(AminoAcid, 'A')
A
AA_A
julia> convert(AminoAcid, 'P') === AA_P
true
Expand All @@ -169,26 +169,72 @@ julia> DNA_A < DNA_C < DNA_G < DNA_T # order
true
julia> DNA_A + 1 # addition
C
DNA_C
julia> DNA_T - 3 # subtraction
A
DNA_A
julia> DNA_T - DNA_C # difference
2
```

Note that these operations do **not** check bounds of valid range:
Note that these operations are cyclic:
```jlcon
julia> DNA_C + 15
DNA_A
julia> DNA_A - 1
DNA_Gap
```


### Symbol Ranges

Consecutive symbol sets can be created using a colon like integer ranges:
```jlcon
julia> DNA_A:DNA_T # unambiguous DNA nucleotides (A, C, G, T)
DNA_A:DNA_T
julia> DNA_A:DNA_N # all DNA nucleotides except gap
DNA_A:DNA_N
julia> DNA_A:DNA_Gap # all DNA nucleotides including gap
DNA_A:DNA_Gap
julia> AA_A:AA_V # standard amino acids
AA_A:AA_V
julia> AA_A:AA_U # standard amino acids + pyrrolysine (O) + selenocysteine (U)
AA_A:AA_U
julia> AA_A:AA_X # all amino acids except teminal codon (*) and gap
AA_A:AA_X
julia> AA_A:AA_Gap # all amino acids including teminal codon (*) and gap
AA_A:AA_Gap
```

Most range operations are supported, especially the iterator interface and the
membership operator (`in` or ``) will be useful in many situations:
```jlcon
julia> DNA_C - 2
Invalid DNA Nucleotide
julia> for nt in DNA_A:DNA_T; println(nt); end
A
C
G
T
julia> DNA_C in DNA_A:DNA_T # DNA_C is in the range of unambiguous DNA nucleotides
true
julia> DNA_A + 30
Invalid DNA Nucleotide
julia> DNA_N in DNA_A:DNA_T # DNA_N is not in it
false
```


### Other functions

```@docs
Expand Down Expand Up @@ -282,13 +328,13 @@ julia> convert(ASCIIString, dna"TTANGTA")
julia> convert(Vector{DNANucleotide}, dna"TTANGTA")
7-element Array{Bio.Seq.DNANucleotide,1}:
T
T
A
N
G
T
A
DNA_T
DNA_T
DNA_A
DNA_N
DNA_G
DNA_T
DNA_A
```

Expand Down Expand Up @@ -342,7 +388,7 @@ julia> seq = dna"ACGTTTANAGTNNAGTACC"
ACGTTTANAGTNNAGTACC
julia> seq[5]
T
DNA_T
julia> seq[6:end]
14nt DNA Sequence:
Expand All @@ -365,7 +411,7 @@ julia> subseq = seq[1:2] # create a subsequence from `seq`
AA
julia> subseq[2] = DNA_T # modify the second element of it
T
DNA_T
julia> subseq # the subsequence is modified
2nt DNA Sequence:
Expand Down
1 change: 1 addition & 0 deletions src/seq/Seq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ function gap end

gap(::Type{Char}) = '-'

include("symbolrange.jl")
include("nucleotide.jl")
include("aminoacid.jl")
include("alphabet.jl")
Expand Down
28 changes: 23 additions & 5 deletions src/seq/aminoacid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ Base.convert{T<:Number}(::Type{AminoAcid}, aa::T) = convert(AminoAcid, UInt8(aa)
# like iteration, sort, comparison, and so on.
@compat begin
Base.:-(x::AminoAcid, y::AminoAcid) = Int(x) - Int(y)
Base.:-(x::AminoAcid, y::Integer) = reinterpret(AminoAcid, UInt8(x) - UInt8(y))
Base.:+(x::AminoAcid, y::Integer) = reinterpret(AminoAcid, UInt8(x) + UInt8(y))
# 0x1c is the size of the amino acid alphabet
Base.:-(x::AminoAcid, y::Integer) = x + mod(-y, 0x1c)
Base.:+(x::AminoAcid, y::Integer) = reinterpret(AminoAcid, mod((UInt8(x) + y) % UInt8, 0x1c))
Base.:+(x::Integer, y::AminoAcid) = y + x
end
Base.isless(x::AminoAcid, y::AminoAcid) = isless(UInt8(x), UInt8(y))
Expand Down Expand Up @@ -107,6 +108,8 @@ char_to_aa[Int('-') + 1] = AA_Gap
aa_to_char[0x1b+1] = '-'
compatbits_aa[0x1b+1] = 0

Base.colon(start::AminoAcid, stop::AminoAcid) = SymbolRange(start, stop)

Base.isvalid(::Type{AminoAcid}, x::Integer) = 0 x 0x1b
Base.isvalid(aa::AminoAcid) = aa AA_Gap
isambiguous(aa::AminoAcid) = AA_B aa AA_X
Expand All @@ -132,11 +135,26 @@ Base.convert(::Type{Char}, aa::AminoAcid) = aa_to_char[convert(UInt8, aa) + 1]
# ---------------

function Base.show(io::IO, aa::AminoAcid)
if aa == AA_INVALID
write(io, "Invalid Amino Acid")
if isvalid(aa)
if aa == AA_Term
write(io, "AA_Term")
elseif aa == AA_Gap
write(io, "AA_Gap")
else
write(io, "AA_", Char(aa))
end
else
write(io, Char(aa))
write(io, "Invalid Amino Acid")
end
return
end

function Base.print(io::IO, aa::AminoAcid)
if !isvalid(aa)
throw(ArgumentError("invalid amino acid"))
end
write(io, Char(aa))
return
end

# lookup table of 20 standard amino acids
Expand Down
2 changes: 1 addition & 1 deletion src/seq/fasta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ function Base.write{T}(io::IO, seqrec::SeqRecord{T,FASTAMetadata})
counter = 1
len = length(seqrec.seq)
for nt in seqrec.seq
show(io, nt)
print(io, nt)
if counter % maxchars == 0 && counter < len
write(io, "\n")
end
Expand Down
4 changes: 2 additions & 2 deletions src/seq/fastq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Show a `FASTQSeqRecord` to `io`, with graphical display of quality scores.
function Base.show(io::IO, seqrec::FASTQSeqRecord)
write(io, "@", seqrec.name, " ", seqrec.metadata.description, "\n")
for c in seqrec.seq
show(io, c)
print(io, c)
end
write(io, '\n')
# print quality scores as a unicode bar chart
Expand Down Expand Up @@ -92,7 +92,7 @@ function Base.write(io::IO, seqrec::FASTQSeqRecord;
write(io, "\n")

for c in seqrec.seq
show(io, c)
print(io, c)
end
write(io, "\n")

Expand Down
38 changes: 30 additions & 8 deletions src/seq/nucleotide.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ Base.convert{T<:Number,S<:Nucleotide}(::Type{S}, nt::T) = convert(S, UInt8(nt))
# like iteration, sort, comparison, and so on.
@compat begin
Base.:-{N<:Nucleotide}(x::N, y::N) = Int(x) - Int(y)
Base.:-{N<:Nucleotide}(x::N, y::Integer) = reinterpret(N, UInt8(x) - UInt8(y))
Base.:+{N<:Nucleotide}(x::N, y::Integer) = reinterpret(N, UInt8(x) + UInt8(y))
Base.:-{N<:Nucleotide}(x::N, y::Integer) = x + (-y)
Base.:+{N<:Nucleotide}(x::N, y::Integer) = reinterpret(N, (UInt8(x) + y % UInt8) & 0b1111)
Base.:+{N<:Nucleotide}(x::Integer, y::N) = y + x
end
Base.isless{N<:Nucleotide}(x::N, y::N) = isless(UInt8(x), UInt8(y))
Expand Down Expand Up @@ -90,6 +90,8 @@ compatbits_nuc[0b1111 + 1] = 0b0000
nnucleotide(::Type{DNANucleotide}) = DNA_N
invalid_nucleotide(::Type{DNANucleotide}) = DNA_INVALID

Base.colon(start::DNANucleotide, stop::DNANucleotide) = SymbolRange(start, stop)

Base.isvalid(::Type{DNANucleotide}, x::Integer) = 0 x < 16
Base.isvalid(nt::DNANucleotide) = nt DNA_Gap
isambiguous(nt::DNANucleotide) = nt > DNA_T
Expand Down Expand Up @@ -138,6 +140,8 @@ rna_to_char[0b1111 + 1] = '-'
nnucleotide(::Type{RNANucleotide}) = RNA_N
invalid_nucleotide(::Type{RNANucleotide}) = RNA_INVALID

Base.colon(start::RNANucleotide, stop::RNANucleotide) = SymbolRange(start, stop)

Base.isvalid(::Type{RNANucleotide}, x::Integer) = 0 x < 16
Base.isvalid(nt::RNANucleotide) = nt RNA_Gap
isambiguous(nt::RNANucleotide) = nt > RNA_U
Expand Down Expand Up @@ -209,17 +213,35 @@ Base.convert(::Type{Char}, nt::RNANucleotide) = rna_to_char[convert(UInt8, nt) +
# ---------------

function Base.show(io::IO, nt::DNANucleotide)
if !isvalid(nt)
write(io, "Invalid DNA Nucleotide")
if isvalid(nt)
if nt == DNA_Gap
write(io, "DNA_Gap")
else
write(io, "DNA_", Char(nt))
end
else
write(io, Char(nt))
write(io, "Invalid DNA Nucleotide")
end
return
end

function Base.show(io::IO, nt::RNANucleotide)
if !isvalid(nt)
write(io, "Invalid RNA Nucleotide")
if isvalid(nt)
if nt == RNA_Gap
write(io, "RNA_Gap")
else
write(io, "RNA_", Char(nt))
end
else
write(io, Char(nt))
write(io, "Invalid RNA Nucleotide")
end
return
end

function Base.print(io::IO, nt::Nucleotide)
if !isvalid(nt)
throw(ArgumentError("nucleotide is invalid"))
end
write(io, Char(nt))
return
end
66 changes: 66 additions & 0 deletions src/seq/symbolrange.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SymbolRange
# ===========
#
# Range type of biological symbols.
#
# This file is a part of BioJulia.
# License is MIT: https://github.com/BioJulia/Bio.jl/blob/master/LICENSE.md

"""
Range type of biological symbols (especially `DNANucleotide`, `RNANucleotide`,
and `AminoAcid`).
"""
immutable SymbolRange{T} <: Range{T}
start::UInt8
stop::UInt8

function SymbolRange(start::T, stop::T)
if start > stop
# normalize empty range
start = convert(T, 0x01)
stop = convert(T, 0x00)
end
return new(start, stop)
end
end

function SymbolRange{T}(start::T, stop::T)
return SymbolRange{T}(start, stop)
end

function Base.show(io::IO, r::SymbolRange)
show(io, first(r))
write(io, ':')
show(io, last(r))
end


# Iterator
# --------

Base.start(r::SymbolRange) = r.start
Base.done(r::SymbolRange, i) = i > r.stop
Base.next(r::SymbolRange, i) = reinterpret(eltype(r), i), i + 0x01

Base.size(r::SymbolRange) = (r.stop - r.start + 1,)
Base.first(r::SymbolRange) = convert(eltype(r), r.start)
Base.last(r::SymbolRange) = convert(eltype(r), r.stop)
Base.in{T}(x::T, r::SymbolRange{T}) = first(r) x last(r)


# Indexing
# --------

function Base.getindex(r::SymbolRange, i::Integer)
checkbounds(r, i)
return convert(eltype(r), r.start + i - 1)
end

function Base.getindex{T}(r::SymbolRange{T}, ir::UnitRange)
checkbounds(r, ir)
if isempty(ir)
return SymbolRange(T(0x01), T(0x00))
else
return SymbolRange(r[first(ir)], r[last(ir)])
end
end
Loading

0 comments on commit 1688351

Please sign in to comment.