-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMashTun.jl
161 lines (128 loc) · 3.75 KB
/
MashTun.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
module MashTun
import Base.length,
Base.size,
Base.start,
Base.next,
Base.done
export minhash,
jaccarddist,
mashdist,
MASHSketch
using Bio.Seq
using DataStructures
"""
    MASHSketch(sketch::Vector, kmersize::Int)

MinHash sketch: a vector of `UInt64` hash values together with the k-mer length
used to produce them.

The constructor raises an error if `sketch` is empty or if `kmersize` is not
greater than zero.
"""
type MASHSketch
    sketch::Vector{UInt64}
    kmersize::Int

    function MASHSketch(sketch::Vector, kmersize::Int)
        # Guard clauses: reject invalid input before building the object.
        length(sketch) > 0 || error("Sketch cannot be empty")
        kmersize > 0 || error("Kmersize must be greater than 0")
        return new(sketch, kmersize)
    end
end
"""
    length(s::MASHSketch)

Return the number of hash values stored in the sketch `s`.
"""
length(s::MASHSketch) = length(s.sketch)
"""
    size(s::MASHSketch)

Return the tuple `(number of hashes in the sketch, k-mer size)`.
"""
size(s::MASHSketch) = (length(s), s.kmersize)
# Iteration protocol: the initial state is that of the underlying hash vector.
start(s::MASHSketch) = start(s.sketch)
# Iteration protocol: yield the next hash and state from the underlying hash vector.
next(s::MASHSketch, state) = next(s.sketch, state)
# Iteration protocol: iteration ends when the underlying hash vector is exhausted.
done(s::MASHSketch, state) = done(s.sketch, state)
"""
    rchash(kmer::Kmer)

Hash whichever of `kmer` and its reverse complement compares smaller, so that a
k-mer and its reverse complement produce the same hash value.
"""
function rchash(kmer::Kmer)
    revcomp = reverse_complement(kmer)
    return hash(min(kmer, revcomp))
end
"""
    kmerminhash(seq::BioSequence, kmerset, kmerhashes::Vector{UInt64}, k::Int, s::Int)

Fold the hashes of every DNA `k`-mer of `seq` into a bottom-`s` MinHash state and
return the updated `(kmerset, kmerhashes)` pair.

The state has two phases:

* While `kmerhashes` is empty, distinct hashes are accumulated in `kmerset`.
  As soon as `s` distinct hashes are available, the `s` smallest are moved into
  `kmerhashes` in ascending order and `kmerset` is emptied.
* Once `kmerhashes` is populated it is kept sorted and duplicate-free: a hash
  smaller than the current maximum that is not already present replaces the maximum.

# Arguments
* `seq`: sequence whose k-mers are hashed.
* `kmerset`: a `Set` or `SortedSet` of hashes collected so far (mutated in place).
* `kmerhashes`: either empty, or an ascending sketch returned by a previous call
  (mutated in place).
* `k`: k-mer length.
* `s`: sketch size; must be greater than 0.

If fewer than `s` distinct hashes have been seen, `kmerhashes` is returned empty
and the hashes seen so far remain in `kmerset`.
"""
function kmerminhash(seq::BioSequence, kmerset, kmerhashes::Vector{UInt64}, k::Int, s::Int)
    isa(kmerset, Union{Set,SortedSet}) || error("Kmerset must be a `Set` or `SortedSet`")
    s > 0 || error("Sketch size must be greater than 0")

    for kmer in each(DNAKmer{k}, seq)
        # The k-mer is the second element of each item yielded by `each`.
        # NOTE(review): only the forward k-mer is hashed; `rchash` in this module
        # hashes a strand-independent form but is not used here - confirm intent.
        h = hash(kmer[2])

        if isempty(kmerhashes)
            # Filling phase: collect distinct hashes until `s` are available.
            push!(kmerset, h)
            if length(kmerset) >= s
                # Copy out before clearing; never mutate the set while iterating it.
                sorted = sort!(collect(kmerset))
                append!(kmerhashes, sorted[1:s])
                empty!(kmerset)
            end
        elseif h < kmerhashes[end]
            # Maintenance phase: `h` belongs in the sketch unless already present.
            i = searchsortedlast(kmerhashes, h)
            if i == 0 || kmerhashes[i] != h
                pop!(kmerhashes)                 # drop the current maximum
                insert!(kmerhashes, i + 1, h)    # keep ascending order
            end
        end
    end
    return (kmerset, kmerhashes)
end
"""
    minhash(seq::BioSequence, k::Int, s::Int)

Build a `MASHSketch` for `seq` from the hashes of its `k`-mers, keeping at most
the `s` smallest distinct hashes in ascending order.

If the sequence yields fewer than `s` distinct hashes, the sketch contains all of
them. An error is raised (by the `MASHSketch` constructor) only when no hash at
all could be computed.
"""
function minhash(seq::BioSequence, k::Int, s::Int)
    kmerset = Set{UInt64}()
    kmerhashes = Vector{UInt64}()
    kmerset, kmerhashes = kmerminhash(seq, kmerset, kmerhashes, k, s)
    # Hashes still waiting in the set mean the sketch was never filled to `s`;
    # use them as the (shorter) sketch rather than discarding them.
    if isempty(kmerhashes) && !isempty(kmerset)
        kmerhashes = sort!(collect(kmerset))
    end
    return MASHSketch(kmerhashes, k)
end
"""
    minhash(seqs::Vector{T}, k::Int, s::Int)

Build a single `MASHSketch` from the `k`-mers of all sequences in `seqs`, keeping
at most the `s` smallest distinct hashes in ascending order.

If the sequences together yield fewer than `s` distinct hashes, the sketch
contains all of them. An error is raised (by the `MASHSketch` constructor) only
when no hash at all could be computed.
"""
function minhash{T<:BioSequence}(seqs::Vector{T}, k::Int, s::Int)
    kmerset = Set{UInt64}()
    kmerhashes = Vector{UInt64}()
    for seq in seqs
        # The sketch state is threaded through every sequence.
        kmerset, kmerhashes = kmerminhash(seq, kmerset, kmerhashes, k, s)
    end
    # Hashes still waiting in the set mean the sketch was never filled to `s`;
    # use them as the (shorter) sketch rather than discarding them.
    if isempty(kmerhashes) && !isempty(kmerset)
        kmerhashes = sort!(collect(kmerset))
    end
    return MASHSketch(kmerhashes, k)
end
"""
    minhash(seqs::FASTAReader{T}, k::Int, s::Int)

Build a single `MASHSketch` from the `k`-mers of every record read from `seqs`
(the `seq` field of each record is sketched), keeping at most the `s` smallest
distinct hashes in ascending order.

If the records together yield fewer than `s` distinct hashes, the sketch contains
all of them. An error is raised (by the `MASHSketch` constructor) only when no
hash at all could be computed.
"""
function minhash{T<:BioSequence}(seqs::FASTAReader{T}, k::Int, s::Int)
    kmerset = Set{UInt64}()
    kmerhashes = Vector{UInt64}()
    for seq in seqs
        # The sketch state is threaded through every record.
        kmerset, kmerhashes = kmerminhash(seq.seq, kmerset, kmerhashes, k, s)
    end
    # Hashes still waiting in the set mean the sketch was never filled to `s`;
    # use them as the (shorter) sketch rather than discarding them.
    if isempty(kmerhashes) && !isempty(kmerset)
        kmerhashes = sort!(collect(kmerset))
    end
    return MASHSketch(kmerhashes, k)
end
"""
    jaccarddist(sketch1::MASHSketch, sketch2::MASHSketch)

Estimate the Jaccard index of the k-mer sets behind two sketches.

The two hash vectors are merged in ascending order and the `s` smallest distinct
hashes of the merge are examined, where `s` is the length of the shorter sketch.
The result is the fraction of those `s` hashes that occur in both sketches, a
`Float64` between 0 and 1.

Assumes both hash vectors are sorted in ascending order without duplicates, as
produced by `minhash`. Neither sketch is modified.

Raises an error if the sketches were built with different k-mer lengths or if
either sketch is empty.
"""
function jaccarddist(sketch1::MASHSketch, sketch2::MASHSketch)
    sketch1.kmersize == sketch2.kmersize || error("sketches must have same kmer length")

    sk1 = sketch1.sketch
    sk2 = sketch2.sketch
    n1 = length(sk1)
    n2 = length(sk2)
    # Only the bottom-`s` of the union can be reconstructed from two sketches.
    s = min(n1, n2)
    s > 0 || error("Sketch cannot be empty")

    i = 1          # cursor into sk1
    j = 1          # cursor into sk2
    seen = 0       # distinct hashes of the merged sketches examined so far
    matches = 0    # hashes found in both sketches
    while seen < s && i <= n1 && j <= n2
        if sk1[i] == sk2[j]
            matches += 1
            i += 1
            j += 1
        elseif sk1[i] < sk2[j]
            i += 1
        else
            j += 1
        end
        seen += 1
    end
    # Each step advances a cursor by at most one, so `seen` reaches `s` before
    # either vector is exhausted; `seen` is therefore positive here.
    return matches / seen
end
"""
    mashdist(k::Int, j::Float64)

Convert a Jaccard index `j` computed from `k`-mers into a Mash distance,

    D = -1/k * log(2j / (1 + j))

The result is `0.0` for `j == 1`, `1.0` for `j == 0` (where the formula diverges),
and is capped at `1.0`, following the behaviour of the Mash reference
implementation.
"""
function mashdist(k::Int, j::Float64)
    j == 1.0 && return 0.0    # avoid returning -0.0
    j == 0.0 && return 1.0    # avoid returning Inf
    return min(1.0, -log(2j / (1 + j)) / k)
end
"""
    mashdist(sketch1::MASHSketch, sketch2::MASHSketch)

Compute the Mash distance between two sketches from their Jaccard estimate
(`jaccarddist`) and the k-mer size of `sketch1`.
"""
function mashdist(sketch1::MASHSketch, sketch2::MASHSketch)
    jaccard = jaccarddist(sketch1, sketch2)
    return mashdist(sketch1.kmersize, jaccard)
end
end # module MashTun