-
Notifications
You must be signed in to change notification settings - Fork 82
/
query.jl
448 lines (390 loc) · 14.2 KB
/
query.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
### Format registry infrastructure
# Operating-system tags. They are passed as predicates when a loader/saver
# library is registered, and `applies_to_os` decides whether a tag matches the
# running system.
@compat abstract type OS end
@compat abstract type Unix <: OS end
struct Windows <: OS end
struct OSX <: Unix end
struct Linux <: Unix end
# Direction tags: `add_loadsave` checks for these to register a library as
# loader only (`LOAD`) or saver only (`SAVE`).
struct LOAD end
struct SAVE end
# Partition `list` into `(os_predicates, other_predicates)`: the first part
# holds every entry that is a subtype of `OS`, the second part everything else.
function split_predicates(list)
    is_os(x) = x <: OS
    return filter(is_os, list), filter(x -> !is_os(x), list)
end
# `applies_to_os(os)` tells whether an OS predicate matches the running system.
# An empty predicate list places no restriction and therefore always applies;
# a non-empty list applies when at least one of its entries does.
applies_to_os(os::Vector) = isempty(os) ? true : any(applies_to_os, os)
# Fallback for `OS` subtypes that have no more specific method.
applies_to_os(os::Type{O}) where {O <: OS} = false
applies_to_os(os::Type{U}) where {U <: Unix} = Compat.Sys.isunix()
applies_to_os(os::Type{Windows}) = Compat.Sys.iswindows()
applies_to_os(os::Type{OSX}) = Compat.Sys.isapple()
applies_to_os(os::Type{Linux}) = Compat.Sys.islinux()
# `add_loadsave(format, predicates)` registers a library as loader and/or saver
# for `format`.
#
# The first element of `predicates` is the library; the remaining elements are
# `OS` subtypes and/or the `LOAD`/`SAVE` tags. Nothing is registered unless
# `applies_to_os` accepts the OS predicates. Without any `LOAD`/`SAVE` tag the
# library is registered for both directions.
#
# `predicates` is not modified. An `ArgumentError` is thrown when it is empty.
function add_loadsave(format, predicates)
    isempty(predicates) && throw(ArgumentError("array must be non-empty"))
    library = first(predicates)
    # Slice instead of `shift!`: removing the head in place would silently
    # corrupt a predicate list that the caller keeps or reuses.
    os, loadsave = split_predicates(predicates[2:end])
    if applies_to_os(os)
        if isempty(loadsave) || (LOAD in loadsave)
            add_loader(format, library)
        end
        if isempty(loadsave) || (SAVE in loadsave)
            add_saver(format, library)
        end
    end
end
"""
`DataFormat{sym}()` indicates a known binary or text format of kind `sym`,
where `sym` is always a symbol. For example, a .csv file might have
`DataFormat{:CSV}()`.

An easy way to write `DataFormat{:CSV}` is `format"CSV"` (see the
`format_str` string macro).
"""
struct DataFormat{sym} end
# String macro behind `format"NAME"`: expands to the type `DataFormat{:NAME}`.
macro format_str(s)
    quoted = Expr(:quote, Symbol(s))
    return :(DataFormat{$quoted})
end
# Format type used for files and streams whose format could not be determined.
const unknown_df = DataFormat{:UNKNOWN}
"""
`unknown(f)` returns `true` if the format of `f` is unknown, i.e. if it is
`DataFormat{:UNKNOWN}`, and `false` for every other `DataFormat`.
"""
unknown(::Type{DataFormat{:UNKNOWN}}) = true
unknown(::Type{DataFormat{sym}}) where {sym} = false
# Global registry tables.
# `ext2sym`: file extension => format symbol, or a vector of symbols when
# several formats were registered with the same extension (see `add_extension`).
const ext2sym = Dict{String, Union{Symbol,Vector{Symbol}}}()
# `magic_list`: `magic_bytes => symbol` pairs, kept ordered by `magic_cmp`.
const magic_list = Vector{Pair}(0) # sorted, see magic_cmp below
# `sym2info`: format symbol => `(magic, extension)` as passed to `add_format`
# (byte magics are stored in canonical tuple form).
const sym2info = Dict{Symbol,Any}() # Symbol=>(magic, extension)
# `magic_func`: `detector => symbol` pairs for formats whose magic is a callable
# instead of a byte sequence.
const magic_func = Vector{Pair}(0) # for formats with complex magic #s
# Convenience method: register `fmt` with its magic and extension, then register
# every entry of `load_save_libraries` (each a predicate list understood by
# `add_loadsave`) for that format. Returns `fmt`.
function add_format(fmt, magic, extension, load_save_libraries...)
    add_format(fmt, magic, extension)
    foreach(lib -> add_loadsave(fmt, lib), load_save_libraries)
    return fmt
end
"""
`add_format(fmt, magic, extension)` registers a new `DataFormat`.
For example:

    add_format(format"TIFF", (UInt8[0x4d,0x4d,0x00,0x2b], UInt8[0x49,0x49,0x2a,0x00]), [".tiff", ".tif"])
    add_format(format"PNG", [0x89,0x50,0x4e,0x47,0x0d,0x0a,0x1a,0x0a], ".png")
    add_format(format"NRRD", "NRRD", [".nrrd",".nhdr"])

An error is raised if the format, or a non-empty magic byte sequence, is
already registered. Returns `fmt`.

Note that extensions, magic numbers, and format-identifiers are case-sensitive.
"""
function add_format(fmt::Type{DataFormat{sym}}, magic::Union{Tuple,AbstractVector,String}, extension) where {sym}
    if haskey(sym2info, sym)
        error("format ", fmt, " is already registered")
    end
    bytes = canonicalize_magic(magic)
    # Range of entries in the sorted list that already carry exactly `bytes`.
    slot = searchsorted(magic_list, bytes, lt=magic_cmp)
    if !(isempty(bytes) || isempty(slot))
        error("magic bytes ", bytes, " are already registered")
    end
    insert!(magic_list, first(slot), Pair(bytes, sym)) # bytes=>sym in 0.4
    sym2info[sym] = (bytes, extension)
    add_extension(extension, sym)
    return fmt
end
# Register a format that can start with any of several magic byte sequences.
#
# Every sequence is validated before the registry is modified, so a clash on a
# later sequence no longer leaves earlier ones behind in `magic_list` without a
# matching `sym2info` entry. Raises an error if the format is already
# registered, or if a non-empty sequence is already registered or repeated
# within `magics`. Returns `fmt`.
function add_format(fmt::Type{DataFormat{sym}},
                    magics::Tuple{T,Vararg{T}}, extension) where {sym, T <: Vector{UInt8}}
    haskey(sym2info, sym) && error("format ", fmt, " is already registered")
    canon = map(canonicalize_magic, magics)
    for (i, magic) in enumerate(canon)
        isempty(magic) && continue
        taken = !isempty(searchsorted(magic_list, magic, lt=magic_cmp))
        repeated = any(j -> canon[j] == magic, 1:(i - 1))
        if taken || repeated
            error("magic bytes ", magic, " are already registered")
        end
    end
    # All sequences are acceptable: now it is safe to mutate the registry.
    for magic in canon
        rng = searchsorted(magic_list, magic, lt=magic_cmp)
        insert!(magic_list, first(rng), Pair(magic, sym)) # m=>sym in 0.4
    end
    sym2info[sym] = (canon, extension)
    add_extension(extension, sym)
    return fmt
end
# Fallback for when "magic" is not a byte sequence but a detector that is
# called on the stream (see the HDF5 example in registry.jl). The detector is
# appended to `magic_func`. Raises an error if the format is already
# registered; returns `fmt`.
function add_format(fmt::Type{DataFormat{sym}}, magic, extension) where {sym}
    if haskey(sym2info, sym)
        error("format ", fmt, " is already registered")
    end
    push!(magic_func, Pair(magic, sym)) # magic=>sym in 0.4
    sym2info[sym] = (magic, extension)
    add_extension(extension, sym)
    return fmt
end
"""
`del_format(fmt::DataFormat)` deletes `fmt` from the format registry.

Its magic entry and its `sym2info` entry are removed. For each of its
extensions only `fmt` itself is removed from the extension table, so other
formats registered with the same extension stay reachable. Throws a `KeyError`
if `fmt` is not registered. Returns `nothing`.
"""
function del_format(fmt::Type{DataFormat{sym}}) where {sym}
    magic, extension = sym2info[sym]
    del_magic(magic, sym)
    delete!(sym2info, sym)
    _del_extension_of(extension, sym)
    return nothing
end
# Remove `sym` from the formats registered for the single extension `ext`,
# keeping any other format that shares the extension.
function _del_extension_of(ext::String, sym)
    haskey(ext2sym, ext) || return nothing
    registered = ext2sym[ext]
    if isa(registered, Symbol)
        registered == sym && delete!(ext2sym, ext)
        return nothing
    end
    remaining = filter(s -> s != sym, registered)
    if isempty(remaining)
        delete!(ext2sym, ext)
    elseif length(remaining) == 1
        # Collapse back to a bare Symbol, the form stored for unshared extensions.
        ext2sym[ext] = remaining[1]
    else
        ext2sym[ext] = remaining
    end
    return nothing
end
# Same as above, applied to every extension of a collection.
function _del_extension_of(exts::Union{Array,Tuple}, sym)
    for e in exts
        _del_extension_of(e, sym)
    end
    return nothing
end
# Deletes multiple magic bytes: each element of `magic` is removed in turn.
function del_magic(magic::Tuple, sym)
    foreach(m -> del_magic(m, sym), magic)
    return nothing
end
# Deletes single magic bytes from `magic_list`.
# Several formats may share the empty magic, so for an empty `magic` the entry
# belonging to `sym` is looked up among all empty-magic entries; an error is
# raised if there is none.
function del_magic(magic::NTuple{N, UInt8}, sym) where {N}
    rng = searchsorted(magic_list, magic, lt=magic_cmp)
    if isempty(magic)
        pos = 0 # list indices start at 1, so 0 means "no match yet"
        for idx in rng
            if last(magic_list[idx]) == sym
                pos = idx
                break
            end
        end
        pos == 0 && error("format ", sym, " not found")
        rng = pos:pos
    end
    @assert length(rng) == 1
    deleteat!(magic_list, first(rng))
    return nothing
end
# Deletes the `magic => sym` entry of a function-based magic from `magic_func`.
function del_magic(magic::Function, sym)
    entry = Pair(magic, sym)
    idx = findfirst(magic_func, entry)
    deleteat!(magic_func, idx)
    return nothing
end
"""
`info(fmt)` returns the `(magic, extension)` information stored in the
registry for `DataFormat` `fmt`. Throws a `KeyError` if `fmt` is not
registered.
"""
function Base.info(::Type{DataFormat{sym}}) where {sym}
    return sym2info[sym]
end
# Convert magic bytes to the canonical form kept in the registry, a tuple of
# `UInt8`. Tuples pass through unchanged; byte vectors are converted
# element-wise; strings contribute their bytes.
canonicalize_magic(m::NTuple{N,UInt8}) where {N} = m
canonicalize_magic(m::AbstractVector{UInt8}) = (m...,)
function canonicalize_magic(m::String)
    bytes = Vector{UInt8}(m)
    return canonicalize_magic(bytes)
end
# Associate the extension `ext` with format `sym`. The first format registered
# for an extension is stored as a bare `Symbol`; once a second format claims
# the same extension the entry becomes a `Vector{Symbol}` that grows with every
# further registration.
function add_extension(ext::String, sym)
    if !haskey(ext2sym, ext)
        return ext2sym[ext] = sym
    end
    existing = ext2sym[ext]
    if isa(existing, Symbol)
        ext2sym[ext] = Symbol[existing, sym]
    else
        push!(existing, sym)
    end
    return
end
# Associate every extension in the collection `ext` with format `sym`.
function add_extension(ext::Union{Array,Tuple}, sym)
    foreach(e -> add_extension(e, sym), ext)
end
# Remove the extension `ext` (and every format listed under it) from `ext2sym`.
function del_extension(ext::String)
    return delete!(ext2sym, ext)
end
# Remove every extension of the collection `ext`.
function del_extension(ext::Union{Array,Tuple})
    foreach(del_extension, ext)
end
# magic_cmp results in magic_list being sorted in order of increasing
# length(magic), then (among tuples with the same length) in
# dictionary order. This ordering has the advantage that you can
# incrementally read bytes from the stream without worrying that
# you'll encounter an EOF yet still have potential matches later in
# the list.
#
# Two methods are needed because `searchsorted` compares list entries
# (`magic => sym` pairs) against a bare tuple of magic bytes in both directions.
function magic_cmp(p::Pair, t::Tuple)
    stored = first(p)
    nstored, nquery = length(stored), length(t)
    return nstored == nquery ? stored < t : nstored < nquery
end
function magic_cmp(t::Tuple, p::Pair)
    stored = first(p)
    nquery, nstored = length(t), length(stored)
    return nquery == nstored ? t < stored : nquery < nstored
end
# Common supertype of `File` and `Stream`; the parameter `F` is the
# `DataFormat` of the file or stream.
@compat abstract type Formatted{F<:DataFormat} end # A specific file or stream
"""
`File(fmt, filename)` indicates that `filename` is a file of known
DataFormat `fmt`. For example, `File(format"PNG", filename)` (equivalently
`File{format"PNG"}(filename)`) would indicate a PNG file.
"""
struct File{F<:DataFormat} <: Formatted{F}
    filename::String
end
# Convenience constructor taking the format as a value argument.
function File(fmt::Type{DataFormat{sym}}, filename) where {sym}
    return File{fmt}(filename)
end
"""
`filename(file)` returns the filename (a `String`) associated with `File` `file`.
"""
function filename(f::File)
    return f.filename
end
"""
`file_extension(file)` returns the file extension associated with `File` `file`,
i.e. the second component of `splitext` applied to its filename.
"""
function file_extension(f::File)
    _, ext = splitext(filename(f))
    return ext
end
"""
`Stream(fmt, io, [filename])` indicates that the stream `io` is
written in the known `DataFormat` `fmt`. For example,
`Stream(format"PNG", io)` would indicate PNG format. If known, the optional
`filename` argument can be used to improve error messages, etc.

The filename is stored as a `Nullable{String}`, which is null when no
filename was supplied.
"""
struct Stream{F<:DataFormat,IOtype<:IO} <: Formatted{F}
io::IOtype
filename::Nullable{String}
end
# Outer constructors taking the format as a value argument.
# Without a filename, a null `Nullable{String}` is stored.
function Stream(::Type{F}, io::IO) where {F<:DataFormat}
    return Stream{F,typeof(io)}(io, Nullable{String}())
end
# Any `AbstractString` filename is converted to `String` first.
function Stream(::Type{F}, io::IO, filename::AbstractString) where {F<:DataFormat}
    return Stream{F,typeof(io)}(io, String(filename))
end
# Other filename values (e.g. an existing `Nullable{String}`) are passed through.
function Stream(::Type{F}, io::IO, filename) where {F<:DataFormat}
    return Stream{F,typeof(io)}(io, filename)
end
# Wrap `io` as a stream with the format and filename of `file`.
function Stream(file::File{F}, io::IO) where {F}
    return Stream{F,typeof(io)}(io, filename(file))
end
"`stream(s)` returns the underlying IO object wrapped by `Stream` `s`"
function stream(s::Stream)
    return s.io
end
"""
`filename(stream)` returns a nullable-string of the filename
associated with `Stream` `stream`; it is null when the filename is unknown.
"""
function filename(s::Stream)
    return s.filename
end
"""
`file_extension(stream)` returns the file extension associated with `Stream`
`stream`. When the stream has no known filename, the null nullable-string
returned by `filename(stream)` is returned instead; otherwise the result is
the extension as a plain string.
"""
function file_extension(f::Stream)
    fname = filename(f)
    if isnull(fname)
        return fname
    end
    return splitext(get(fname))[2]
end
# Convert a `Stream` into a `File` of the same format.
# Note this closes the stream. It's useful when you've opened
# the file to check the magic bytes, but don't want to leave
# a dangling stream.
# Raises an error (before closing anything) if the stream has no filename.
function file!(strm::Stream{F}) where {F}
    f = filename(strm)
    isnull(f) && error("filename unknown")
    close(strm.io)
    return File{F}(get(f))
end
# Implement standard I/O operations for File and Stream
# `open(file, args...)` opens the underlying file (forwarding `args` to
# `Base.open`) and wraps the result in a `Stream` of the same format that
# remembers the absolute path of the file.
@inline function Base.open(file::File{F}, args...) where {F<:DataFormat}
    path = filename(file)
    io = open(path, args...)
    return Stream(F, io, abspath(path))
end
# Forward the standard IO interface to the wrapped IO object.
# The positioning functions return the `Stream` itself rather than the wrapped
# IO, so calls keep working on the wrapper.
Base.close(s::Stream) = close(s.io)
Base.position(s::Stream) = position(s.io)
function Base.seek(s::Stream, offset::Integer)
    seek(s.io, offset)
    return s
end
function Base.seekstart(s::Stream)
    seekstart(s.io)
    return s
end
function Base.seekend(s::Stream)
    seekend(s.io)
    return s
end
function Base.skip(s::Stream, offset::Integer)
    skip(s.io, offset)
    return s
end
Base.eof(s::Stream) = eof(s.io)
@inline Base.read(s::Stream, args...) = read(s.io, args...)
Base.read!(s::Stream, array::Array) = read!(s.io, array)
@inline Base.write(s::Stream, args...) = write(s.io, args...)
# Note: we can't sensibly support the all keyword. If you need that,
# call read(stream(s), ...; all=value) manually
Base.readbytes!(s::Stream, b) = readbytes!(s.io, b)
Base.readbytes!(s::Stream, b, nb) = readbytes!(s.io, b, nb)
Base.read(s::Stream) = read(s.io)
Base.read(s::Stream, nb) = read(s.io, nb)
Base.flush(s::Stream) = flush(s.io)
Base.isreadonly(s::Stream) = isreadonly(s.io)
Base.isopen(s::Stream) = isopen(s.io)
"`magic(fmt)` returns the magic bytes of format `fmt` as a `Vector{UInt8}`"
function magic(fmt::Type{F}) where {F<:DataFormat}
    bytes = info(fmt)[1]
    return UInt8[bytes...]
end
"""
`skipmagic(s)` sets the position of `Stream` `s` to be just after the magic bytes.
For a plain IO object, you can use `skipmagic(io, fmt)`.

`skipmagic(s)` returns `s`; `skipmagic(io, fmt)` returns `nothing`.
"""
function skipmagic(s::Stream{F}) where {F}
    skipmagic(stream(s), F)
    return s
end
function skipmagic(io, fmt::Type{DataFormat{sym}}) where {sym}
    skipmagic(io, first(sym2info[sym]))
    return nothing
end
# Function-based magic has no fixed byte length, so nothing is skipped.
skipmagic(io, magic::Function) = nothing
# A single byte sequence: seek to the position right after it.
skipmagic(io, magic::NTuple{N,UInt8}) where {N} = seek(io, length(magic))
# Skip the magic bytes of a format that has several alternative signatures.
#
# If all signatures have the same length, the stream is positioned right after
# that many bytes without inspecting them. Otherwise the signatures that fit
# into the stream are tried, longest first, and the stream is positioned after
# the first one that matches its leading bytes. An error is raised when no
# signature matches, including the case where the stream is shorter than every
# signature.
function skipmagic(io, magic::Tuple)
    lengths = map(length, magic)
    all(x->lengths[1] == x, lengths) && return seek(io, lengths[1]) # it doesn't matter what magic bytes get skipped as they all have the same length
    seekend(io)
    len = position(io)
    seekstart(io)
    # throw out magic bytes that are longer than IO
    candidates = filter(x -> length(x) <= len, [magic...])
    # Longest first, to avoid overlapping magic bytes. `by`/`rev` yields a valid
    # strict ordering, unlike an `lt` built from `>=`.
    sort!(candidates, by=length, rev=true)
    if !isempty(candidates)
        # first is both the longest and guaranteed to fit into io, so we can just read the bytes
        tmp = read(io, length(first(candidates)))
        for m in candidates
            if magic_equal(m, tmp)
                seek(io, length(m))
                return nothing
            end
        end
    end
    error("tried to skip magic bytes of an IO that does not contain the magic bytes of the format. IO: $io")
end
# Return `true` if `buffer` starts with the elements of `magic`, compared
# position by position. `buffer` is indexed from 1 and must be at least as
# long as `magic`.
function magic_equal(magic, buffer)
    idx = 0
    for byte in magic
        idx += 1
        buffer[idx] == byte || return false
    end
    return true
end
# `unknown` for a `File` or `Stream` asks the question of its format type `F`.
unknown(::File{F}) where {F} = unknown(F)
unknown(::Stream{F}) where {F} = unknown(F)
"""
`query(filename)` returns a `File` object with information about the
format inferred from the file's extension and/or magic bytes.

The extension is trusted when it maps to a single format that has no magic
bytes, or when the file does not exist (for an ambiguous extension the first
registered format is used). If the file does not exist and the extension is
not registered, a `File` of unknown format is returned. In all other cases
the file is opened and its magic bytes are inspected; the file is closed
again before returning, also when the inspection throws.

An error is raised when the extension is registered, none of its formats has
magic bytes or a detection function, and the extension alone does not settle
the format.
"""
function query(filename::AbstractString)
    _, ext = splitext(filename)
    if haskey(ext2sym, ext)
        sym = ext2sym[ext]
        no_magic = !hasmagic(sym)
        if lensym(sym) == 1 && (no_magic || !isfile(filename)) # we only found one candidate and there is no magic bytes, or no file, trust the extension
            return File{DataFormat{sym}}(filename)
        elseif !isfile(filename) && lensym(sym) > 1
            return File{DataFormat{sym[1]}}(filename)
        end
        if no_magic && !hasfunction(sym)
            error("Some formats with extension ", ext, " have no magic bytes; use `File{format\"FMT\"}(filename)` to resolve the ambiguity.")
        end
    end
    !isfile(filename) && return File{unknown_df}(filename) # (no extension || no magic byte) && no file
    # Otherwise, check the magic bytes
    io = open(filename)
    try
        # `file!` closes `io` on success.
        return file!(query(io, abspath(filename)))
    catch
        # Do not leak the file handle when detection fails.
        close(io)
        rethrow()
    end
end
# Number of candidate formats stored for an extension in `ext2sym`.
lensym(v::Vector) = length(v)
lensym(s::Symbol) = 1
# `hasmagic`: does a format (or at least one of several formats) have
# non-empty magic bytes?
hasmagic(::Any) = false # for when magic is a function
hasmagic(t::Tuple) = length(t) > 0
hasmagic(v::Vector) = any(hasmagic, v)
function hasmagic(s::Symbol)
    stored = sym2info[s][1]
    return hasmagic(stored)
end
# `hasfunction`: is the magic of a format (or of at least one of several
# formats) something other than a tuple of bytes?
hasfunction(s::Tuple) = false #has magic
hasfunction(s::Any) = true #has function
hasfunction(v::Vector) = any(hasfunction, v)
function hasfunction(s::Symbol)
    stored = sym2info[s][1]
    return hasfunction(stored)
end
"""
`query(io, [filename])` returns a `Stream` object with information about the
format inferred from the magic bytes.

Bytes are read from the current position and compared against the registered
byte magics in list order; empty magics are skipped. If `io` reaches EOF
before enough bytes are available, a `Stream` of unknown format is returned.
If no byte magic matches and `io` is seekable, the registered detection
functions are tried in turn. The position of `io` is restored before
returning in every case where `io` is seekable.
"""
query(io::IO, filename) = query(io, Nullable(String(filename)))
function query(io::IO, filename::Nullable{String}=Nullable{String}())
    start = position(io)
    bytes = UInt8[]
    for entry in magic_list
        candidate = first(entry)
        isempty(candidate) && continue
        # The list is ordered by length, so bytes already read are reused and
        # only the missing ones are fetched.
        while length(bytes) < length(candidate)
            if eof(io)
                seek(io, start)
                return Stream{unknown_df,typeof(io)}(io, filename)
            end
            push!(bytes, read(io, UInt8))
        end
        if iter_eq(bytes, candidate)
            seek(io, start)
            return Stream{DataFormat{last(entry)},typeof(io)}(io, filename)
        end
    end
    if seekable(io)
        for entry in magic_func
            # Every detector starts from the original position.
            seek(io, start)
            detect = first(entry)
            if detect(io)
                return Stream{DataFormat{last(entry)},typeof(io)}(seek(io, start), filename)
            end
        end
        seek(io, start)
    end
    return Stream{unknown_df,typeof(io)}(io, filename)
end
# Whether `seek` may be used on an IO object: anything that is not an
# `IOStream` or an `IOBuffer` is treated as non-seekable, and an `IOBuffer`
# reports its own `seekable` field.
seekable(::Any) = false
seekable(::IOStream) = true
function seekable(io::IOBuffer)
    return io.seekable
end
# Compare two equally long indexable sequences element by element, tolerating
# carriage returns: when the current elements differ and one of them is '\r',
# only that side advances. Exactly length(A) steps are performed, so after a
# skipped '\r' the trailing elements of the other sequence are not compared.
# Sequences of different length are never equal.
function iter_eq(A, B)
    n = length(A)
    n == length(B) || return false
    cr = UInt32('\r') # this seems like the shadiest solution to deal with windows \r\n
    ia, ib = 1, 1
    for _ in 1:n
        x, y = A[ia], B[ib]
        if x == y
            ia += 1
            ib += 1
        elseif x == cr
            ia += 1
        elseif y == cr
            ib += 1
        else
            # both elements differ, and no '\r' exemption applies any more
            return false
        end
    end
    return true
end