Skip to content

Commit

Permalink
readdlm improvements: use mmap by default. options to read header and…
Browse files Browse the repository at this point in the history
… ignore invalid characters

added tests
fixed travis tests
  • Loading branch information
tanmaykm committed Jun 21, 2013
1 parent 3d67e2e commit f724a02
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 45 deletions.
18 changes: 18 additions & 0 deletions base/ascii.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,22 @@ ascii(x) = convert(ASCIIString, x)
convert(::Type{ASCIIString}, s::ASCIIString) = s
convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data)
convert(::Type{ASCIIString}, a::Array{Uint8,1}) = is_valid_ascii(a) ? ASCIIString(a) : error("invalid ASCII sequence")
# Build an ASCIIString from the bytes in `a`, substituting the bytes of
# `invalids_as` for every maximal run of non-ASCII bytes (values >= 0x80).
#
# `a` is copied before the first substitution, so the caller's array is never
# spliced. When no byte needs replacing, `a` itself is handed to the
# two-argument `convert(ASCIIString, a)`, which performs the final validation.
function convert(::Type{ASCIIString}, a::Array{Uint8,1}, invalids_as::ASCIIString)
    nbytes = length(a)
    pos = 1
    copied = false
    while pos <= nbytes
        if a[pos] < 0x80
            pos += 1
            continue
        end
        # copy lazily: only once, and only when something has to change
        if !copied
            a = copy(a)
            copied = true
        end
        # extend the run up to the last consecutive non-ASCII byte
        runend = pos
        while runend < nbytes && a[runend+1] >= 0x80
            runend += 1
        end
        splice!(a, pos:runend, invalids_as.data)
        # the splice may have changed the length; `pos` is left in place so the
        # substituted bytes are scanned by the next iteration
        nbytes = length(a)
    end
    convert(ASCIIString, a)
end
convert(::Type{ASCIIString}, s::String) = ascii(bytestring(s))
100 changes: 69 additions & 31 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,59 @@ function countlines(io::IO, eol::Char)
nl
end

# Convenience methods of `readdlm`. Each one fills in the defaults that were
# left out ('\n' as end-of-line, `invalid_dlm` as the "no delimiter given"
# marker) and forwards the keyword options untouched.
#
# Only the keyword-accepting definitions are kept: a positional-only method
# with the same argument types defined afterwards would replace the
# keyword-accepting one and make calls that pass options fail.
readdlm(input, T::Type; opts...) = readdlm(input, invalid_dlm, T, '\n'; opts...)
readdlm(input, dlm::Char, T::Type; opts...) = readdlm(input, dlm, T, '\n'; opts...)

readdlm(input; opts...) = readdlm(input, invalid_dlm, '\n'; opts...)
readdlm(input, dlm::Char; opts...) = readdlm(input, dlm, '\n'; opts...)

# Without an element type: parse as Float64 with `auto` set to true.
readdlm(input, dlm::Char, eol::Char; opts...) = readdlm_auto(input, dlm, Float64, eol, true; opts...)
# With an explicit element type `T`: `auto` is false.
readdlm(input, dlm::Char, T::Type, eol::Char; opts...) = readdlm_auto(input, dlm, T, eol, false; opts...)

# Normalise `input` to a string and hand it to `readdlm_string`.
#
# `input` may be
#   * a String      - used as a file name; the file is memory mapped unless the
#                     option `use_mmap` is false, in which case it is read with
#                     `readall`
#   * a Vector{Uint8} - turned into a string via `jl_array_to_string`
#   * an IO         - read completely with `readall`
#   * anything else - passed on unchanged
# `opts` are validated by `val_opts` before anything is read.
function readdlm_auto(input, dlm::Char, T::Type, eol::Char, auto::Bool; opts...)
    optsd = val_opts(opts)
    if isa(input, String)
        if get(optsd, :use_mmap, true)
            # NOTE(review): the stream opened for the mapping is not closed
            # explicitly here.
            input = mmap_array(Uint8, (filesize(input),), open(input, "r"))
        else
            input = readall(input)
        end
    end
    sinp = isa(input, Vector{Uint8}) ? ccall(:jl_array_to_string, ByteString, (Array{Uint8,1},), input) :
           isa(input, IO) ? readall(input) :
           input
    readdlm_string(sinp, dlm, T, eol, auto, optsd)
end
# Return `sbuff` re-wrapped as an ASCIIString when every byte of its data is
# below 0x80; otherwise return `sbuff` unchanged. An ASCIIString argument is
# returned as is. The byte array is shared with `sbuff`, not copied.
function ascii_if_possible(sbuff::String)
    isa(sbuff, ASCIIString) && return sbuff

    d = sbuff.data
    for idx in 1:length(d)
        # first non-ASCII byte settles it: keep the original string type
        (d[idx] < 0x80) || return sbuff
    end
    ASCIIString(d)
end

# Parse the delimited text in `sbuff` into a matrix with element type `T`.
#
# Options read from `optsd`:
#   :ignore_invalid_chars - when counting rows/columns throws, strip the
#                           invalid bytes from `sbuff` (they are replaced by
#                           "") and count again; when false (the default) the
#                           exception is propagated
#   :has_header           - treat the first row as a header; the result is
#                           then the tuple (data_cells, header_cells), where
#                           header_cells is filled from a 1 x ncols String array
#
# Without `has_header` only the data cells are returned.
function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
    nrows,ncols = try
        dlm_dims(sbuff, eol, dlm)
    catch ex
        !get(optsd, :ignore_invalid_chars, false) && throw(ex)
        # drop the offending bytes, downgrade to ASCII if nothing else remains
        sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
        dlm_dims(sbuff, eol, dlm)
    end
    offsets = zeros(Int, nrows, ncols)
    has_header = get(optsd, :has_header, false)
    # allocate the data cells exactly once; the header row is not part of them
    cells = Array(T, has_header ? nrows-1 : nrows, ncols)
    dlm_offsets(sbuff, dlm, eol, offsets)
    if has_header
        # data starts one row further down (row_offset 1); the header is row 1
        return (dlm_fill(cells, offsets, sbuff, auto, 1),
                dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0))
    end
    dlm_fill(cells, offsets, sbuff, auto, 0)
end

# Option names that `val_opts` accepts.
const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]

# Check a collection of (name, value) option pairs and return them as a
# Dict{Symbol,Bool}. Raises an error for a name that is not listed in
# `valid_opts` or for a value that is not a Bool.
function val_opts(opts)
    checked = Dict{Symbol,Bool}()
    for opt in opts
        name = opt[1]
        value = opt[2]
        contains(valid_opts, name) || error("unknown option $(opt[1])")
        isa(value, Bool) || error("$(opt[1]) can only be boolean")
        checked[name] = value
    end
    checked
end

function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
Expand All @@ -64,30 +96,32 @@ function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
(ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
end

function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool)
function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int)
maxrow,maxcol = size(cells)
tmp64 = Array(Float64,1)
for row in 1:maxrow

for row in (1+row_offset):(maxrow+row_offset)
cell_row = row-row_offset
for col in 1:maxcol
start_pos = dlm_col_begin(maxcol, offsets, row, col)
end_pos = offsets[row,col]
sval = SubString(sbuff, start_pos, end_pos)

if T <: Char
(length(sval) != 1) && error("file entry \"$(sval)\" is not a Char")
cells[row,col] = sval
cells[cell_row,col] = sval
elseif T <: Number
if(float64_isvalid(sval, tmp64))
cells[row,col] = tmp64[1]
cells[cell_row,col] = tmp64[1]
elseif auto
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false)
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset)
else
cells[row,col] = NaN
cells[cell_row,col] = NaN
end
elseif T <: String
cells[row,col] = sval
cells[cell_row,col] = sval
elseif T == Any
cells[row,col] = float64_isvalid(sval, tmp64) ? tmp64[1] : sval
cells[cell_row,col] = float64_isvalid(sval, tmp64) ? tmp64[1] : sval
else
error("file entry \"$(sval)\" cannot be converted to $T")
end
Expand All @@ -102,7 +136,7 @@ function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
row = 1
maxrow,maxcol = size(offsets)
idx = 1
while(idx < length(sbuff.data))
while(idx <= length(sbuff.data))
val,idx = next(sbuff, idx)
(val != eol) && ((dlm == invalid_dlm) ? !contains(_default_delims, val) : (val != dlm)) && continue
col += 1
Expand Down Expand Up @@ -131,19 +165,23 @@ end
# ASCII input is scanned as raw bytes, with `eol` and `dlm` converted to Uint8.
dlm_dims(s::ASCIIString, eol, dlm) = dlm_dims(s.data, uint8(eol), uint8(dlm))

# Count the rows and columns of the delimited data in `dbuff` and return them
# as the tuple (nrows, ncols); both are at least 1.
#
# A value closes a column when it equals `eol` or is a delimiter: `dlm`
# itself, or any member of `_default_delims` when `dlm` is `invalid_dlm`.
# `eol` additionally closes the row. `ncols` is the largest column count seen.
#
# An exception raised while iterating `dbuff` is re-raised as an error whose
# message carries the 1-based row and column being scanned at that point.
function dlm_dims(dbuff, eol, dlm)
    ncols = nrows = col = 0
    try
        for val in dbuff
            (val != eol) && ((dlm == invalid_dlm) ? !contains(_default_delims, val) : (val != dlm)) && continue
            col += 1
            (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
        end
    catch ex
        # nrows/col count what has been completed, so the position is one past
        error("at row $(nrows+1), column $(col+1) : $ex")
    end
    # data after the last eol counts as one more row, but only if at least one
    # delimiter was seen in it
    # NOTE(review): for such a row `col` is the number of delimiters, not of
    # fields - confirm this agrees with what dlm_offsets expects.
    (col > 0) && (nrows += 1)
    ncols = max(ncols, col, 1)
    nrows = max(nrows, 1)
    return (nrows, ncols)
end

# `readcsv` is `readdlm` with ',' as the delimiter; keyword options are
# forwarded unchanged. The keyword-accepting methods also serve calls that pass
# no options, so no separate positional-only definitions are needed.
readcsv(io; opts...) = readdlm(io, ','; opts...)
readcsv(io, T::Type; opts...) = readdlm(io, ',', T; opts...)

# todo: keyword argument for # of digits to print
function writedlm(io::IO, a::Matrix, dlm::Char)
Expand Down
21 changes: 21 additions & 0 deletions base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,25 @@ utf8(x) = convert(UTF8String, x)
convert(::Type{UTF8String}, s::UTF8String) = s
convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data)
convert(::Type{UTF8String}, a::Array{Uint8,1}) = is_valid_utf8(a) ? UTF8String(a) : error("invalid UTF-8 sequence")
# Build a UTF8String from the bytes in `a`, substituting the bytes of
# `invalids_as` for every span that does not form a complete sequence.
#
# A sequence is accepted when it begins with a byte for which `is_utf8_start`
# holds, has room for all of its trailing bytes inside `a`, and none of those
# trailing bytes is itself a start byte. (`utf8_trailing` is presumably the
# table of trailing-byte counts indexed by lead byte + 1 - it is defined
# outside this block.)
#
# What gets replaced:
#   * a start byte whose sequence is truncated or malformed - that byte alone
#   * a run of bytes that are not start bytes and belong to no sequence - the
#     whole run
#
# `a` is copied before the first substitution, so the caller's array is never
# spliced; without any substitution the result wraps `a` directly.
function convert(::Type{UTF8String}, a::Array{Uint8,1}, invalids_as::String)
    # go through bytestring() so that `.data` is available for any String
    repl = bytestring(invalids_as).data
    l = length(a)
    idx = 1
    iscopy = false
    while idx <= l
        if is_utf8_start(a[idx])
            nextidx = idx+1+utf8_trailing[a[idx]+1]
            if nextidx <= (l+1)
                # a start byte in a trailing position means the sequence that
                # began at idx was cut short
                wellformed = true
                for k in (idx+1):(nextidx-1)
                    is_utf8_start(a[k]) && (wellformed = false; break)
                end
                wellformed && (idx = nextidx; continue)
            end
        end
        !iscopy && (a = copy(a); iscopy = true)
        endn = idx
        while endn <= l
            is_utf8_start(a[endn]) && break
            endn += 1
        end
        (endn > idx) && (endn -= 1)
        splice!(a, idx:endn, repl)
        # continue after the substituted bytes: they are never re-examined, so
        # every iteration consumes input and the loop always terminates
        idx += length(repl)
        l = length(a)
    end
    UTF8String(a)
end
convert(::Type{UTF8String}, s::String) = utf8(bytestring(s))
32 changes: 24 additions & 8 deletions doc/helpdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1537,18 +1537,34 @@
"),

("Text I/O","Base","readdlm","readdlm(filename, delim::Char)
("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=true, ignore_invalid_chars=false)
Read a matrix from a text file where each line gives one row, with
elements separated by the given delimeter. If all data is numeric,
the result will be a numeric array. If some elements cannot be
parsed as numbers, a cell array of numbers and strings is returned.
Read a matrix from the source where each line gives one row, with
elements separated by the given delimiter. The source can be a
text file, stream or byte array. Memory mapped files can be used
by passing the byte array representation of the mapped segment as
source.
If \"has_header\" is \"true\" the first row of data would be read
as headers and the tuple \"(data_cells, header_cells)\" is
returned instead of only \"data_cells\".
If \"use_mmap\" is \"true\" the file specified by \"source\" is
memory mapped for potential speedups.
If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
invalid character encoding will be ignored. Otherwise an error is
thrown indicating the offending character position.
If all data is numeric, \"data_cells\" will be a numeric array. If
some elements cannot be parsed as numbers, a cell array of numbers
and strings is returned for \"data_cells\".
"),

("Text I/O","Base","readdlm","readdlm(filename, delim::Char, T::Type)
("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...)
Read a matrix from a text file with a given element type. If \"T\"
Read a matrix from the source with a given element type. If \"T\"
is a numeric type, the result is an array of that type, with any
non-numeric elements as \"NaN\" for floating-point types, or zero.
Other useful values of \"T\" include \"ASCIIString\", \"String\",
Expand All @@ -1563,7 +1579,7 @@
"),

("Text I/O","Base","readcsv","readcsv(filename[, T::Type])
("Text I/O","Base","readcsv","readcsv(filename[, T::Type]; options...)
Equivalent to \"readdlm\" with \"delim\" set to comma.
Expand Down
17 changes: 13 additions & 4 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1006,19 +1006,28 @@ Text I/O

Create an iterable object that will yield each line from a stream.

.. function:: readdlm(source, delim::Char)
.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=true, ignore_invalid_chars=false)

Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
Read a matrix from the source where each line gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source.

.. function:: readdlm(source, delim::Char, T::Type)
If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.

If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.

If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.


.. function:: readdlm(source, delim::Char, T::Type; options...)

Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.

.. function:: writedlm(filename, array, delim::Char)

Write an array to a text file using the given delimiter (defaults to comma).

.. function:: readcsv(source, [T::Type])
.. function:: readcsv(source, [T::Type]; options...)

Equivalent to ``readdlm`` with ``delim`` set to comma.

Expand Down
2 changes: 1 addition & 1 deletion test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ TESTS = all core keywordargs numbers strings unicode collections hashing \
remote iostring arrayops linalg blas fft dsp sparse bitarray \
random math functional bigint sorting statistics spawn parallel \
arpack file git pkg pkg2 resolve suitesparse complex version \
pollfd mpfr broadcast socket floatapprox priorityqueue
pollfd mpfr broadcast socket floatapprox priorityqueue readdlm

$(TESTS) ::
$(QUIET_JULIA) $(call spawn,$(JULIA_EXECUTABLE)) ./runtests.jl $@
Expand Down
10 changes: 10 additions & 0 deletions test/readdlm.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# The sample file is looked up in two candidate locations relative to
# JULIA_HOME. The location is chosen by checking which file exists rather than
# by catching a failed read, so a genuine readdlm error on an existing file is
# reported as such instead of being masked by a retry on the other path.
dlm_candidates = [joinpath(JULIA_HOME, split("../../test/perf2/imdb-1.tsv", '/')...),
                  joinpath(JULIA_HOME, split("../../julia/share/julia/test/perf2/imdb-1.tsv", '/')...)]
dlm_file = isfile(dlm_candidates[1]) ? dlm_candidates[1] : dlm_candidates[2]
dlm_data = readdlm(dlm_file, '\t')

@test size(dlm_data) == (31383,3)
@test dlm_data[12345,2] == "Gladiator"
@test dlm_data[31383,3] == 2005
@test dlm_data[1,1] == "McClure, Marc (I)"
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ testnames = ["core", "keywordargs", "numbers", "strings", "unicode",
"statistics", "spawn", "parallel", "priorityqueue",
"arpack", "file", "perf", "suitesparse", "version",
"resolve", "pollfd", "mpfr", "broadcast", "complex",
"socket", "floatapprox"]
"socket", "floatapprox", "readdlm"]

tests = ARGS==["all"] ? testnames : ARGS
n = min(8, CPU_CORES, length(tests))
Expand Down

0 comments on commit f724a02

Please sign in to comment.