Skip to content

Commit

Permalink
when no delimiter is specified, delimiters are taken as one or more a…
Browse files Browse the repository at this point in the history
…djoining whitespaces.

fixed bug in handling empty columns.
updated tests and docs.
fixes #5391
  • Loading branch information
tanmaykm committed Jan 18, 2014
1 parent dc51b00 commit def6c1d
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 66 deletions.
118 changes: 75 additions & 43 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,21 @@ function ascii_if_possible(sbuff::String)
end

function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
ign_empty = (dlm == invalid_dlm)

nrows,ncols = try
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
catch ex
!get(optsd, :ignore_invalid_chars, false) && throw(ex)
sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
dlm_dims(sbuff, eol, dlm)
dlm_dims(sbuff, eol, dlm, ign_empty)
end
offsets = zeros(Int, nrows, ncols)
begin_offsets = zeros(Int, nrows, ncols)
end_offsets = zeros(Int, nrows, ncols)
has_header = get(optsd, :has_header, false)
cells = Array(T, has_header ? nrows-1 : nrows, ncols)
dlm_offsets(sbuff, dlm, eol, offsets)
has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
dlm_offsets(sbuff, dlm, eol, begin_offsets, end_offsets, ign_empty)
has_header ? (dlm_fill(cells, begin_offsets, end_offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), begin_offsets, end_offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, begin_offsets, end_offsets, sbuff, auto, 0, dlm, eol, ign_empty)
end

const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
Expand All @@ -93,28 +96,31 @@ function val_opts(opts)
d
end

# Return the position at which cell (row, col) begins.
#
# `offsets[r, c]` holds the recorded end offset of cell (r, c); a value of 0
# means no offset was recorded for that cell. The start of a cell is found by
# walking backwards through the cells in row-major order (wrapping from column
# 1 to column `ncols` of the previous row) until a cell with a non-zero end
# offset is found; the result is that offset plus 2. If the walk reaches cell
# (1, 1) without finding one, the result is 1.
function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
    r, c = row, col
    while !(r == 1 && c == 1)
        # Step to the preceding cell in row-major order.
        if c == 1
            r -= 1
            c = ncols
        else
            c -= 1
        end
        prev_end = offsets[r, c]
        prev_end != 0 && return prev_end + 2
    end
    return 1
end

function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
function dlm_fill{T}(cells::Array{T,2}, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
maxrow,maxcol = size(cells)
tmp64 = Array(Float64,1)

for row in (1+row_offset):(maxrow+row_offset)
cell_row = row-row_offset
for col in 1:maxcol
start_pos = dlm_col_begin(maxcol, offsets, row, col)
end_pos = offsets[row,col]

end_idx = prevind(sbuff, nextind(sbuff,end_pos))
(col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
sval = SubString(sbuff, start_pos, end_idx)
start_pos = begin_offsets[row,col]
end_pos = end_offsets[row,col]

if start_pos > 0 && end_pos > 0
end_idx = prevind(sbuff, nextind(sbuff,end_pos))
(end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
if ign_adj_dlm
is_default_dlm = (dlm == invalid_dlm)
while start_pos <= end_idx
val = sbuff[start_pos]
(is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
start_pos = nextind(sbuff, start_pos)
end
end
sval = SubString(sbuff, start_pos, end_idx)
else
sval = SubString(sbuff, 1, 0)
end

if T <: Char
(length(sval) != 1) && error("file entry \"$(sval)\" is not a Char")
Expand All @@ -123,7 +129,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
if float64_isvalid(sval, tmp64)
cells[cell_row,col] = tmp64[1]
elseif auto
return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
return dlm_fill(Array(Any,maxrow,maxcol), begin_offsets, end_offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
else
cells[cell_row,col] = NaN
end
Expand All @@ -140,52 +146,78 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
end


function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
function dlm_offsets(sbuff::UTF8String, dlm, eol, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool)
isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), begin_offsets, end_offsets, ign_adj_dlm))

col = 0
row = 1
maxrow,maxcol = size(offsets)
offsets[maxrow,maxcol] = length(sbuff.data)
maxrow,maxcol = size(begin_offsets)
idx = 1
is_default_dlm = (dlm == invalid_dlm)
while(idx <= length(sbuff.data))
got_data = false
last_offset = 0
slen = length(sbuff.data)
while idx <= slen
val,idx = next(sbuff, idx)
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-2
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
end_offsets[row,col] = idx-2
begin_offsets[row,col] = last_offset+1
end
last_offset = idx
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
if last_offset < slen
col += 1
begin_offsets[row,col] = last_offset+1
end_offsets[row,col] = slen
end
end

dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
dlm_offsets(sbuff::ASCIIString, dlmc, eolc, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), begin_offsets, end_offsets, ign_adj_dlm)
function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool)
col = 0
row = 1
is_default_dlm = (dlm == uint8(invalid_dlm))
maxrow,maxcol = size(offsets)
offsets[maxrow,maxcol] = length(dbuff)
for idx in 1:length(dbuff)
maxrow,maxcol = size(begin_offsets)
got_data = false
last_offset = 0
slen = length(dbuff)
for idx in 1:slen
val = dbuff[idx]
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
offsets[row,col] = idx-1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
if got_data || !ign_adj_dlm
col += 1
end_offsets[row,col] = idx-1
begin_offsets[row,col] = last_offset+1
end
last_offset = idx
(row >= maxrow) && (col == maxcol) && break
(val == eol) && (row += 1; col = 0)
got_data = false
end
if last_offset < slen
col += 1
begin_offsets[row,col] = last_offset+1
end_offsets[row,col] = slen
end
end

dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
ncols = nrows = col = 0
is_default_dlm = (dlm == convert(D, invalid_dlm))
try
got_data = false
for val in dbuff
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
col += 1
(val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
(got_data || !ign_adj_dlm) && (col += 1)
(val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
got_data = false
end
catch ex
error("at row $nrows, column $col : $ex)")
Expand Down
59 changes: 45 additions & 14 deletions doc/helpdb.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2536,25 +2536,34 @@
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
Read a matrix from the source where each line gives one row, with
elements separated by the given delimeter. The source can be a text
file, stream or byte array. Memory mapped filed can be used by
passing the byte array representation of the mapped segment as
source.
Read a matrix from the source where each line (separated by \"eol\")
gives one row, with elements separated by the given delimiter. The
source can be a text file, stream or byte array. Memory mapped files
can be used by passing the byte array representation of the mapped
segment as source.
If \"has_header\" is \"true\" the first row of data would be read
If \"T\" is a numeric type, the result is an array of that type,
with any non-numeric elements as \"NaN\" for floating-point types,
or zero. Other useful values of \"T\" include \"ASCIIString\",
\"String\", and \"Any\".
If \"has_header\" is \"true\", the first row of data would be read
as headers and the tuple \"(data_cells, header_cells)\" is returned
instead of only \"data_cells\".
If \"use_mmap\" is \"true\" the file specified by \"source\" is
If \"use_mmap\" is \"true\", the file specified by \"source\" is
memory mapped for potential speedups.
If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
invalid character encoding will be ignored. Otherwise an error is
thrown indicating the offending character position.
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char, eol::Char; options...)
If all data is numeric, the result will be a numeric array. If some
elements cannot be parsed as numbers, a cell array of numbers and
strings is returned.
Expand All @@ -2563,11 +2572,33 @@

("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...)
Read a matrix from the source with a given element type. If \"T\"
is a numeric type, the result is an array of that type, with any
non-numeric elements as \"NaN\" for floating-point types, or zero.
Other useful values of \"T\" include \"ASCIIString\", \"String\",
and \"Any\".
The end of line delimiter is taken as \"\\n\".
"),

("Text I/O","Base","readdlm","readdlm(source, delim::Char; options...)
The end of line delimiter is taken as \"\\n\". If all data is
numeric, the result will be a numeric array. If some elements
cannot be parsed as numbers, a cell array of numbers and
strings is returned.
"),

("Text I/O","Base","readdlm","readdlm(source, T::Type; options...)
The columns are assumed to be separated by one or more whitespace characters.
The end of line delimiter is taken as \"\\n\".
"),

("Text I/O","Base","readdlm","readdlm(source; options...)
The columns are assumed to be separated by one or more whitespace characters.
The end of line delimiter is taken as \"\\n\". If all data is
numeric, the result will be a numeric array. If some elements
cannot be parsed as numbers, a cell array of numbers and strings
is returned.
"),

Expand Down
29 changes: 22 additions & 7 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1664,22 +1664,37 @@ Text I/O

Create an iterable object that will yield each line from a stream.

.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
.. function:: readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)

Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source.
Read a matrix from the source where each line (separated by ``eol``) gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source.

If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.

If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
If ``has_header`` is ``true``, the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.

If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.

.. function:: readdlm(source, delim::Char, eol::Char; options...)

If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: readdlm(source, delim::Char, T::Type; options...)

Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
The end of line delimiter is taken as ``\n``.

.. function:: readdlm(source, delim::Char; options...)

The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: readdlm(source, T::Type; options...)

The columns are assumed to be separated by one or more whitespace characters. The end of line delimiter is taken as ``\n``.

.. function:: readdlm(source; options...)

The columns are assumed to be separated by one or more whitespace characters. The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.

.. function:: writedlm(f, A, delim='\t')

Expand Down
Loading

0 comments on commit def6c1d

Please sign in to comment.