Merge pull request #5403 from tanmaykm/readcsv

Added readdlm option to ignore empty columns.
JuliaLang · Jan 18, 2014 · b465c3e · b465c3e
2 parents ad65af1 + def6c1d
commit b465c3e
Show file tree

Hide file tree

Showing 4 changed files with 229 additions and 66 deletions.
diff --git a/base/datafmt.jl b/base/datafmt.jl
@@ -68,18 +68,21 @@ function ascii_if_possible(sbuff::String)
 end
 
 function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict)
+    ign_empty = (dlm == invalid_dlm)
+
     nrows,ncols = try
-            dlm_dims(sbuff, eol, dlm)
+            dlm_dims(sbuff, eol, dlm, ign_empty)
         catch ex
             !get(optsd, :ignore_invalid_chars, false) && throw(ex)
             sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, ""))
-            dlm_dims(sbuff, eol, dlm)
+            dlm_dims(sbuff, eol, dlm, ign_empty)
         end
-    offsets = zeros(Int, nrows, ncols)
+    begin_offsets = zeros(Int, nrows, ncols)
+    end_offsets = zeros(Int, nrows, ncols)
     has_header = get(optsd, :has_header, false)
     cells = Array(T, has_header ? nrows-1 : nrows, ncols)
-    dlm_offsets(sbuff, dlm, eol, offsets)
-    has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1, eol), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0, eol)) : dlm_fill(cells, offsets, sbuff, auto, 0, eol)
+    dlm_offsets(sbuff, dlm, eol, begin_offsets, end_offsets, ign_empty)
+    has_header ? (dlm_fill(cells, begin_offsets, end_offsets, sbuff, auto, 1, dlm, eol, ign_empty), dlm_fill(Array(String, 1, ncols), begin_offsets, end_offsets, sbuff, auto, 0, dlm, eol, ign_empty)) : dlm_fill(cells, begin_offsets, end_offsets, sbuff, auto, 0, dlm, eol, ign_empty)
 end
 
 const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap]
@@ -93,28 +96,31 @@ function val_opts(opts)
     d
 end
 
-function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
-    (row == 1) && (col == 1) && return 1
-    pp_row = (1 == col) ? (row-1) : row
-    pp_col = (1 == col) ? ncols : (col-1)
-
-    ret = offsets[pp_row, pp_col]
-    (ret == 0) ? dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2)
-end
-
-function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, eol::Char)
+function dlm_fill{T}(cells::Array{T,2}, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int, dlm::Char, eol::Char, ign_adj_dlm::Bool)
     maxrow,maxcol = size(cells)
     tmp64 = Array(Float64,1)
 
     for row in (1+row_offset):(maxrow+row_offset)
         cell_row = row-row_offset
         for col in 1:maxcol
-            start_pos = dlm_col_begin(maxcol, offsets, row, col)
-            end_pos = offsets[row,col]
-
-            end_idx = prevind(sbuff, nextind(sbuff,end_pos))
-            (col == maxcol) && (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
-            sval = SubString(sbuff, start_pos, end_idx)
+            start_pos = begin_offsets[row,col] 
+            end_pos = end_offsets[row,col]
+
+            if start_pos > 0 && end_pos > 0
+                end_idx = prevind(sbuff, nextind(sbuff,end_pos))
+                (end_idx > 0) && ('\n' == eol) && ('\r' == sbuff[end_idx]) && (end_idx = prevind(sbuff, end_idx))
+                if ign_adj_dlm
+                    is_default_dlm = (dlm == invalid_dlm)
+                    while start_pos <= end_idx
+                        val = sbuff[start_pos] 
+                        (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && break
+                        start_pos = nextind(sbuff, start_pos)
+                    end
+                end
+                sval = SubString(sbuff, start_pos, end_idx)
+            else
+                sval = SubString(sbuff, 1, 0)
+            end
 
             if T <: Char
                 (length(sval) != 1) && error("file entry \"$(sval)\" is not a Char")
@@ -123,7 +129,7 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
                 if float64_isvalid(sval, tmp64)
                     cells[cell_row,col] = tmp64[1]
                 elseif auto
-                    return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset, eol)
+                    return dlm_fill(Array(Any,maxrow,maxcol), begin_offsets, end_offsets, sbuff, false, row_offset, dlm, eol, ign_adj_dlm)
                 else
                     cells[cell_row,col] = NaN
                 end
@@ -140,52 +146,78 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
 end
 
 
-function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
-    isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), offsets))
+function dlm_offsets(sbuff::UTF8String, dlm, eol, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool)
+    isascii(dlm) && isascii(eol) && (return dlm_offsets(sbuff.data, uint8(dlm), uint8(eol), begin_offsets, end_offsets, ign_adj_dlm))
 
     col = 0
     row = 1
-    maxrow,maxcol = size(offsets)
-    offsets[maxrow,maxcol] = length(sbuff.data)
+    maxrow,maxcol = size(begin_offsets)
     idx = 1
     is_default_dlm = (dlm == invalid_dlm)
-    while(idx <= length(sbuff.data))
+    got_data = false
+    last_offset = 0
+    slen = length(sbuff.data)
+    while idx <= slen
         val,idx = next(sbuff, idx)
-        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-        col += 1
-        offsets[row,col] = idx-2
+        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+        if got_data || !ign_adj_dlm
+            col += 1
+            end_offsets[row,col] = idx-2
+            begin_offsets[row,col] = last_offset+1
+        end
+        last_offset = idx
         (row >= maxrow) && (col == maxcol) && break
         (val == eol) && (row += 1; col = 0)
+        got_data = false
+    end
+    if last_offset < slen
+        col += 1
+        begin_offsets[row,col] = last_offset+1
+        end_offsets[row,col] = slen
     end
 end
 
-dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2}) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), offsets)
-function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, offsets::Array{Int,2})
+dlm_offsets(sbuff::ASCIIString, dlmc, eolc, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool) = dlm_offsets(sbuff.data, uint8(dlmc), uint8(eolc), begin_offsets, end_offsets, ign_adj_dlm)
+function dlm_offsets(dbuff::Vector{Uint8}, dlm::Uint8, eol::Uint8, begin_offsets::Array{Int,2}, end_offsets::Array{Int,2}, ign_adj_dlm::Bool)
     col = 0
     row = 1
     is_default_dlm = (dlm == uint8(invalid_dlm))
-    maxrow,maxcol = size(offsets)
-    offsets[maxrow,maxcol] = length(dbuff)
-    for idx in 1:length(dbuff)
+    maxrow,maxcol = size(begin_offsets)
+    got_data = false
+    last_offset = 0
+    slen = length(dbuff)
+    for idx in 1:slen
         val = dbuff[idx]
-        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-        col += 1
-        offsets[row,col] = idx-1
+        (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+        if got_data || !ign_adj_dlm
+            col += 1
+            end_offsets[row,col] = idx-1
+            begin_offsets[row,col] = last_offset+1
+        end
+        last_offset = idx
         (row >= maxrow) && (col == maxcol) && break
         (val == eol) && (row += 1; col = 0)
+        got_data = false
+    end
+    if last_offset < slen
+        col += 1
+        begin_offsets[row,col] = last_offset+1
+        end_offsets[row,col] = slen
     end
 end
 
-dlm_dims(s::ASCIIString, eol::Char, dlm::Char) = dlm_dims(s.data, uint8(eol), uint8(dlm))
-function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D)
-    isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm)))
+dlm_dims(s::ASCIIString, eol::Char, dlm::Char, ign_adj_dlm::Bool) = dlm_dims(s.data, uint8(eol), uint8(dlm), ign_adj_dlm)
+function dlm_dims{T,D}(dbuff::T, eol::D, dlm::D, ign_adj_dlm::Bool)
+    isa(dbuff, UTF8String) && isascii(eol) && isascii(dlm) && (return dlm_dims(dbuff.data, uint8(eol), uint8(dlm), ign_adj_dlm))
     ncols = nrows = col = 0
     is_default_dlm = (dlm == convert(D, invalid_dlm))
     try
+        got_data = false
         for val in dbuff
-            (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && continue
-            col += 1
+            (val != eol) && (is_default_dlm ? !in(val, _default_delims) : (val != dlm)) && (got_data = true) && continue
+            (got_data || !ign_adj_dlm) && (col += 1)
             (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
+            got_data = false
         end
     catch ex
         error("at row $nrows, column $col : $ex)")

diff --git a/doc/helpdb.jl b/doc/helpdb.jl
@@ -2536,25 +2536,34 @@
 
 "),
 
-("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
+("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
 
-   Read a matrix from the source where each line gives one row, with
-   elements separated by the given delimeter. The source can be a text
-   file, stream or byte array. Memory mapped filed can be used by
-   passing the byte array representation of the mapped segment as
-   source.
+   Read a matrix from the source where each line (separated by \"eol\") 
+   gives one row, with elements separated by the given delimiter. The 
+   source can be a text file, stream or byte array. Memory mapped files 
+   can be used by passing the byte array representation of the mapped 
+   segment as source. 
 
-   If \"has_header\" is \"true\" the first row of data would be read
+   If \"T\" is a numeric type, the result is an array of that type, 
+   with any non-numeric elements as \"NaN\" for floating-point types, 
+   or zero. Other useful values of \"T\" include \"ASCIIString\", 
+   \"String\", and \"Any\".
+
+   If \"has_header\" is \"true\", the first row of data would be read
    as headers and the tuple \"(data_cells, header_cells)\" is returned
    instead of only \"data_cells\".
 
-   If \"use_mmap\" is \"true\" the file specified by \"source\" is
+   If \"use_mmap\" is \"true\", the file specified by \"source\" is
    memory mapped for potential speedups.
 
-   If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with
+   If \"ignore_invalid_chars\" is \"true\", bytes in \"source\" with
    invalid character encoding will be ignored. Otherwise an error is
    thrown indicating the offending character position.
 
+"),
+
+("Text I/O","Base","readdlm","readdlm(source, delim::Char, eol::Char; options...)
+
    If all data is numeric, the result will be a numeric array. If some
    elements cannot be parsed as numbers, a cell array of numbers and
    strings is returned.
@@ -2563,11 +2572,33 @@
 
 ("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...)
 
-   Read a matrix from the source with a given element type. If \"T\"
-   is a numeric type, the result is an array of that type, with any
-   non-numeric elements as \"NaN\" for floating-point types, or zero.
-   Other useful values of \"T\" include \"ASCIIString\", \"String\",
-   and \"Any\".
+   The end of line delimiter is taken as \"\\n\". 
+
+"),
+
+("Text I/O","Base","readdlm","readdlm(source, delim::Char; options...)
+
+   The end of line delimiter is taken as \"\\n\". If all data is 
+   numeric, the result will be a numeric array. If some elements 
+   cannot be parsed as numbers, a cell array of numbers and 
+   strings is returned.
+
+"),
+
+("Text I/O","Base","readdlm","readdlm(source, T::Type; options...)
+
+   The columns are assumed to be separated by one or more whitespaces. 
+   The end of line delimiter is taken as \"\\n\".
+
+"),
+
+("Text I/O","Base","readdlm","readdlm(source; options...)
+
+   The columns are assumed to be separated by one or more whitespaces. 
+   The end of line delimiter is taken as \"\\n\". If all data is 
+   numeric, the result will be a numeric array. If some elements 
+   cannot be parsed as numbers, a cell array of numbers and strings 
+   is returned.
 
 "),
 

diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -1664,22 +1664,37 @@ Text I/O
 
    Create an iterable object that will yield each line from a stream.
 
-.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
+.. function:: readdlm(source, delim::Char, T::Type, eol::Char; has_header=false, use_mmap=false, ignore_invalid_chars=false)
 
-   Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source. 
+   Read a matrix from the source where each line (separated by ``eol``) gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source. 
 
-   If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
+   If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
 
-   If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups.
+   If ``has_header`` is ``true``, the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``.
 
-   If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
+   If ``use_mmap`` is ``true``, the file specified by ``source`` is memory mapped for potential speedups.
 
-   If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
+   If ``ignore_invalid_chars`` is ``true``, bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position.
 
+.. function:: readdlm(source, delim::Char, eol::Char; options...)
 
+   If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
+
 .. function:: readdlm(source, delim::Char, T::Type; options...)
 
-   Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``.
+   The end of line delimiter is taken as ``\n``.
+
+.. function:: readdlm(source, delim::Char; options...)
+
+   The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
+
+.. function:: readdlm(source, T::Type; options...)
+
+   The columns are assumed to be separated by one or more whitespaces. The end of line delimiter is taken as ``\n``.
+
+.. function:: readdlm(source; options...)
+
+   The columns are assumed to be separated by one or more whitespaces. The end of line delimiter is taken as ``\n``. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned.
 
 .. function:: writedlm(f, A, delim='\t')