diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 73e738341e..f9212fa6db 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -155,11 +155,11 @@ export # reconcile_groups, PooledDataArray, PooledDataMatrix, PooledDataVector, - print_table, + printtable, range, rbind, read_minibatch, - read_table, + readtable, reldiff, removeNA, rename_group!, @@ -198,7 +198,7 @@ export # reconcile_groups, within!, within, without, - write_table, + writetable, xtab, xtabs, stack_df, @@ -221,6 +221,16 @@ export # reconcile_groups, read_rda, vecbind +############################################################################## +## +## Deprecations +## +############################################################################## + +Base.@deprecate read_table readtable +Base.@deprecate print_table printtable +Base.@deprecate write_table writetable + ############################################################################## ## ## Load files diff --git a/src/io.jl b/src/io.jl index 9ec939f64f..09bf4e6283 100644 --- a/src/io.jl +++ b/src/io.jl @@ -1,482 +1,688 @@ -const DEFAULT_BOOLEAN_STRINGS = - ["T", "F", "t", "f", "TRUE", "FALSE", "true", "false"] -const DEFAULT_TRUE_STRINGS = - ["T", "t", "TRUE", "true"] -const DEFAULT_FALSE_STRINGS = - ["F", "f", "FALSE", "false"] - -const DEFAULT_QUOTATION_CHARACTER = '"' -const DEFAULT_SEPARATOR = ',' - -const DEFAULT_MISSINGNESS_INDICATORS = - ["", "NA", "#NA", "N/A", "#N/A", "NULL", "."] - -function parse_bool(x::String) - if contains(DEFAULT_TRUE_STRINGS, x) - return true - elseif contains(DEFAULT_FALSE_STRINGS, x) - return false - else - error("Could not parse bool") +function readnrows!(io::IO, + buffer::Vector{Uint8}, + eol_indices::Vector{Int}, + separator_indices::Vector{Int}, + nrows::Int, + eol::Char, + separator::Char, + quotemark::Char) + bytesread::Int = 0 + nrowsread::Int = 0 + eolsread::Int = 0 + separatorsread::Int = 0 + + inquotes::Bool = false + inescape::Bool = false + + chr::Uint8 = ' ' + + buffer_size::Int = length(buffer) + eol_size::Int = length(eol_indices) + separator_size::Int = length(separator_indices) + + while !eof(io) && nrowsread < nrows + bytesread += 1 + chr = read(io, Uint8) + + if buffer_size < bytesread + buffer_size *= 2 + resize!(buffer, buffer_size) + end + buffer[bytesread] = chr + + if !inquotes + if chr == eol + nrowsread +=1 + eolsread +=1 + if eol_size < eolsread + eol_size *= 2 + resize!(eol_indices, eol_size) + end + eol_indices[eolsread] = bytesread + end + + if chr == separator + separatorsread += 1 + if separator_size < separatorsread + separator_size *= 2 + resize!(separator_indices, separator_size) + end + separator_indices[separatorsread] = bytesread + end + + if chr == quotemark && !inescape + inquotes = true + end + else + if chr == quotemark && !inescape + inquotes = false + end + end + + if chr == '\\' + inescape = true + else + inescape = false + end end + + # Deal with files that do not include a final EOL + if eof(io) && chr != eol + nrowsread += 1 + if eol_size < eolsread + eol_size *= 2 + resize!(eol_indices, eol_size) + end + eol_indices[eolsread + 1] = bytesread + 1 + end + + return nrowsread, bytesread, eolsread, separatorsread end -############################################################################## -# -# Low-level text parsing -# -############################################################################## +function buffermatch(buffer::Vector{Uint8}, + left::Int, + right::Int, + exemplars::Vector{ASCIIString}) + l::Int = right - left + 1 -function make_extract_string() - extract_cache = IOBuffer(Array(Uint8, 500), true, true) - # Do we really need a closure? - # Why not just keep passing this argument in? - function f(this, left::Int, right::Int, omitlist::Set = Set()) - extract_cache_size = right - left - if extract_cache_size > extract_cache.size - extract_cache = IOBuffer(Array(Uint8, extract_cache_size), true, true) - end - seek(extract_cache, 0) # necessary? - if length(this) >= 1 - while isvalid(this, right) && right > left && this[right] == ' ' - right -= 1 + for index in 1:length(exemplars) + exemplar::ASCIIString = exemplars[index] + if length(exemplar) == l + isamatch::Bool = true + + for i in 0:(l - 1) + isamatch &= buffer[left + i] == exemplar[1 + i] end - i = left - while i <= right - lasti = i - ch, i = next(this, i) - if !contains(omitlist, lasti) - print(extract_cache, ch) - end + + if isamatch + return true end - return takebuf_string(extract_cache) + end + end + + return false +end + +# All of these functions return three items: +# Parsed value, Success indicator, Missing indicator + +# TODO: Align more closely with parseint code +function bytestoint(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}) + + if left > right || buffermatch(buffer, left, right, missing_nonstrings) + return 0, true, true + end + + value::Int = 0 + power::Int = 1 + index::Int = right + byte::Uint8 = buffer[index] + + while index > left + if '0' <= byte <= '9' + value += (byte - '0') * power + power *= 10 else - return "" + return value, false, false end + index -= 1 + byte = buffer[index] + end + + if byte == '-' + return -value, true, false + elseif byte == '+' + return value, true, false + elseif '0' <= byte <= '9' + value += (byte - '0') * power + return value, true, false + else + return value, false, false end - return f end -extract_string = make_extract_string() - -const STATE_EXPECTING_VALUE = 0 -const STATE_IN_BARE = 1 -const STATE_IN_QUOTED = 2 -const STATE_POSSIBLE_EOQUOTED = 3 -const STATE_EXPECTING_SEP = 4 - -# Read one line of delimited text -# This is complex because delimited text can contain EOL inside quoted fields -function read_separated_line(io, - separator::Char, - quotation_character::Char) - # Indexes into the current line for the current item - left = 0 - right = 0 - - # Was using RopeString for efficient appends, but rare case and makes - # UTF-8 processing harder - this = Base.chomp!(readline(io)) - - # Short-circuit on the empty line - if this == "" - return Array(UTF8String, 0) + +let out::Vector{Float64} = Array(Float64, 1) + global bytestofloat + function bytestofloat(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}) + if left > right || buffermatch(buffer, left, right, missing_nonstrings) + return 0.0, true, true + end + + success = ccall(:jl_substrtod, + Int32, + (Ptr{Uint8}, Int, Int, Ptr{Float64}), + buffer, + left - 1, + right - left + 1, + out) == 0 + + return out[1], success, false end +end - # 5-state machine. See list of possible states above - state = STATE_EXPECTING_VALUE +function bytestobool(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_nonstrings::Vector{ASCIIString}, + true_strings::Vector{ASCIIString}, + false_strings::Vector{ASCIIString}) + if left > right || buffermatch(buffer, left, right, missing_nonstrings) + return false, true, true + end - # Index of characters to remove - omitlist = Set() + if buffermatch(buffer, left, right, true_strings) + return true, true, false + elseif buffermatch(buffer, left, right, false_strings) + return false, true, false + else + return false, false, false + end +end - # Where are we - i = start(this) - eol = false +function bytestostring(buffer::Vector{Uint8}, + left::Int, + right::Int, + missing_strings::Vector{ASCIIString}, + quotemark::Char) + if left > right && buffer[right] != quotemark + return "", true, true + end - # Will eventually return a Vector of strings - num_elems = 0 - ret = Array(ByteString, 0) + if buffermatch(buffer, left, right, missing_strings) + return "", true, true + end - # off we go! use manual loops because this can grow - while true - eol = done(this, i) - if !eol - this_i = i - this_char, i = next(this, i) - end - if state == STATE_EXPECTING_VALUE - if eol - num_elems += 1 - push!(ret, "") - break - elseif this_char == ' ' - continue - elseif this_char == separator - num_elems += 1 - push!(ret, "") - elseif this_char == quotation_character - left = this_i + 1 - state = STATE_IN_QUOTED + return bytestring(buffer[left:right]), true, false +end + +function builddf(rows::Int, + cols::Int, + bytes::Int, + eols::Int, + separators::Int, + buffer::Vector{Uint8}, + eol_indices::Vector{Int}, + separator_indices::Vector{Int}, + separator::Char, + eol::Char, + quotemark::Char, + missing_nonstrings::Vector{ASCIIString}, + missing_strings::Vector{ASCIIString}, + true_strings::Vector{ASCIIString}, + false_strings::Vector{ASCIIString}, + ignorespace::Bool, + makefactors::Bool) + columns::Vector{Any} = Array(Any, cols) + + for j in 1:cols + values = Array(Int, rows) + missing::BitVector = falses(rows) + isint::Bool = true + isfloat::Bool = true + isbool::Bool = true + isstring::Bool = true + + i::Int = 0 + + while i < rows + i += 1 + + # Determine left and right boundaries of field + if j == 1 + if i == 1 + left = 1 + else + left = eol_indices[i - 1] + 1 + end else - left = this_i - state = STATE_IN_BARE + left = separator_indices[(i - 1) * (cols - 1) + j - 1] + 1 end - elseif state == STATE_IN_BARE - if eol - right = this_i - num_elems += 1 - push!(ret, extract_string(this, left, right)) - break - elseif this_char == separator - right = this_i - 1 - num_elems += 1 - push!(ret, extract_string(this, left, right)) - state = STATE_EXPECTING_VALUE + + if j == cols + if i == rows + if buffer[bytes] == eol + right = bytes - 1 + else + right = bytes + end + else + right = eol_indices[i] - 1 + end else - continue + right = separator_indices[(i - 1) * (cols - 1) + j] - 1 end - elseif state == STATE_IN_QUOTED - if eol - this = string(this, "\n", Base.chomp!(readline(io))) - elseif this_char == quotation_character - state = STATE_POSSIBLE_EOQUOTED - else - continue + + # Ignore left-and-right whitespace padding + if ignorespace + while left < right && buffer[left] == ' ' + left += 1 + end + while left < right && buffer[right] == ' ' + right -= 1 + end end - elseif state == STATE_POSSIBLE_EOQUOTED - if eol - right = this_i - 1 - num_elems += 1 - push!(ret, extract_string(this, left, right, omitlist)) - break - elseif this_char == quotation_character - add!(omitlist, this_i) - state = STATE_IN_QUOTED - elseif this_char == separator - right = this_i - 2 - num_elems += 1 - push!(ret, extract_string(this, left, right, omitlist)) - empty!(omitlist) - state = STATE_EXPECTING_VALUE - elseif this_char == ' ' - right = this_i - 2 - num_elems += 1 - push!(ret, extract_string(this, left, right, omitlist)) - empty!(omitlist) - state = STATE_EXPECTING_SEP - else - error("unexpected character after a quote") + + # (1) Try to parse values as Int's + if isint + values[i], success, missing[i] = + bytestoint(buffer, left, right, missing_nonstrings) + if success + continue + else + isint = false + values = convert(Array{Float64}, values) + values[i], success, missing[i] = + bytestoint(buffer, left, right, missing_nonstrings) + end end - elseif state == STATE_EXPECTING_SEP - if eol - break - elseif this_char == ' ' - continue - elseif this_char == separator - state = STATE_EXPECTING_VALUE + + # (2) Try to parse as Float64's + if isfloat + values[i], success, missing[i] = + bytestofloat(buffer, left, right, missing_nonstrings) + if success + continue + else + isfloat = false + values = Array(Bool, rows) + i = 1 + end + end + + # If we go this far, we should ignore quote marks on the boundaries + while left < right && buffer[left] == quotemark + left += 1 + end + while left < right && buffer[right] == quotemark + right -= 1 + end + + # (3) Try to parse as Bool's + if isbool + values[i], success, missing[i] = + bytestobool(buffer, left, right, + missing_nonstrings, + true_strings, false_strings) + if success + continue + else + isbool = false + values = Array(UTF8String, rows) + i = 1 + end + end + + # (4) Fallback to UTF8String + if left == right && buffer[right] == quotemark + # Empty string special method + values[i], success, missing[i] = "", true, false else - error("expecting a separator but got something else") + values[i], success, missing[i] = + bytestostring(buffer, left, right, missing_strings, quotemark) end end + + if makefactors && isstring + columns[j] = PooledDataArray(values, missing) + else + columns[j] = DataArray(values, missing) + end end - ret + + # Need to pass this in + column_names = DataFrames.generate_column_names(cols) + + return DataFrame(columns, column_names) end -# Read data line-by-line -function read_separated_text(io::IO, - nrows::Int, - separator::Char, - quotation_character::Char) - # Read one line to determine the number of columns - i = 1 - sp = read_separated_line(io, separator, quotation_character) - # Find a way to reuse this buffer - ncols = length(sp) - - # If the line is blank, return a 0x0 array to signify this - if ncols == 0 - return Array(UTF8String, 0, 0) +function parseline(buffer::Vector{Uint8}, + upper::Int, + eol::Char, + separator::Char, + quotemark::Char) + column_names = Array(UTF8String, 0) + if upper == 1 + return column_names end - # Otherwise, allocate an array to store all of the text we'll read - text_data = Array(UTF8String, nrows, ncols) - text_data[i, :] = sp - - # Loop until we've read nrows of text or run out of text - while i < nrows - sp = read_separated_line(io, separator, quotation_character) - if length(sp) == ncols - i += 1 - text_data[i, :] = sp - else - break + left::Int = 1 + right::Int = -1 + index::Int = -1 + atbound::Bool = false + inquotes::Bool = false + inescape::Bool = false + chr::Uint8 = uint8(' ') + + while left < upper + chr = buffer[left] + while chr == ' ' + left += 1 + chr = buffer[left] end - end - # Return as much text as we read - return text_data[1:i, :] -end + right = left - 1 + + atbound = false + while right < upper && !atbound + right += 1 + chr = buffer[right] + if !inquotes + if chr == separator + atbound = true + elseif chr == quotemark && !inescape + inquotes = true + end + else + if chr == quotemark && !inescape + inquotes = false + end + end -############################################################################## -# -# Inferential steps -# -############################################################################## + if chr == '\\' + inescape = true + else + inescape = false + end + end -function determine_separator{T <: String}(filename::T) - if ismatch(r"csv$", filename) - return ',' - elseif ismatch(r"tsv$", filename) - return '\t' - elseif ismatch(r"wsv$", filename) - return ' ' - else - error("Unable to determine separator used in $filename") - end -end + inquotes = false + inescape = false -function determine_nrows{T <: String}(filename::T, header::Bool) - total_lines = countlines(filename) - if header - return total_lines - 1 - else - return total_lines - end -end + if buffer[left] == quotemark + left += 1 + end -function determine_column_names(io::IO, - separator::Char, - quotation_character::Char, - header::Bool) - seek(io, 0) - fields = read_separated_line(io, separator, quotation_character) + index = right + chr = buffer[index] + while index > left && + (chr == ' ' || chr == eol || + chr == separator || chr == quotemark) + index -= 1 + chr = buffer[index] + end - if length(fields) == 0 - error("Failed to determine column names from an empty data source") - end + push!(column_names, bytestring(buffer[left:index])) - column_names = header ? fields : generate_column_names(length(fields)) - seek(io, 0) - return column_names + left = right + 1 + end + + return column_names end -function convert_to_dataframe{R <: String, - S <: String, - T <: String}(text_data::Matrix{R}, - missingness_indicators::Vector{S}, - column_names::Vector{T}) - # Keep a record of number of rows and columns - nrows, ncols = size(text_data, 1), length(column_names) - - # Short-circuit if the text data is empty - if nrows == 0 - column_types = {Any for i in 1:ncols} - return DataFrame(column_types, column_names, 0) - end - - # Store the columns as a set of DataVector's inside an Array of Any's - columns = Array(Any, ncols) - - # Convert each column of text into a DataVector of the - # appropriate type - dtime = 0.0 - for j in 1:ncols - is_missing = BitVector(nrows) - for i in 1:nrows - value_missing = contains(missingness_indicators, text_data[i, j]) - if value_missing - text_data[i, j] = utf8("0") - is_missing[i] = true - else - is_missing[i] = false - end +# TODO: Respect coltypes +# TODO: Skip blanklines +# TODO: Skip comment lines +# TODO: Use file encoding information +function readtable(io::IO; + header::Bool = true, + separator::Char = ',', + eol::Char = '\n', + quotemark::Char = '"', + missing_nonstrings::Vector{ASCIIString} = ["", "NA"], + missing_strings::Vector{ASCIIString} = ["NA"], + true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], + false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], + makefactors::Bool = false, + ignorespace::Bool = true, + decimal::Char = '.', + colnames::Vector{UTF8String} = Array(UTF8String, 0), + coltypes::Vector{Any} = Array(Any, 0), + nrows::Int = -1, + skipstartlines::Int = 0, + cleancolnames::Bool = true, + skipblanklines::Bool = true, + comment::Char = '#', + encoding::Symbol = :utf8) + + # Allocate buffers to conserve memory + buffer::Vector{Uint8} = Array(Uint8, 2^20) + eol_indices::Vector{Int} = Array(Int, 1) + separator_indices::Vector{Int} = Array(Int, 1) + chr::Uint8 = uint8(' ') + + # Skip lines at the start + skipped_lines::Int = 0 + while skipped_lines < skipstartlines + while !eof(io) && chr != eol + chr = read(io, Uint8) + end + skipped_lines += 1 end - values = Array(Int64, nrows) - try - for i in 1:nrows - values[i] = parseint(text_data[i, j]) - end - catch - try - values = Array(Float64, nrows) - for i in 1:nrows - values[i] = parsefloat(text_data[i, j]) + + # Deal with header + if header + chr = uint8(' ') + headerbytesread = 0 + headerbytes = Array(Uint8, 2^16) + headerbytes_size = length(headerbytes) + while !eof(io) && chr != eol + chr = read(io, Uint8) + headerbytesread += 1 + if headerbytesread > headerbytes_size + headerbytes_size *= 2 + resize!(headerbytes, headerbytes_size) + end + headerbytes[headerbytesread] = chr end - catch - try - values = Array(Bool, nrows) - for i in 1:nrows - values[i] = parse_bool(text_data[i, j]) - end - catch - values = text_data[:, j] + column_names = + parseline(headerbytes, headerbytesread, eol, separator, quotemark) + end + + # Separate text into fields + rows, bytes, eols, separators = + readnrows!(io, + buffer, + eol_indices, + separator_indices, + nrows, + eol, + separator, + quotemark) + + # Determine the number of columns + cols = fld(separators, rows) + 1 + + # Confirm that the number of columns is consistent across rows + if separators != rows * (cols - 1) + linenumber = -1 + j = -1 + for i in 1:rows + bound = eol_indices[i] + j = 1 + while separator_indices[(i - 1) * (cols - 1) + j] < bound + j += 1 + end + if j != cols + linenumber = i + break + end end - end + msg1 = @sprintf "Every line must have %d columns\n" cols + msg2 = @sprintf "Reading failed at line %d with %d columns\n" linenumber j + msg3 = @sprintf "Saw %d rows, but %d fields\n" rows separators + error(string(msg1, msg2, msg3)) end - columns[j] = DataArray(values, is_missing) - end - # Prepare the DataFrame we'll return - df = DataFrame(columns, column_names) - return df -end + # Parse contents of a buffer into a DataFrame + df = builddf(rows, + cols, + bytes, + eols, + separators, + buffer, + eol_indices, + separator_indices, + separator, + eol, + quotemark, + missing_nonstrings, + missing_strings, + true_strings, + false_strings, + ignorespace, + makefactors) + + # Set up column names based on user input and header + if isempty(colnames) + if header + colnames!(df, column_names) + end + else + colnames!(df, colnames) + end -############################################################################## -# -# Text input -# -############################################################################## + # Clean up column names if requested + if cleancolnames + clean_colnames!(df) + end -# Read at most N lines from an IO object -# Then return a minibatch of at most N rows as a DataFrame -# Add column_types, force_types option -function read_minibatch{R <: String, - S <: String}(io::IO, - separator::Char, - quotation_character::Char, - missingness_indicators::Vector{R}, - column_names::Vector{S}, - minibatch_size::Int) - # Represent data as an array of strings before type conversion - text_data = read_separated_text(io, minibatch_size, separator, quotation_character) - - # Convert text data to a DataFrame - return convert_to_dataframe(text_data, missingness_indicators, column_names) + # Return the final DataFrame + return df end -# Read an entire data set into a DataFrame from an IO -# TODO: Do only IO-pass through the data -function read_table{R <: String, - S <: String}(io::IO, - separator::Char, - quotation_character::Char, - missingness_indicators::Vector{R}, - header::Bool, - column_names::Vector{S}, - nrows::Int) - # Return to start of stream - seek(io, 0) - - # Read first line to remove header in advance - if header - readline(io) - end - - # Represent data as an array of strings before type conversion - text_data = read_separated_text(io, nrows, separator, quotation_character) - - # Short-circuit if data set is empty except for a header line - if size(text_data, 1) == 0 - column_types = {Any for i in 1:length(column_names)} - return DataFrame(column_types, column_names, 0) - else - # Convert text data to a DataFrame - df = convert_to_dataframe(text_data, missingness_indicators, column_names) +function readtable(filename::String; + header::Bool = true, + separator::Char = getseparator(filename), + eol::Char = '\n', + quotemark::Char = '"', + missing_nonstrings::Vector{ASCIIString} = ["", "NA"], + missing_strings::Vector{ASCIIString} = ["NA"], + true_strings::Vector{ASCIIString} = ["T", "t", "TRUE", "true"], + false_strings::Vector{ASCIIString} = ["F", "f", "FALSE", "false"], + makefactors::Bool = false, + ignorespace::Bool = true, + decimal::Char = '.', + colnames::Vector{UTF8String} = Array(UTF8String, 0), + coltypes::Vector{Any} = Array(Any, 0), + nrows::Int = -1, + skipstartlines::Int = 0, + cleancolnames::Bool = false, + skipblanklines::Bool = true, + comment::Char = '#', + encoding::Symbol = :utf8) + + # Open an IO stream + io = open(filename, "r") + + # If user wants all rows, overestimate nrows + if nrows == -1 + nrows = filesize(filename) + end + + # Use the IO stream method for readtable() + df = readtable(io, + header = header, + separator = separator, + eol = eol, + quotemark = quotemark, + missing_nonstrings = missing_nonstrings, + missing_strings = missing_strings, + true_strings = true_strings, + false_strings = false_strings, + makefactors = makefactors, + ignorespace = ignorespace, + decimal = decimal, + colnames = colnames, + coltypes = coltypes, + nrows = nrows, + skipstartlines = skipstartlines, + cleancolnames = cleancolnames, + skipblanklines = cleancolnames, + comment = comment, + encoding = encoding) + + # Close the IO stream + close(io) + + # Return the resulting DataFrame return df - end end -function read_table{T <: String}(filename::T) - # Do inference for missing configuration settings - separator = determine_separator(filename) - quotation_character = DEFAULT_QUOTATION_CHARACTER - missingness_indicators = DEFAULT_MISSINGNESS_INDICATORS - header = true - nrows = determine_nrows(filename, header) - io = open(filename, "r") - column_names = determine_column_names(io, separator, quotation_character, header) - df = read_table(io, - separator, - quotation_character, - missingness_indicators, - header, - column_names, - nrows) - close(io) - return df +function getseparator(filename::String) + if ismatch(r"csv$", filename) + return ',' + elseif ismatch(r"tsv$", filename) + return '\t' + elseif ismatch(r"wsv$", filename) + return ' ' + else + error("Unable to determine separator used in $filename") + end end - ############################################################################## # # Text output # ############################################################################## -# Quotation rules -function in_quotes(val::String, quotation_character::Char) - string(quotation_character, val, quotation_character) -end -function in_quotes(val::Real, quotation_character::Char) - string(val) -end -function in_quotes(val::Any, quotation_character::Char) - string(quotation_character, string(val), quotation_character) -end - -# TODO: write_table should do more to react to the type of each column -# Need to increase precision of string representation of Float64's -function print_table(io::IO, - df::DataFrame, - separator::Char, - quotation_character::Char, - header::Bool) - n, p = nrow(df), ncol(df) - if header - column_names = colnames(df) - for j in 1:p - if j < p - print(io, in_quotes(column_names[j], quotation_character)) - print(io, separator) - else - println(io, in_quotes(column_names[j], quotation_character)) - end +quoted(val::String, quotemark::Char) = string(quotemark, val, quotemark) +quoted(val::Real, quotemark::Char) = string(val) +quoted(val::Any, quotemark::Char) = string(quotemark, string(val), quotemark) + +# TODO: Increase precision of string representation of Float64's +function printtable(io::IO, + df::DataFrame; + separator::Char = ',', + quotemark::Char = '"', + header::Bool = true) + n, p = size(df) + if header + column_names = colnames(df) + for j in 1:p + if j < p + print(io, quoted(column_names[j], quotemark)) + print(io, separator) + else + println(io, quoted(column_names[j], quotemark)) + end + end end - end - for i in 1:n - for j in 1:p - if j < p - print(io, in_quotes(df[i, j], quotation_character)) - print(io, separator) - else - println(io, in_quotes(df[i, j], quotation_character)) - end + for i in 1:n + for j in 1:p + if j < p + print(io, quoted(df[i, j], quotemark)) + print(io, separator) + else + println(io, quoted(df[i, j], quotemark)) + end + end end - end + return end -function print_table(io::IO, - df::DataFrame, - separator::Char, - quotation_character::Char) - print_table(io, df, separator, quotation_character, true) -end - -function print_table(df::DataFrame, separator::Char, quotation_character::Char) - print_table(OUTPUT_STREAM, df, separator, quotation_character, true) -end - -function print_table(df::DataFrame) - print_table(OUTPUT_STREAM, - df, - DEFAULT_SEPARATOR, - DEFAULT_QUOTATION_CHARACTER, - true) -end - -function write_table(filename::String, - df::DataFrame, - separator::Char, - quotation_character::Char) - io = open(filename, "w") - print_table(io, df, separator, quotation_character) - close(io) +function printtable(df::DataFrame; + separator::Char = ',', + quotemark::Char = '"', + header::Bool = true) + printtable(OUTPUT_STREAM, + df, + separator = separator, + quotemark = quotemark, + header = header) + return end # Infer configuration settings from filename -function write_table(filename::String, df::DataFrame) - separator = determine_separator(filename) - quotation_character = DEFAULT_QUOTATION_CHARACTER - write_table(filename, df, separator, quotation_character) +function writetable(filename::String, + df::DataFrame; + separator::Char = getseparator(filename), + quotemark::Char = '"', + header::Bool = true) + io = open(filename, "w") + printtable(io, + df, + separator = separator, + quotemark = quotemark, + header = header) + close(io) + return end ############################################################################## @@ -485,18 +691,16 @@ end # ############################################################################## -# Wrappers for serialization -function save(filename, d) +function save(filename::String, df::AbstractDataFrame) f = open(filename, "w") - serialize(f, d) + serialize(f, df) close(f) + return end -function load_df(filename) +function load_df(filename::String) f = open(filename) dd = deserialize(f) close(f) return dd end - -# end diff --git a/test/data.jl b/test/data.jl index ce947c0299..97e8e4799e 100644 --- a/test/data.jl +++ b/test/data.jl @@ -93,7 +93,7 @@ test_group("DataVector to something else") @assert all(convert(Vector{Int}, dvint2) .== [5:8]) @assert all([i + 1 for i in dvint2] .== [6:9]) @assert all([length(x)::Int for x in dvstr] == [3, 3, 1, 4]) -@assert repr(dvint) == "[1, 2, NA, 4]" +@assert repr(dvint) == "[1,2,NA,4]" test_group("PooledDataVector to something else") @assert all(removeNA(pdvstr) .== ["one", "one", "two", "two", "one", "one"]) diff --git a/test/data/complex_data.csv b/test/data/complex_data.csv index fbdf0a5783..15bea6b698 100644 --- a/test/data/complex_data.csv +++ b/test/data/complex_data.csv @@ -1,2 +1,3 @@ +C1,C2,C3,C4,C5 a,"b","c,d",1.0,1 a,"b","",, diff --git a/test/data/messy.csv b/test/data/messy.csv new file mode 100644 index 0000000000..b3e96ef21b --- /dev/null +++ b/test/data/messy.csv @@ -0,0 +1,6 @@ +"A","B","C","D","E" +1 , 2, 3.1 ,"true","X" +3 , 4, 2.3 ,"false","Y" +NA,,NA,NA,NA +,NA,,,NA +,,,, diff --git a/test/io.jl b/test/io.jl index 6b28c7529e..811c3dc0ee 100644 --- a/test/io.jl +++ b/test/io.jl @@ -1,222 +1,26 @@ -# unit tests of extract_string -x = "12345678" -@assert DataFrames.extract_string(x, 3, 6) == "3456" -@assert DataFrames.extract_string(x, 3, 3) == "3" -@assert DataFrames.extract_string(x, 3, 6, Set(3)) == "456" -@assert DataFrames.extract_string(x, 3, 6, Set(5, 3)) == "46" -x = "\"Güerín\",\"Sí\",\"No\"" -@assert DataFrames.extract_string(x, 2, 7, Set(3)) == "Gerí" -@assert DataFrames.extract_string("", 0,0,Set()) == "" - -# Handle the empty string properly -test0 = IOString("") -res0 = DataFrames.read_separated_line(test0, ',', '"') -@assert isempty(res0) - -test1 = IOString("I'm A,I'm B,I'm C,-0.3932755625236671,20.157657978753534") -res1 = DataFrames.read_separated_line(test1, ',', '"') -@assert res1[2] == "I'm B" -@assert res1[4] == "-0.3932755625236671" - -test2 = IOString("123, 456 , \"789\",TRUE") -res2 = DataFrames.read_separated_line(test2, ',', '"') -@assert res2[2] == "456" -@assert res2[3] == "789" - -test3 = IOString("123 ,456 , \"789\" , \"TRUE\"") -res3 = DataFrames.read_separated_line(test3, ',', '"') -@assert res3[2] == "456" -@assert res3[4] == "TRUE" - -test4 = IOString("123 ,456 , , \"TRUE\"") -res4 = DataFrames.read_separated_line(test4, ',', '"') -@assert res4[3] == "" -@assert res4[4] == "TRUE" - -test5 = IOString("123 ,456 , \"a\"\"b\" ,\"TRUE\"") -res5 = DataFrames.read_separated_line(test5, ',', '"') -@assert res5[3] == "a\"b" -@assert res5[4] == "TRUE" - -test6 = IOString("123 ,456 , \"a -b\" ,\"TRUE\"") -res6 = DataFrames.read_separated_line(test6, ',', '"') -@assert res6[3] == "a\nb" -@assert res6[4] == "TRUE" - -# Should this be one NA? -# test7 = IOString("") -# res7 = DataFrames.read_separated_line(test7, ',', '"') -# @assert length(res7) == 1 - -test8 = IOString("a,\"b\",\"cd\",1.0,1\na,\"b\",\"cd\",1.0,1") -res8 = DataFrames.read_separated_line(test8, ',', '"') -@assert length(res8) == 5 -@assert res8[5] == "1" - -test9 = IOString("\"Güerín\",\"Sí\",\"No\"") -res9 = DataFrames.read_separated_line(test9, ',', '"') -@assert res9[2] == "Sí" - -test10 = IOString("1,2,3,,") -res10 = DataFrames.read_separated_line(test10, ',', '"') -@assert length(res10) == 5 - -filename = Pkg.dir("DataFrames", "test", "data", "simple_data.csv") -open(filename,"r") do io - t1 = read_table(io, ',', '"', DataFrames.DEFAULT_MISSINGNESS_INDICATORS, false, ["1","2","3","4","5"], 2) - @assert nrow(t1) == 2 - @assert t1[1,2] == "b" +using DataFrames + +filenames = ["test/data/big_data.csv", + "test/data/bool.csv", + "test/data/complex_data.csv", + "test/data/corrupt_utf8.csv", + "test/data/corrupt_utf8_short.csv", + "test/data/messy.csv", + "test/data/movies.csv", + "test/data/sample_data.csv", + "test/data/simple_data.csv", + "test/data/space_after_delimiter.csv", + "test/data/space_around_delimiter.csv", + "test/data/space_before_delimiter.csv", + "test/data/types.csv", + "test/data/utf8.csv"] + +for filename in filenames + df = readtable(filename) end +filename = "test/data/sample_data.tsv" +df = readtable(filename, separator = '\t') - -# Test separated line splitting -# -# TODO: Test minimially-quoted -# TODO: Test only-strings-quoted - -separators = [',', '\t', ' '] -quotation_characters = ['\'', '"'] - -# Test all-entries-quoted for all quote characters and separators -items = {"a", "b", "c,d", "1.0", "1"} -item_buffer = Array(UTF8String, length(items)) - -# TODO: make this work with new splitting code -# for separator in separators -# for quotation_character in quotation_characters -# line = join(map(x -> string(quotation_character, x, quotation_character), -# items), -# separator) -# current_item_buffer = Array(Char, strlen(line)) -# split_results = DataFrames.split_separated_line(line, separator, quotation_character, item_buffer, current_item_buffer) -# @assert all(split_results .== items) -# end -# end - -# Test reading -@assert DataFrames.determine_separator("blah.csv") == ',' -@assert DataFrames.determine_separator("blah.tsv") == '\t' -@assert DataFrames.determine_separator("blah.wsv") == ' ' -# @assert DataFrames.determine_separator("blah.txt") -# Need to change to use @expects to test that error gets raised - -filename = Pkg.dir("DataFrames", "test", "data", "big_data.csv") -separator = DataFrames.determine_separator(filename) -quotation_character = '"' -missingness_indicators = ["", "NA"] -header = true -column_names = UTF8String["A", "B", "C", "D", "E"] -minibatch_size = 10 - -file = open(filename, "r") -readline(file) -minibatch = read_minibatch(file, - separator, - quotation_character, - missingness_indicators, - column_names, - minibatch_size) -@assert nrow(minibatch) == minibatch_size -@assert ncol(minibatch) == length(column_names) -@assert colnames(minibatch) == column_names -@assert eltype(minibatch[:, 1]) == UTF8String -@assert eltype(minibatch[:, 2]) == UTF8String -@assert eltype(minibatch[:, 3]) == UTF8String -@assert eltype(minibatch[:, 4]) == Float64 -@assert eltype(minibatch[:, 5]) == Float64 -close(file) - -@elapsed df = read_table(filename) -@assert nrow(df) == 10_000 -@assert ncol(df) == 5 -@assert colnames(df) == column_names -@assert typeof(df[:, 1]) == DataVector{UTF8String} -@assert typeof(df[:, 2]) == DataVector{UTF8String} -@assert typeof(df[:, 3]) == DataVector{UTF8String} -@assert typeof(df[:, 4]) == DataVector{Float64} -@assert typeof(df[:, 5]) == DataVector{Float64} - -# TODO: Split apart methods that perform seek() from those that don't -text_data = convert(Array{UTF8String, 2}, (["1" "3" "A"; "2" "3" "NA"; "3" "3.1" "C"])) - -true_df = DataFrame(quote - x1 = DataArray([1, 2, 3]) - x2 = DataArray([3, 3, 3.1]) - x3 = DataArray(UTF8String["A", "", "C"], [false, true, false]) - end) -df = DataFrames.convert_to_dataframe(text_data, - ["", "NA"], - ["x1", "x2", "x3"]) -@assert isequal(df, true_df) -@assert isequal(eltype(df["x1"]), Int64) -@assert isequal(eltype(df["x2"]), Float64) -@assert isequal(eltype(df["x3"]), UTF8String) - -filename = Pkg.dir("DataFrames", "test", "data", "big_data.csv") -separator = DataFrames.determine_separator(filename) -quotation_character = '"' -missingness_indicators = ["", "NA"] -header = true - -nrows = DataFrames.determine_nrows(filename, header) -@assert nrows == 10_000 - -io = open(filename, "r") - -column_names = DataFrames.determine_column_names(io, separator, quotation_character, header) -@assert column_names == UTF8String["A", "B", "C", "D", "E"] - -seek(io, 0) -if header - readline(io) -end -text_data = DataFrames.read_separated_text(io, nrows, separator, quotation_character) -@assert eltype(text_data) == UTF8String -@assert size(text_data) == (10_000, 5) - -df = read_table(io, - separator, - quotation_character, - missingness_indicators, - header, - column_names, - nrows) -@assert nrow(df) == 10_000 -@assert ncol(df) == 5 -@assert colnames(df) == column_names -@assert eltype(df[:, 1]) == UTF8String -@assert eltype(df[:, 2]) == UTF8String -@assert eltype(df[:, 3]) == UTF8String -@assert eltype(df[:, 4]) == Float64 -@assert eltype(df[:, 5]) == Float64 - -df = read_table(filename) -@assert nrow(df) == 10_000 -@assert ncol(df) == 5 -@assert colnames(df) == column_names -@assert eltype(df[:, 1]) == UTF8String -@assert eltype(df[:, 2]) == UTF8String -@assert eltype(df[:, 3]) == UTF8String -@assert eltype(df[:, 4]) == Float64 -@assert eltype(df[:, 5]) == Float64 - -# TODO: Add test case in which data file has header, but no rows -# Example "RDatasets/data/Zelig/sna.ex.csv" -# "","Var1","Var2","Var3","Var4","Var5" - -# Additional data sets - -@elapsed df = read_table("test/data/big_data.csv") -# TODO: Make this faster -@elapsed df = read_table("test/data/movies.csv") -# TODO: Release this data set publicly -#@elapsed df = read_table("test/data/bigrams.tsv") -@elapsed df = read_table("test/data/utf8.csv") -@elapsed df = read_table("test/data/bool.csv") -@elapsed df = read_table("test/data/types.csv") -@elapsed df = read_table("test/data/space_after_delimiter.csv") -@elapsed df = read_table("test/data/space_before_delimiter.csv") -@elapsed df = read_table("test/data/space_around_delimiter.csv") -@elapsed df = read_table("test/data/corrupt_utf8.csv") +filename = "test/data/sample_data.wsv" +df = readtable(filename, separator = ' ')