From 16d19c4a49fab96f1dba0b0d3ee22bb394d8e388 Mon Sep 17 00:00:00 2001
From: Cameron Prybol
Date: Mon, 13 Feb 2017 21:42:21 -0800
Subject: [PATCH 1/4] DataFrame -> DataTable

---
 REQUIRE             |  2 +-
 src/CSV.jl          | 12 +++++------
 src/Sink.jl         | 10 ++++-----
 src/Source.jl       | 18 ++++++++--------
 test/datastreams.jl | 14 ++++++------
 test/runtests.jl    |  2 +-
 test/source.jl      | 52 ++++++++++++++++++++++-----------------------
 7 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/REQUIRE b/REQUIRE
index 686013bb..af9c4394 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,5 +1,5 @@
 julia 0.5
 Compat 0.10.0
 DataStreams 0.1.0
-DataFrames
+DataTables
 WeakRefStrings 0.1.3
diff --git a/src/CSV.jl b/src/CSV.jl
index c4d04bb3..784a0d68 100644
--- a/src/CSV.jl
+++ b/src/CSV.jl
@@ -1,9 +1,9 @@
 __precompile__(true)
 module CSV
 
-using Compat, DataStreams, DataFrames, WeakRefStrings
+using Compat, DataStreams, DataTables, WeakRefStrings
 
-export Data, DataFrame
+export Data, DataTable
 
 immutable CSVError <: Exception
     msg::String
@@ -87,11 +87,11 @@ keyword arguments, see the docs for [`CSV.read`](@ref) or type `?CSV.read` at th
 
 An example of re-using a `CSV.Source` is:
 ```julia
-# manually construct a `CSV.Source` once, then stream its data to both a DataFrame
+# manually construct a `CSV.Source` once, then stream its data to both a DataTable
 # and SQLite table `sqlite_table` in the SQLite database `db`
 # note the use of `CSV.reset!` to ensure the `source` can be streamed from again
 source = CSV.Source(file)
-df1 = CSV.read(source, DataFrame)
+df1 = CSV.read(source, DataTable)
 CSV.reset!(source)
 sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
 ```
@@ -123,11 +123,11 @@ keyword arguments, see the docs for [`CSV.write`](@ref) or type `?CSV.write` at
 
 An example of re-using a `CSV.Sink` is:
 ```julia
-# manually construct a `CSV.Source` once, then stream its data to both a DataFrame
+# manually construct a `CSV.Source` once, then stream its data to both a DataTable
 # and SQLite table `sqlite_table` in the SQLite database `db`
 # note the use of `CSV.reset!` to ensure the `source` can be streamed from again
 source = CSV.Source(file)
-df1 = CSV.read(source, DataFrame)
+df1 = CSV.read(source, DataTable)
 CSV.reset!(source)
 sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
 ```
diff --git a/src/Sink.jl b/src/Sink.jl
index abd1f89d..7c1b9675 100644
--- a/src/Sink.jl
+++ b/src/Sink.jl
@@ -102,24 +102,24 @@ Keyword Arguments:
 
 A few example invocations include:
 ```julia
-# write out a DataFrame `df` to a file named "out.csv" with all defaults, including comma as delimiter
+# write out a DataTable `df` to a file named "out.csv" with all defaults, including comma as delimiter
 CSV.write("out.csv", df)
 
-# write out a DataFrame, this time as a tab-delimited file
+# write out a DataTable, this time as a tab-delimited file
 CSV.write("out.csv", df; delim='\t')
 
-# write out a DataFrame, with null values represented by the string "NA"
+# write out a DataTable, with null values represented by the string "NA"
 CSV.write("out.csv", df; null="NA")
 
 # write out a "header-less" file, with actual data starting on row 1
 CSV.write("out.csv", df; header=false)
 
-# write out a DataFrame `df` twice to a file; the resulting file will have twice the # of rows as the DataFrame
+# write out a DataTable `df` twice to a file; the resulting file will have twice the # of rows as the DataTable
 # note the usage of the keyword argument `append=true` in the 2nd call
 CSV.write("out.csv", df)
 CSV.write("out.csv", df; append=true)
 
-# write a DataFrame out to an 
IOBuffer instead of a file
+# write a DataTable out to an IOBuffer instead of a file
 
 io = IOBuffer()
 CSV.write(io, df)
 
diff --git a/src/Source.jl b/src/Source.jl
index ccbe72f1..3738aa45 100644
--- a/src/Source.jl
+++ b/src/Source.jl
@@ -194,17 +194,17 @@ Data.streamfrom(source::CSV.Source, ::Type{Data.Field}, ::Type{Nullable{WeakRefS
 Data.reference(source::CSV.Source) = source.io.data
 
 """
-`CSV.read(fullpath::Union{AbstractString,IO}, sink::Type{T}=DataFrame, args...; kwargs...)` => `typeof(sink)`
+`CSV.read(fullpath::Union{AbstractString,IO}, sink::Type{T}=DataTable, args...; kwargs...)` => `typeof(sink)`
 
 `CSV.read(fullpath::Union{AbstractString,IO}, sink::Data.Sink; kwargs...)` => `Data.Sink`
 
-parses a delimited file into a Julia structure (a DataFrame by default, but any valid `Data.Sink` may be requested).
+parses a delimited file into a Julia structure (a DataTable by default, but any valid `Data.Sink` may be requested).
 
 Positional arguments:
 
 * `fullpath`; can be a file name (string) or other `IO` instance
-* `sink::Type{T}`; `DataFrame` by default, but may also be other `Data.Sink` types that support streaming via `Data.Field` interface; note that the method argument can be the *type* of `Data.Sink`, plus any required arguments the sink may need (`args...`).
+* `sink::Type{T}`; `DataTable` by default, but may also be other `Data.Sink` types that support streaming via `Data.Field` interface; note that the method argument can be the *type* of `Data.Sink`, plus any required arguments the sink may need (`args...`).
 Alternatively, an already constructed `sink` may be passed (2nd method above).
 
 Keyword Arguments:
@@ -233,7 +233,7 @@ Oftentimes, however, it can be convenient to work with `WeakRefStrings` dependin
 Example usage:
 ```
 julia> dt = CSV.read("bids.csv")
-7656334×9 DataFrames.DataFrame
+7656334×9 DataTables.DataTable
 │ Row │ bid_id │ bidder_id                               │ auction │ merchandise      │ device      │
 ├─────────┼─────────┼─────────────────────────────────────────┼─────────┼──────────────────┼─────────────┤
 │ 1   │ 0      │ "8dac2b259fd1c6d1120e519fb1ac14fbqvax8" │ "ewmzr" │ "jewelry"        │ "phone0"    │
@@ -269,17 +269,17 @@ CSV.read(file; types=Dict("col3"=>Float64, "col6"=>String))
 
 # this is also a way to limit the # of rows to be read in a file if only a sample is needed
 CSV.read(file; rows=10000)
 
-# for data files, `file` and `file2`, with the same structure, read both into a single DataFrame
+# for data files, `file` and `file2`, with the same structure, read both into a single DataTable
 # note that `df` is used as a 2nd argument in the 2nd call to `CSV.read` and the keyword argument
 # `append=true` is passed
 df = CSV.read(file)
 df = CSV.read(file2, df; append=true)
 
-# manually construct a `CSV.Source` once, then stream its data to both a DataFrame
+# manually construct a `CSV.Source` once, then stream its data to both a DataTable
 # and SQLite table `sqlite_table` in the SQLite database `db`
 # note the use of `CSV.reset!` to ensure the `source` can be streamed from again
 source = CSV.Source(file)
-df1 = CSV.read(source, DataFrame)
+df1 = CSV.read(source, DataTable)
 CSV.reset!(source)
 db = SQLite.DB()
 sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
 ```
 """
 function read end
 
-function read(fullpath::Union{AbstractString,IO}, sink=DataFrame, args...; append::Bool=false, transforms::Dict=Dict{Int,Function}(), kwargs...) 
+function read(fullpath::Union{AbstractString,IO}, sink=DataTable, args...; append::Bool=false, transforms::Dict=Dict{Int,Function}(), kwargs...) source = Source(fullpath; kwargs...) sink = Data.stream!(source, sink, append, transforms, args...) Data.close!(sink) @@ -301,5 +301,5 @@ function read{T}(fullpath::Union{AbstractString,IO}, sink::T; append::Bool=false return sink end -read(source::CSV.Source, sink=DataFrame, args...; append::Bool=false, transforms::Dict=Dict{Int,Function}()) = (sink = Data.stream!(source, sink, append, transforms, args...); Data.close!(sink); return sink) +read(source::CSV.Source, sink=DataTable, args...; append::Bool=false, transforms::Dict=Dict{Int,Function}()) = (sink = Data.stream!(source, sink, append, transforms, args...); Data.close!(sink); return sink) read{T}(source::CSV.Source, sink::T; append::Bool=false, transforms::Dict=Dict{Int,Function}()) = (sink = Data.stream!(source, sink, append, transforms); Data.close!(sink); return sink) diff --git a/test/datastreams.jl b/test/datastreams.jl index a27727c5..5fa73c3a 100644 --- a/test/datastreams.jl +++ b/test/datastreams.jl @@ -1,17 +1,17 @@ -# DataFrames +# DataTables FILE = joinpath(DSTESTDIR, "randoms_small.csv") DF = CSV.read(FILE) DF2 = CSV.read(FILE) -dfsource = Tester("DataFrame", x->x, false, DataFrame, (:DF,), scalartransforms, vectortransforms, x->x, x->nothing) -dfsink = Tester("DataFrame", x->x, false, DataFrame, (:DF2,), scalartransforms, vectortransforms, x->x, x->nothing) -function DataFrames.DataFrame(sym::Symbol; append::Bool=false) +dfsource = Tester("DataTable", x->x, false, DataTable, (:DF,), scalartransforms, vectortransforms, x->x, x->nothing) +dfsink = Tester("DataTable", x->x, false, DataTable, (:DF2,), scalartransforms, vectortransforms, x->x, x->nothing) +function DataTables.DataTable(sym::Symbol; append::Bool=false) return @eval $sym end -function DataFrames.DataFrame(sch::Data.Schema, ::Type{Data.Field}, append::Bool, ref::Vector{UInt8}, sym::Symbol) - return DataFrame(DataFrame(sym), sch, Data.Field, append, ref) +function DataTables.DataTable(sch::Data.Schema, ::Type{Data.Field}, append::Bool, ref::Vector{UInt8}, sym::Symbol) + return DataTable(DataTable(sym), sch, Data.Field, append, ref) end -function DataFrame(sink, sch::Data.Schema, ::Type{Data.Field}, append::Bool, ref::Vector{UInt8}) +function DataTable(sink, sch::Data.Schema, ::Type{Data.Field}, append::Bool, ref::Vector{UInt8}) rows, cols = size(sch) newsize = max(0, rows + (append ? 
size(sink, 1) : 0)) # need to make sure we don't break a NullableVector{WeakRefString{UInt8}} when appending diff --git a/test/runtests.jl b/test/runtests.jl index 83253c8b..27f26e9c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,5 @@ using CSV -using Base.Test, DataStreams, DataFrames, NullableArrays, WeakRefStrings, Libz, DecFP +using Base.Test, DataStreams, DataTables, NullableArrays, WeakRefStrings, Libz, DecFP include("parsefields.jl") include("io.jl") diff --git a/test/source.jl b/test/source.jl index cb757a37..c2d48cc0 100644 --- a/test/source.jl +++ b/test/source.jl @@ -14,7 +14,7 @@ f = CSV.Source(joinpath(dir, "test_utf8.csv")) @test size(f, 1) == 3 @test Data.header(f) == ["col1","col2","col3"] @test Data.types(f) == [Nullable{Float64},Nullable{Float64},Nullable{Float64}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == 1.0 @test ds[2,1].value == 4.0 @test ds[3,1].value == 7.0 @@ -31,7 +31,7 @@ so = CSV.Source(si) @test Data.header(so) == ["col1","col2","col3"] @test Data.types(so) == [Nullable{Float64},Nullable{Float64},Nullable{Float64}] # @test so.iopos == 21 -ds = Data.stream!(so, DataFrame) +ds = Data.stream!(so, DataTable) @test ds[1,1].value == 1.0 @test ds[2,1].value == 4.0 @test ds[3,1].value == 7.0 @@ -56,7 +56,7 @@ f = CSV.Source(joinpath(dir, "test_single_column.csv")) @test size(f, 1) == 3 @test Data.header(f) == ["col1"] @test Data.types(f) == [Nullable{Int}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == 1 @test ds[2,1].value == 2 @test ds[3,1].value == 3 @@ -76,7 +76,7 @@ f = CSV.Source(joinpath(dir, "test_empty_file_newlines.csv")) f = CSV.Source(joinpath(dir, "test_simple_quoted.csv")) @test size(f, 2) == 2 @test size(f, 1) == 1 -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test string(ds[1,1].value) == "quoted field 1" @test string(ds[1,2].value) == "quoted field 2" f = CSV.Source(joinpath(dir, "test_quoted_delim_and_newline.csv")) @@ -88,7 +88,7 @@ f = CSV.Source(joinpath(dir, "test_crlf_line_endings.csv")) @test Data.header(f) == ["col1","col2","col3"] @test size(f, 2) == 3 @test Data.types(f) == [Nullable{Int},Nullable{Int},Nullable{Int}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == 1 f = CSV.Source(joinpath(dir, "test_newline_line_endings.csv")) @test Data.header(f) == ["col1","col2","col3"] @@ -116,7 +116,7 @@ f = CSV.Source(joinpath(dir, "test_dates.csv"); types=[Date], dateformat="yyyy-m @test size(f, 2) == 1 @test size(f, 1) == 3 @test Data.types(f) == [Nullable{Date}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == Date(2015,1,1) @test ds[2,1].value == Date(2015,1,2) @test ds[3,1].value == Date(2015,1,3) @@ -132,21 +132,21 @@ f = CSV.Source(joinpath(dir, "test_datetimes.csv"); dateformat="yyyy-mm-dd HH:MM @test size(f, 2) == 1 @test size(f, 1) == 3 @test Data.types(f) == [Nullable{DateTime}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == DateTime(2015,1,1) @test ds[2,1].value == DateTime(2015,1,2,0,0,1) @test ds[3,1].value == DateTime(2015,1,3,0,12,0,1) #test bad types f = CSV.Source(joinpath(dir, "test_float_in_int_column.csv"); types=[Int,Int,Int]) -@test_throws CSV.CSVError Data.stream!(f, DataFrame) +@test_throws CSV.CSVError Data.stream!(f, DataTable) #test null/missing values f = CSV.Source(joinpath(dir, "test_missing_value_NULL.csv")) @test size(f, 2) == 3 @test size(f, 1) == 3 @test Data.types(f) == 
[Nullable{Float64},Nullable{WeakRefString{UInt8}},Nullable{Float64}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == 1.0 @test string(ds[1,2].value) == "2.0" @test string(ds[2,2].value) == "NULL" @@ -155,7 +155,7 @@ f = CSV.Source(joinpath(dir, "test_missing_value_NULL.csv"); null="NULL") @test size(f, 1) == 3 @test f.options.null == "NULL" @test Data.types(f) == [Nullable{Float64},Nullable{Float64},Nullable{Float64}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @test ds[1,1].value == 1.0 @test string(ds[1,2].value) == "2.0" @test isnull(ds[2,2]) @@ -172,109 +172,109 @@ f = CSV.Source(joinpath(dir, "baseball.csv")) @test size(f, 1) == 35 @test Data.header(f) == ["Rk","Year","Age","Tm","Lg","","W","L","W-L%","G","Finish","Wpost","Lpost","W-L%post",""] @test Data.types(f) == [Nullable{Int},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Int},Nullable{Int},Nullable{Float64},Nullable{WeakRefString{UInt8}}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "FL_insurance_sample.csv");types=Dict(10=>Float64,12=>Float64)) @test size(f, 2) == 18 @test size(f, 1) == 36634 @test Data.header(f) == ["policyID","statecode","county","eq_site_limit","hu_site_limit","fl_site_limit","fr_site_limit","tiv_2011","tiv_2012","eq_site_deductible","hu_site_deductible","fl_site_deductible","fr_site_deductible","point_latitude","point_longitude","line","construction","point_granularity"] @test Data.types(f) == [Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Float64},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "FL_insurance_sample.csv");types=Dict{String,DataType}("eq_site_deductible"=>Float64,"fl_site_deductible"=>Float64)) @test size(f, 2) == 18 @test size(f, 1) == 36634 @test Data.header(f) == ["policyID","statecode","county","eq_site_limit","hu_site_limit","fl_site_limit","fr_site_limit","tiv_2011","tiv_2012","eq_site_deductible","hu_site_deductible","fl_site_deductible","fr_site_deductible","point_latitude","point_longitude","line","construction","point_granularity"] @test Data.types(f) == [Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Float64},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "SacramentocrimeJanuary2006.csv")) @test size(f, 2) == 9 @test size(f, 1) == 7584 @test Data.header(f) == ["cdatetime","address","district","beat","grid","crimedescr","ucr_ncic_code","latitude","longitude"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Float64},Nullable{Float64}] -ds = Data.stream!(f, 
DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "Sacramentorealestatetransactions.csv")) @test size(f, 2) == 12 @test size(f, 1) == 985 @test Data.header(f) == ["street","city","zip","state","beds","baths","sq__ft","type","sale_date","price","latitude","longitude"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Float64},Nullable{Float64}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "SalesJan2009.csv")) @test size(f, 2) == 12 @test size(f, 1) == 998 @test Data.header(f) == ["Transaction_date","Product","Price","Payment_Type","Name","City","State","Country","Account_Created","Last_Login","Latitude","Longitude"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "stocks.csv")) @test size(f, 2) == 2 @test size(f, 1) == 30 @test Data.header(f) == ["Stock Name","Company Name"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "TechCrunchcontinentalUSA.csv")) @test size(f, 2) == 10 @test size(f, 1) == 1460 @test Data.header(f) == ["permalink","company","numEmps","category","city","state","fundedDate","raisedAmt","raisedCurrency","round"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "Fielding.csv")) @test size(f, 2) == 18 @test size(f, 1) == 167938 @test Data.header(f) == ["playerID","yearID","stint","teamID","lgID","POS","G","GS","InnOuts","PO","A","E","DP","PB","WP","SB","CS","ZR"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "latest (1).csv"); header=0, null="\\N") @test size(f, 2) == 25 @test size(f, 1) == 1000 @test Data.header(f) == ["Column$i" for i = 1:size(f, 2)] @test Data.types(f) == 
[Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Date},Nullable{Date},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Float64},Nullable{Float64}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "pandas_zeros.csv")) @test size(f, 2) == 50 @test size(f, 1) == 100000 @test Data.header(f) == [string(i) for i = 0:49] @test Data.types(f) == repmat([Nullable{Int}],50) -@time ds = Data.stream!(f, DataFrame) +@time ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "test_header_range.csv");header=1:3) @test size(f, 2) == 3 @test size(f, 1) == 3 @test Data.header(f) == ["col1_sub1_part1","col2_sub2_part2","col3_sub3_part3"] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "test_header_range.csv");header=["col1_sub1_part1","col2_sub2_part2","col3_sub3_part3"],datarow=4) @test size(f, 2) == 3 @test size(f, 1) == 3 @test Data.header(f) == ["col1_sub1_part1","col2_sub2_part2","col3_sub3_part3"] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "test_basic.csv");types=Dict(2=>Float64)) @test size(f, 2) == 3 @test size(f, 1) == 3 @test Data.types(f) == [Nullable{Int},Nullable{Float64},Nullable{Int}] -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "test_basic_pipe.csv");delim='|') @test size(f, 2) == 3 @test size(f, 1) == 3 @test Data.types(f) == [Nullable{Int},Nullable{Int},Nullable{Int}] @test f.options.delim == UInt8('|') -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) f = CSV.Source(joinpath(dir, "test_basic_pipe.csv");delim='|',footerskip=1) @test size(f, 2) == 3 @test size(f, 1) == 2 @test Data.types(f) == [Nullable{Int},Nullable{Int},Nullable{Int}] @test f.options.delim == UInt8('|') -ds = Data.stream!(f, DataFrame) +ds = Data.stream!(f, DataTable) @show f t = tempname() From d826f3bc216b0092cf17d4a777592a94773cae78 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Wed, 15 Feb 2017 10:57:30 -0800 Subject: [PATCH 2/4] df -> dt --- benchmark/benchmarks.jl | 4 ++-- src/CSV.jl | 4 ++-- src/Sink.jl | 18 +++++++++--------- src/Source.jl | 8 ++++---- src/io.jl | 4 ++-- test/datastreams.jl | 10 +++++----- test/source.jl | 9 ++++----- 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 1f415d27..3d72cfe0 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -50,9 +50,9 @@ TYPES = !is_windows() ? 
(Int, Float64, WeakRefString{UInt8}, String, Date, DateT
 end
 
 @benchgroup "CSV.write" begin
-    df = CSV.read(FILE)
+    dt = CSV.read(FILE)
     t = tempname()
-    @bench "CSV.write" CSV.write(t, df)
+    @bench "CSV.write" CSV.write(t, dt)
 end
 
 end
diff --git a/src/CSV.jl b/src/CSV.jl
index 784a0d68..a399dcdc 100644
--- a/src/CSV.jl
+++ b/src/CSV.jl
@@ -91,7 +91,7 @@ An example of re-using a `CSV.Source` is:
 # and SQLite table `sqlite_table` in the SQLite database `db`
 # note the use of `CSV.reset!` to ensure the `source` can be streamed from again
 source = CSV.Source(file)
-df1 = CSV.read(source, DataTable)
+dt1 = CSV.read(source, DataTable)
 CSV.reset!(source)
 sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
 ```
@@ -127,7 +127,7 @@ An example of re-using a `CSV.Sink` is:
 # and SQLite table `sqlite_table` in the SQLite database `db`
 # note the use of `CSV.reset!` to ensure the `source` can be streamed from again
 source = CSV.Source(file)
-df1 = CSV.read(source, DataTable)
+dt1 = CSV.read(source, DataTable)
 CSV.reset!(source)
 sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
 ```
diff --git a/src/Sink.jl b/src/Sink.jl
index 7c1b9675..ffc7b5dd 100644
--- a/src/Sink.jl
+++ b/src/Sink.jl
@@ -102,26 +102,26 @@ Keyword Arguments:
 
 A few example invocations include:
 ```julia
-# write out a DataTable `df` to a file named "out.csv" with all defaults, including comma as delimiter
-CSV.write("out.csv", df)
+# write out a DataTable `dt` to a file named "out.csv" with all defaults, including comma as delimiter
+CSV.write("out.csv", dt)
 
 # write out a DataTable, this time as a tab-delimited file
-CSV.write("out.csv", df; delim='\t')
+CSV.write("out.csv", dt; delim='\t')
 
 # write out a DataTable, with null values represented by the string "NA"
-CSV.write("out.csv", df; null="NA")
+CSV.write("out.csv", dt; null="NA")
 
 # write out a "header-less" file, with actual data starting on row 1
-CSV.write("out.csv", df; header=false)
+CSV.write("out.csv", dt; header=false)
 
-# write out a DataTable `df` twice to a file; the resulting file will have twice the # of rows as the DataTable
+# write out a DataTable `dt` twice to a file; the resulting file will have twice the # of rows as the DataTable
 # note the usage of the keyword argument `append=true` in the 2nd call
-CSV.write("out.csv", df)
-CSV.write("out.csv", df; append=true)
+CSV.write("out.csv", dt)
+CSV.write("out.csv", dt; append=true)
 
 # write a DataTable out to an IOBuffer instead of a file
 io = IOBuffer()
-CSV.write(io, df)
+CSV.write(io, dt)
 
 # write the result of an SQLite query out to a comma-delimited file
 db = SQLite.DB()
diff --git a/src/Source.jl b/src/Source.jl
index 3738aa45..c7519fd0 100644
--- a/src/Source.jl
+++ b/src/Source.jl
@@ -270,16 +270,16 @@ CSV.read(file; types=Dict("col3"=>Float64, "col6"=>String))
 CSV.read(file; rows=10000)
 
 # for data files, `file` and `file2`, with the same structure, read both into a single DataTable
-# note that `df` is used as a 2nd argument in the 2nd call to `CSV.read` and the keyword argument
+# note that `dt` is used as a 2nd argument in the 2nd call to `CSV.read` and the keyword argument
 # `append=true` is passed
-df = CSV.read(file)
-df = CSV.read(file2, df; append=true)
+dt = CSV.read(file)
+dt = CSV.read(file2, dt; append=true)
 
 # manually construct a `CSV.Source` once, then stream its data to both a DataTable
 # and SQLite table `sqlite_table` in the SQLite database `db`
 # note the use of `CSV.reset!` to ensure the `source` can be streamed from again
 source = CSV.Source(file)
-df1 = CSV.read(source, DataTable)
+dt1 = 
CSV.read(source, DataTable) CSV.reset!(source) db = SQLite.DB() sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table") diff --git a/src/io.jl b/src/io.jl index c3e68857..1543d909 100644 --- a/src/io.jl +++ b/src/io.jl @@ -125,8 +125,8 @@ end immutable NullField end # try to infer the type of the value in `val`. The precedence of type checking is `Int` => `Float64` => `Date` => `DateTime` => `String` -slottype{T}(df::Dates.Slot{T}) = T -timetype(df::Dates.DateFormat) = any(slottype(T) in (Dates.Hour,Dates.Minute,Dates.Second,Dates.Millisecond) for T in df.slots) ? DateTime : Date +slottype{T}(dt::Dates.Slot{T}) = T +timetype(dt::Dates.DateFormat) = any(slottype(T) in (Dates.Hour,Dates.Minute,Dates.Second,Dates.Millisecond) for T in dt.slots) ? DateTime : Date function detecttype(val::AbstractString, format, datecheck, null) (val == "" || val == null) && return NullField diff --git a/test/datastreams.jl b/test/datastreams.jl index 5fa73c3a..5df91946 100644 --- a/test/datastreams.jl +++ b/test/datastreams.jl @@ -1,10 +1,10 @@ # DataTables FILE = joinpath(DSTESTDIR, "randoms_small.csv") -DF = CSV.read(FILE) -DF2 = CSV.read(FILE) -dfsource = Tester("DataTable", x->x, false, DataTable, (:DF,), scalartransforms, vectortransforms, x->x, x->nothing) -dfsink = Tester("DataTable", x->x, false, DataTable, (:DF2,), scalartransforms, vectortransforms, x->x, x->nothing) +DT = CSV.read(FILE) +DT2 = CSV.read(FILE) +dtsource = Tester("DataTable", x->x, false, DataTable, (:DT,), scalartransforms, vectortransforms, x->x, x->nothing) +dtsink = Tester("DataTable", x->x, false, DataTable, (:DT2,), scalartransforms, vectortransforms, x->x, x->nothing) function DataTables.DataTable(sym::Symbol; append::Bool=false) return @eval $sym end @@ -46,4 +46,4 @@ FILE2 = joinpath(DSTESTDIR, "randoms2_small.csv") csvsource = Tester("CSV.Source", CSV.read, true, CSV.Source, (FILE,), scalartransforms, vectortransforms, x->x, x->nothing) csvsink = Tester("CSV.Sink", CSV.write, true, CSV.Sink, (FILE2,), scalartransforms, vectortransforms, x->CSV.read(FILE2; use_mmap=false), x->rm(FILE2)) -DataStreamsIntegrationTests.teststream([dfsource, csvsource], [dfsink, csvsink]; rows=99) +DataStreamsIntegrationTests.teststream([dtsource, csvsource], [dtsink, csvsink]; rows=99) diff --git a/test/source.jl b/test/source.jl index c2d48cc0..3db69fea 100644 --- a/test/source.jl +++ b/test/source.jl @@ -292,12 +292,12 @@ f = open(t, "w") Base.write(f, readstring(joinpath(dir, "test_missing_value_NULL.csv"))) seekstart(f) source = CSV.Source(f; header=[], datarow=2, nullable=false) -df = CSV.read(source) -@test Data.header(df) == ["Column1", "Column2", "Column3"] +dt = CSV.read(source) +@test Data.header(dt) == ["Column1", "Column2", "Column3"] CSV.reset!(source) -df2 = CSV.read(source) -@test isequal(df, df2) +dt2 = CSV.read(source) +@test isequal(dt, dt2) @test_throws ArgumentError CSV.Source(f; types = [Int, Int, Int, Int]) close(f) @@ -321,4 +321,3 @@ let fn = tempname() gc(); gc() rm(fn) end - From 7a1fba2ac1f92ccf26f95e0d7f7190a2297116c2 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Thu, 16 Mar 2017 20:19:54 -0700 Subject: [PATCH 3/4] updates --- src/Source.jl | 4 ++-- test/source.jl | 25 ++++++++++++------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/Source.jl b/src/Source.jl index ccbe72f1..53810646 100644 --- a/src/Source.jl +++ b/src/Source.jl @@ -10,7 +10,7 @@ function Source(fullpath::Union{AbstractString,IO}; datarow::Int=-1, # by default, data starts immediately after header or start of file 
types::Union{Dict{Int,DataType},Dict{String,DataType},Vector{DataType}}=DataType[], nullable::Bool=true, - weakrefstrings::Bool=true, + weakrefstrings::Bool=false, dateformat::Union{AbstractString,Dates.DateFormat}=Dates.ISODateFormat, footerskip::Int=0, @@ -38,7 +38,7 @@ function Source(;fullpath::Union{AbstractString,IO}="", datarow::Int=-1, # by default, data starts immediately after header or start of file types::Union{Dict{Int,DataType},Dict{String,DataType},Vector{DataType}}=DataType[], nullable::Bool=true, - weakrefstrings::Bool=true, + weakrefstrings::Bool=false, footerskip::Int=0, rows_for_type_detect::Int=100, diff --git a/test/source.jl b/test/source.jl index cb757a37..9a487874 100644 --- a/test/source.jl +++ b/test/source.jl @@ -66,7 +66,7 @@ f = CSV.Source(joinpath(dir, "test_empty_file.csv")) @test size(f, 1) == 0 #test file with just newlines -f = CSV.Source(joinpath(dir, "test_empty_file_newlines.csv")) +f = CSV.Source(joinpath(dir, "test_empty_file_newlines.csv"), weakrefstrings=true) @test size(f, 2) == 1 @test size(f, 1) == 9 @test Data.header(f) == [""] @@ -142,7 +142,7 @@ f = CSV.Source(joinpath(dir, "test_float_in_int_column.csv"); types=[Int,Int,Int @test_throws CSV.CSVError Data.stream!(f, DataFrame) #test null/missing values -f = CSV.Source(joinpath(dir, "test_missing_value_NULL.csv")) +f = CSV.Source(joinpath(dir, "test_missing_value_NULL.csv"), weakrefstrings=true) @test size(f, 2) == 3 @test size(f, 1) == 3 @test Data.types(f) == [Nullable{Float64},Nullable{WeakRefString{UInt8}},Nullable{Float64}] @@ -167,70 +167,70 @@ f = CSV.Source(joinpath(dir, "test_missing_value.csv")) @test Data.types(f) == [Nullable{Float64},Nullable{Float64},Nullable{Float64}] #other various files found around the internet -f = CSV.Source(joinpath(dir, "baseball.csv")) +f = CSV.Source(joinpath(dir, "baseball.csv"), weakrefstrings=true) @test size(f, 2) == 15 @test size(f, 1) == 35 @test Data.header(f) == ["Rk","Year","Age","Tm","Lg","","W","L","W-L%","G","Finish","Wpost","Lpost","W-L%post",""] @test Data.types(f) == [Nullable{Int},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Int},Nullable{Int},Nullable{Float64},Nullable{WeakRefString{UInt8}}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "FL_insurance_sample.csv");types=Dict(10=>Float64,12=>Float64)) +f = CSV.Source(joinpath(dir, "FL_insurance_sample.csv");types=Dict(10=>Float64,12=>Float64), weakrefstrings=true) @test size(f, 2) == 18 @test size(f, 1) == 36634 @test Data.header(f) == ["policyID","statecode","county","eq_site_limit","hu_site_limit","fl_site_limit","fr_site_limit","tiv_2011","tiv_2012","eq_site_deductible","hu_site_deductible","fl_site_deductible","fr_site_deductible","point_latitude","point_longitude","line","construction","point_granularity"] @test Data.types(f) == [Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Float64},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "FL_insurance_sample.csv");types=Dict{String,DataType}("eq_site_deductible"=>Float64,"fl_site_deductible"=>Float64)) +f = CSV.Source(joinpath(dir, 
"FL_insurance_sample.csv");types=Dict{String,DataType}("eq_site_deductible"=>Float64,"fl_site_deductible"=>Float64), weakrefstrings=true) @test size(f, 2) == 18 @test size(f, 1) == 36634 @test Data.header(f) == ["policyID","statecode","county","eq_site_limit","hu_site_limit","fl_site_limit","fr_site_limit","tiv_2011","tiv_2012","eq_site_deductible","hu_site_deductible","fl_site_deductible","fr_site_deductible","point_latitude","point_longitude","line","construction","point_granularity"] @test Data.types(f) == [Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Float64},Nullable{Int},Nullable{Float64},Nullable{Float64},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "SacramentocrimeJanuary2006.csv")) +f = CSV.Source(joinpath(dir, "SacramentocrimeJanuary2006.csv"), weakrefstrings=true) @test size(f, 2) == 9 @test size(f, 1) == 7584 @test Data.header(f) == ["cdatetime","address","district","beat","grid","crimedescr","ucr_ncic_code","latitude","longitude"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Float64},Nullable{Float64}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "Sacramentorealestatetransactions.csv")) +f = CSV.Source(joinpath(dir, "Sacramentorealestatetransactions.csv"), weakrefstrings=true) @test size(f, 2) == 12 @test size(f, 1) == 985 @test Data.header(f) == ["street","city","zip","state","beds","baths","sq__ft","type","sale_date","price","latitude","longitude"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Float64},Nullable{Float64}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "SalesJan2009.csv")) +f = CSV.Source(joinpath(dir, "SalesJan2009.csv"), weakrefstrings=true) @test size(f, 2) == 12 @test size(f, 1) == 998 @test Data.header(f) == ["Transaction_date","Product","Price","Payment_Type","Name","City","State","Country","Account_Created","Last_Login","Latitude","Longitude"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Float64},Nullable{Float64}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "stocks.csv")) +f = CSV.Source(joinpath(dir, "stocks.csv"), weakrefstrings=true) @test size(f, 2) == 2 @test size(f, 1) == 30 @test Data.header(f) == ["Stock Name","Company Name"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "TechCrunchcontinentalUSA.csv")) +f = CSV.Source(joinpath(dir, "TechCrunchcontinentalUSA.csv"), weakrefstrings=true) @test size(f, 2) == 10 @test size(f, 1) == 1460 @test Data.header(f) == ["permalink","company","numEmps","category","city","state","fundedDate","raisedAmt","raisedCurrency","round"] @test Data.types(f) == 
[Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "Fielding.csv")) +f = CSV.Source(joinpath(dir, "Fielding.csv"), weakrefstrings=true) @test size(f, 2) == 18 @test size(f, 1) == 167938 @test Data.header(f) == ["playerID","yearID","stint","teamID","lgID","POS","G","GS","InnOuts","PO","A","E","DP","PB","WP","SB","CS","ZR"] @test Data.types(f) == [Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{Int},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}},Nullable{WeakRefString{UInt8}}] ds = Data.stream!(f, DataFrame) -f = CSV.Source(joinpath(dir, "latest (1).csv"); header=0, null="\\N") +f = CSV.Source(joinpath(dir, "latest (1).csv"); header=0, null="\\N", weakrefstrings=true) @test size(f, 2) == 25 @test size(f, 1) == 1000 @test Data.header(f) == ["Column$i" for i = 1:size(f, 2)] @@ -321,4 +321,3 @@ let fn = tempname() gc(); gc() rm(fn) end - From 4411334d5ed79d58dee4129ef22e4d477376db81 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Fri, 17 Mar 2017 13:27:33 -0700 Subject: [PATCH 4/4] modify docstring with new default --- src/Source.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Source.jl b/src/Source.jl index 4d840984..1fe5dd73 100644 --- a/src/Source.jl +++ b/src/Source.jl @@ -217,7 +217,7 @@ Keyword Arguments: * `datarow::Int`; specifies the row on which the actual data starts in the file; by default, the data is expected on the next row after the header row(s); for a file without column names (header), specify `datarow=1` * `types`; column types can be provided manually as a complete Vector{DataType}, or in a Dict to reference individual columns by name or number * `nullable::Bool`; indicates whether values can be nullable or not; `true` by default. If set to `false` and missing values are encountered, a `NullException` will be thrown -* `weakrefstrings::Bool=true`: indicates whether string-type columns should use the `WeakRefString` (for efficiency) or a regular `String` type +* `weakrefstrings::Bool=false`: indicates whether string-type columns should use the `WeakRefString` (for efficiency) or a regular `String` type * `dateformat::Union{AbstractString,Dates.DateFormat}`; how all dates/datetimes in the dataset are formatted * `footerskip::Int`; indicates the number of rows to skip at the end of the file * `rows_for_type_detect::Int=100`; indicates how many rows should be read to infer the types of columns
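
A quick usage sketch of the API after this series, for reviewers (not part of the patches themselves). It assumes CSV.jl at this revision with DataTables.jl installed; the file names `data.csv` and `data2.csv` are hypothetical:

```julia
using CSV, DataTables

# the default sink is now DataTable, and with weakrefstrings=false by default
# string columns come back as Nullable{String} rather than Nullable{WeakRefString{UInt8}}
dt = CSV.read("data.csv")

# opt back into WeakRefString columns when read speed matters and the
# source buffer is guaranteed to outlive the resulting table
dt_fast = CSV.read("data.csv"; weakrefstrings=true)

# append a second file with the same layout onto the existing table
dt = CSV.read("data2.csv", dt; append=true)
```

The likely rationale for flipping the default to `weakrefstrings=false`: a `WeakRefString` points into the source's byte buffer, so materializing plain `String`s by default trades some read performance for values that stay valid after the source goes away.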