Move to DataStreams 0.2.0 #95

Merged
merged 14 commits on Sep 7, 2017
3 changes: 0 additions & 3 deletions .travis.yml
@@ -7,7 +7,6 @@ os:
- osx

julia:
- 0.5
- 0.6
- nightly

@@ -18,5 +17,3 @@ after_success:
- julia -e 'cd(Pkg.dir("CSV")); Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())'
- julia -e 'Pkg.add("Documenter")'
- julia -e 'cd(Pkg.dir("CSV")); include(joinpath("docs", "make.jl"))'
- julia -e 'Pkg.add("PkgBenchmark")'
- julia -e 'using PkgBenchmark; benchmarkpkg("CSV"; promptsave=false, promptoverwrite=false)'
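With the CI benchmark step removed above, the same run can still be done locally; this is simply the call from the deleted lines, invoked by hand (PkgBenchmark must be installed):

julia> Pkg.add("PkgBenchmark")
julia> using PkgBenchmark
julia> benchmarkpkg("CSV"; promptsave=false, promptoverwrite=false)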
8 changes: 2 additions & 6 deletions README.md
@@ -5,7 +5,7 @@

| **Documentation** | **PackageEvaluator** | **Build Status** |
|:-------------------------------------------------------------------------------:|:---------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.4-img]][pkg-0.4-url] [![][pkg-0.5-img]][pkg-0.5-url] [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |


## Installation
@@ -23,7 +23,7 @@ julia> Pkg.add("CSV")

## Project Status

The package is tested against Julia `0.4`, `0.5`, and *current* `0.6` on Linux, OS X, and Windows.
The package is tested against Julia `0.6` and nightly on Linux, OS X, and Windows.

## Contributing and Questions

@@ -49,9 +49,5 @@ Contributions are very welcome, as are feature requests and suggestions. Please

[issues-url]: https://github.com/JuliaData/CSV.jl/issues

[pkg-0.4-img]: http://pkg.julialang.org/badges/CSV_0.4.svg
[pkg-0.4-url]: http://pkg.julialang.org/?pkg=CSV
[pkg-0.5-img]: http://pkg.julialang.org/badges/CSV_0.5.svg
[pkg-0.5-url]: http://pkg.julialang.org/?pkg=CSV
[pkg-0.6-img]: http://pkg.julialang.org/badges/CSV_0.6.svg
[pkg-0.6-url]: http://pkg.julialang.org/?pkg=CSV
6 changes: 3 additions & 3 deletions REQUIRE
@@ -1,5 +1,5 @@
julia 0.5
julia 0.6
Compat 0.9.5
DataStreams 0.1.0 0.2.0
DataStreams 0.2.0
DataFrames
WeakRefStrings 0.1.3 0.3.0
WeakRefStrings 0.3.0
2 changes: 0 additions & 2 deletions appveyor.yml
@@ -1,7 +1,5 @@
environment:
matrix:
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
203 changes: 203 additions & 0 deletions benchmark/benchmarks.jl
@@ -56,3 +56,206 @@ TYPES = !is_windows() ? (Int, Float64, WeakRefString{UInt8}, String, Date, DateT
end

end




# generate single column files w/ 1M rows for each type
using WeakRefStrings

val = "hey"
for i in (1001, 100.1, WeakRefString{UInt8}(pointer(val), 3, 0), Date(2008, 1, 3), DateTime(2008, 3, 4))
open("/Users/jacobquinn/Downloads/randoms_$(typeof(i)).csv", "w") do f
for j = 1:1_000_000
write(f, string(i))
write(f, "\n")
end
end
end

using CSV, TextParse
for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
println("comparing for T = $T...")
# T == WeakRefStrings.WeakRefString{UInt8} && continue
@time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
end

@time CSV.read("/Users/jacobquinn/Downloads/yellow_tripdata_2015-01.csv");

# Python/pandas comparison (run separately under Python with pandas installed):
#=
import time
import pandas

for T in ('Int64', 'Float64', 'WeakRefString{UInt8}', 'Date', 'DateTime'):
    start = time.time()
    table = pandas.read_csv("/Users/jacobquinn/Downloads/randoms_" + T + ".csv", delimiter=',')
    end = time.time()
    print(end - start)
=#

@time df = CSV.read("/Users/jacobquinn/Downloads/file.txt"; delim=' ');
@time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$(T).csv")
# julia> for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
# println("comparing for T = $T...")
# @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# end
# comparing for T = Int64...
# pre-allocating DataFrame w/ rows = 999999
# 0.043684 seconds (1.00 M allocations: 22.929 MiB, 31.61% gc time)
# 0.045556 seconds (460 allocations: 15.575 MiB, 3.20% gc time)
# comparing for T = Float64...
# pre-allocating DataFrame w/ rows = 999999
# 0.080026 seconds (1.00 M allocations: 22.974 MiB, 23.80% gc time)
# 0.082530 seconds (457 allocations: 16.528 MiB)
# comparing for T = WeakRefString{UInt8}...
# pre-allocating DataFrame w/ rows = 999999
# 0.058446 seconds (1.89 k allocations: 22.986 MiB, 8.53% gc time)
# 0.069034 seconds (595 allocations: 5.188 MiB)
# comparing for T = Date...
# pre-allocating DataFrame w/ rows = 999999
# 0.125229 seconds (2.00 M allocations: 53.504 MiB, 20.94% gc time)
# 0.120472 seconds (1.00 M allocations: 51.846 MiB, 6.73% gc time)
# comparing for T = DateTime...
# pre-allocating DataFrame w/ rows = 999999
# 0.175855 seconds (2.00 M allocations: 53.504 MiB, 23.30% gc time)
# 0.187619 seconds (1.00 M allocations: 60.516 MiB, 4.40% gc time)


T = Int64
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_$(T).csv";)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=true)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=false)
# source.schema = DataStreams.Data.Schema(DataStreams.Data.header(source.schema), (Int, String, String, Float64, Float64, Date, DateTime), 9)
# @time df = CSV.read(source, NamedTuple);
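# The block below drives the DataStreams 0.2 streaming pipeline by hand, roughly what
# CSV.read does internally: take the source schema, apply the column transforms, construct
# the sink, then Data.stream! field-by-field (Data.Field) into it.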
sink = Si = NamedTuple
transforms = Dict{Int,Function}(1=>x->x-1)
append = false
args = kwargs = ()
source_schema = DataStreams.Data.schema(source)
sink_schema, transforms2 = DataStreams.Data.transform(source_schema, transforms, true);
sinkstreamtype = DataStreams.Data.Field
sink = Si(sink_schema, sinkstreamtype, append, args...; kwargs...);
columns = []
filter = x->true
@code_warntype DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)
@time DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)

function testt(t)
a = getfield(t, 1)
b = getfield(t, 2)
c = getfield(t, 3)
d = getfield(t, 4)
e = getfield(t, 5)
f = getfield(t, 6)
g = getfield(t, 7)
return (a, b, c, d, e, f, g)
end
@code_warntype testt((i1=(?Int)[], i2=(?String)[], i3=(?String)[], i4=(?Float64)[], i5=(?Float64)[], i6=(?Date)[], i7=(?DateTime)[]))

@code_llvm DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)
@time DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)

@code_warntype @time CSV.parsefield(IOBuffer(), ?Int, CSV.Options(), 0, 0, CSV.STATE)

t = Vector{Int}(1000000)

# having CSV.parsefield(io, T) where T !>: Null decreases allocations by 1.00M
# inlining CSV.parsefield also dropped allocations
# making CSV.Options not have a type parameter also sped things up
#
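# A hypothetical sketch of the first note above (not CSV.jl's actual parsefield code):
# giving the non-nullable case its own method keeps the return type concrete instead of
# Union{T, Null}, and marking it @inline avoids the extra allocations mentioned.
using Nulls
@inline parseintfield(io::IO, ::Type{Int}) = parse(Int, readline(io))
@inline function parseintfield(io::IO, ::Type{Union{Int, Null}})
    s = readline(io)
    return isempty(s) ? null : parse(Int, s)   # empty field parses to null
end
# e.g. parseintfield(IOBuffer("42\n"), Int) == 42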

using BenchmarkTools

g(x) = x < 5 ? x : -1
A = [i for i = 1:10]
function get_then_set(A)
@simd for i = 1:10
@inbounds A[i] = g(i)
end
return A
end
@code_warntype g(1)
@code_warntype get_then_set(A)
@benchmark get_then_set(A) # 20ns

@inline g3(x) = g2(x)
@inline function g2(x)
if x < 20
return x * 20
end

if x < 15
return nothing
end

if x < 12
return 2x
end

if x * 20 / 4 % 2 == 0
return 1
end

if x < 0
return nothing
end
return nothing
end

A = Union{Int, Void}[i for i = 1:10]
@inline function get_then_set2(A)
@simd for i = 1:10
# Base.arrayset(A, g2(i), i)
val = g3(i)
if val isa Void
@inbounds A[i] = val#::Union{Int, Void}
else
@inbounds A[i] = val#::Union{Int, Void}
end
end
return A
end
function run_lots(N)
A = Union{Int, Void}[i for i = 1:10]
for i = 1:N
get_then_set2(A)
end
return
end

@code_warntype g2(1)
@code_warntype get_then_set2(A)
@code_llvm get_then_set2(A)
@benchmark get_then_set2(A) # 155ns


g4(x::Int) = 1
g4(x::Void) = 0

A = [i for i = 1:10]
function get_sum(A)
s = 0
for a in A
s += g4(a)
end
return s
end
@code_warntype get_sum(A)
@code_llvm get_sum(A)
@benchmark get_sum(A) # 24ns

A = Union{Int, Void}[i for i = 1:10]
A[[3, 5, 7]] = nothing
function get_sum2(A)
s = 0
for a in A
s += g4(a)
end
return s
end
@code_warntype get_sum2(A)
@code_llvm get_sum2(A)
@benchmark get_sum2(A) # 100ns


function getstatic{T}(t::T)
return t[1]
end
54 changes: 33 additions & 21 deletions src/CSV.jl
@@ -1,11 +1,9 @@
__precompile__(true)
module CSV

using Compat, DataStreams, DataFrames, WeakRefStrings
using DataStreams, WeakRefStrings, Nulls, DataFrames

export Data, DataFrame

immutable CSVError <: Exception
struct ParsingException <: Exception
msg::String
end

@@ -24,18 +22,19 @@ const ZERO = UInt8('0')
const TEN = UInt8('9')+UInt8(1)
Base.isascii(c::UInt8) = c < 0x80

@inline function unsafe_read(from::Base.AbstractIOBuffer, ::Type{UInt8}=UInt8)
readbyte(from::IO) = Base.read(from, UInt8)
peekbyte(from::IO) = Base.peek(from)

@inline function readbyte(from::IOBuffer)
@inbounds byte = from.data[from.ptr]
from.ptr = from.ptr + 1
return byte
end
unsafe_read(from::IO, T) = Base.read(from, T)

@inline function unsafe_peek(from::Base.AbstractIOBuffer)
@inline function peekbyte(from::IOBuffer)
@inbounds byte = from.data[from.ptr]
return byte
end
unsafe_peek(from::IO) = (mark(from); v = Base.read(from, UInt8); reset(from); return v)

"""
Represents the various configuration settings for delimited text file parsing.
@@ -48,30 +47,29 @@ Keyword Arguments:
* `null::String`; indicates how NULL values are represented in the dataset
* `dateformat::Union{AbstractString,Dates.DateFormat}`; how dates/datetimes are represented in the dataset
"""
type Options
mutable struct Options{D}
delim::UInt8
quotechar::UInt8
escapechar::UInt8
null::String
null::Vector{UInt8}
nullcheck::Bool
dateformat::Dates.DateFormat
datecheck::Bool
dateformat::D
# non-public for now
datarow::Int
rows::Int
header::Union{Integer,UnitRange{Int},Vector}
types::Union{Dict{Int,DataType},Dict{String,DataType},Vector{DataType}}
types
end

Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null=String(""), dateformat=Dates.ISODateFormat, datarow=-1, rows=0, header=1, types=DataType[]) =
Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null="", dateformat=Dates.ISODateTimeFormat, datarow=-1, rows=0, header=1, types=Type[]) =
Options(delim%UInt8, quotechar%UInt8, escapechar%UInt8,
ascii(null), null != "", isa(dateformat,Dates.DateFormat) ? dateformat : Dates.DateFormat(dateformat), dateformat == Dates.ISODateTimeFormat || dateformat == Dates.ISODateFormat, datarow, rows, header, types)
map(UInt8, collect(ascii(null))), null != "", isa(dateformat,Dates.DateFormat) ? dateformat : Dates.DateFormat(dateformat), datarow, rows, header, types)
function Base.show(io::IO,op::Options)
println(io, " CSV.Options:")
println(io, " delim: '", Char(op.delim), "'")
println(io, " quotechar: '", Char(op.quotechar), "'")
print(io, " escapechar: '"); escape_string(io, string(Char(op.escapechar)), "\\"); println(io, "'")
print(io, " null: \""); escape_string(io, op.null, "\\"); println(io, "\"")
print(io, " null: \""); escape_string(io, isempty(op.null) ? "" : String(collect(op.null)), "\\"); println(io, "\"")
print(io, " dateformat: ", op.dateformat)
end
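A brief usage sketch of these options: the keyword names come from the constructor shown here, the values are only illustrative, and a string dateformat is converted to a Dates.DateFormat internally.

using CSV
opt = CSV.Options(delim='\t', null="NA", dateformat="yyyy-mm-dd")
show(STDOUT, opt)   # prints the delim, quote/escape chars, null string, and dateformat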

Expand All @@ -96,10 +94,10 @@ CSV.reset!(source)
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
"""
type Source <: Data.Source
mutable struct Source{I, D} <: Data.Source
schema::Data.Schema
options::Options
io::IOBuffer
options::Options{D}
io::I
ptr::Int # pointer to underlying data buffer
fullpath::String
datapos::Int # the position in the IOBuffer where the rows of data begins
@@ -111,6 +109,16 @@ function Base.show(io::IO, f::Source)
show(io, f.schema)
end

# mutable struct TransposedSource{I, D} <: Data.Source
# schema::Data.Schema
# options::Options{D}
# io::I
# ptr::Int # pointer to underlying data buffer
# fullpath::String
# datapos::Int # the position in the IOBuffer where the rows of data begins
# columnpositions::Vector{Int}
# end

"""
A type that satisfies the `Data.Sink` interface in the `DataStreams.jl` package.

@@ -132,19 +140,23 @@ CSV.reset!(source)
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
"""
type Sink <: Data.Sink
options::Options
mutable struct Sink{D, B} <: Data.Sink
options::Options{D}
io::IOBuffer
fullpath::Union{String, IO}
datapos::Int # the position in the IOBuffer where the rows of data begins
header::Bool
colnames::Vector{String}
cols::Int
append::Bool
quotefields::B
end

include("parsefields.jl")
include("float.jl")
include("io.jl")
include("Source.jl")
# include("TransposedSource.jl")
include("Sink.jl")

end # module
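For completeness, a hedged sketch of writing data back out through the Sink type defined above, using CSV.write as the convenience entry point (exact keyword support may differ in this version):

using CSV, DataFrames
df = DataFrame(a = 1:3, b = ["x", "y", "z"])
CSV.write("out.csv", df)                 # streams the DataFrame into a CSV.Sink
CSV.write("out.tsv", df; delim='\t')     # assumed keyword, forwarded to CSV.Options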