Skip to content
3 changes: 0 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ os:
- osx

julia:
- 0.5
- 0.6
- nightly

Expand All @@ -18,5 +17,3 @@ after_success:
- julia -e 'cd(Pkg.dir("CSV")); Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())'
- julia -e 'Pkg.add("Documenter")'
- julia -e 'cd(Pkg.dir("CSV")); include(joinpath("docs", "make.jl"))'
- julia -e 'Pkg.add("PkgBenchmark")'
- julia -e 'using PkgBenchmark; benchmarkpkg("CSV"; promptsave=false, promptoverwrite=false)'
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

| **Documentation** | **PackageEvaluator** | **Build Status** |
|:-------------------------------------------------------------------------------:|:---------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.4-img]][pkg-0.4-url] [![][pkg-0.5-img]][pkg-0.5-url] [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |


## Installation
Expand All @@ -23,7 +23,7 @@ julia> Pkg.add("CSV")

## Project Status

The package is tested against Julia `0.4`, `0.5`, and *current* `0.6` on Linux, OS X, and Windows.
The package is tested against Julia `0.6` and nightly on Linux, OS X, and Windows.

## Contributing and Questions

Expand All @@ -49,9 +49,5 @@ Contributions are very welcome, as are feature requests and suggestions. Please

[issues-url]: https://github.com/JuliaData/CSV.jl/issues

[pkg-0.4-img]: http://pkg.julialang.org/badges/CSV_0.4.svg
[pkg-0.4-url]: http://pkg.julialang.org/?pkg=CSV
[pkg-0.5-img]: http://pkg.julialang.org/badges/CSV_0.5.svg
[pkg-0.5-url]: http://pkg.julialang.org/?pkg=CSV
[pkg-0.6-img]: http://pkg.julialang.org/badges/CSV_0.6.svg
[pkg-0.6-url]: http://pkg.julialang.org/?pkg=CSV
6 changes: 3 additions & 3 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
julia 0.5
julia 0.6
Compat 0.9.5
DataStreams 0.1.0 0.2.0
DataStreams 0.2.0
DataFrames
WeakRefStrings 0.1.3 0.3.0
WeakRefStrings 0.3.0
2 changes: 0 additions & 2 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
environment:
matrix:
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
Expand Down
203 changes: 203 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,206 @@ TYPES = !is_windows() ? (Int, Float64, WeakRefString{UInt8}, String, Date, DateT
end

end




# generate single column files w/ 1M rows for each type
# Benchmark fixture generator: writes one 1M-row, single-column CSV per value
# (Int, Float64, WeakRefString, Date, DateTime) so the read benchmarks below
# have inputs to parse.
# NOTE(review): output paths are hard-coded to a developer's home directory —
# not portable; consider tempdir().
using WeakRefStrings

# `val` backs the WeakRefString below, which points at its bytes without
# keeping the String alive — NOTE(review): GC-unsafe if these lines are reordered.
val = "hey"
for i in (1001, 100.1, WeakRefString{UInt8}(pointer(val), 3, 0), Date(2008, 1, 3), DateTime(2008, 3, 4))
# file name keys off the value's concrete type, e.g. randoms_Int64.csv
open("/Users/jacobquinn/Downloads/randoms_$(typeof(i)).csv", "w") do f
for j = 1:1_000_000
# every row is the same rendered value followed by a newline
write(f, string(i))
write(f, "\n")
end
end
end

using CSV, TextParse
# Timing comparison: CSV.read vs TextParse.csvread over the generated
# single-column files, one pass per element type.
for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
println("comparing for T = $T...")
# T == WeakRefStrings.WeakRefString{UInt8} && continue
@time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
end

# Larger real-world file as a second data point — NOTE(review): path assumes
# the NYC taxi dataset was downloaded manually.
@time CSV.read("/Users/jacobquinn/Downloads/yellow_tripdata_2015-01.csv");

# NOTE(review): the following pandas timing snippet is Python, not Julia; it
# made this file unparseable as Julia (multi-character ''-literals, trailing
# ':', `end` used as a variable). Kept commented out as a record of the
# reference comparison that was run separately under Python.
# for T in ('Int64', 'Float64', 'WeakRefString{UInt8}', 'Date', 'DateTime'):
#     start = time.time()
#     delim = ','
#     table = pandas.read_csv("/Users/jacobquinn/Downloads/randoms_" + T + ".csv", delimiter=delim)
#     end = time.time()
#     print(end - start)

# Ad-hoc timings; `T` here relies on whatever value was left in global scope
# by the loops above — NOTE(review): not self-contained.
@time df = CSV.read("/Users/jacobquinn/Downloads/file.txt"; delim=' ');
@time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$(T).csv")
# julia> for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
# println("comparing for T = $T...")
# @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# end
# comparing for T = Int64...
# pre-allocating DataFrame w/ rows = 999999
# 0.043684 seconds (1.00 M allocations: 22.929 MiB, 31.61% gc time)
# 0.045556 seconds (460 allocations: 15.575 MiB, 3.20% gc time)
# comparing for T = Float64...
# pre-allocating DataFrame w/ rows = 999999
# 0.080026 seconds (1.00 M allocations: 22.974 MiB, 23.80% gc time)
# 0.082530 seconds (457 allocations: 16.528 MiB)
# comparing for T = WeakRefString{UInt8}...
# pre-allocating DataFrame w/ rows = 999999
# 0.058446 seconds (1.89 k allocations: 22.986 MiB, 8.53% gc time)
# 0.069034 seconds (595 allocations: 5.188 MiB)
# comparing for T = Date...
# pre-allocating DataFrame w/ rows = 999999
# 0.125229 seconds (2.00 M allocations: 53.504 MiB, 20.94% gc time)
# 0.120472 seconds (1.00 M allocations: 51.846 MiB, 6.73% gc time)
# comparing for T = DateTime...
# pre-allocating DataFrame w/ rows = 999999
# 0.175855 seconds (2.00 M allocations: 53.504 MiB, 23.30% gc time)
# 0.187619 seconds (1.00 M allocations: 60.516 MiB, 4.40% gc time)


# Manually-expanded version of the Data.stream! pipeline (what CSV.read does
# internally), unrolled step by step so each stage can be timed and inspected
# with @code_warntype. REPL scratch: statements are order-dependent.
T = Int64
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_$(T).csv";)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=true)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=false)
# source.schema = DataStreams.Data.Schema(DataStreams.Data.header(source.schema), (Int, String, String, Float64, Float64, Date, DateTime), 9)
# @time df = CSV.read(source, NamedTuple);
# sink type: stream the parsed rows into a NamedTuple of columns
sink = Si = NamedTuple
# transform applied to column 1: subtract 1 from each value
transforms = Dict{Int,Function}(1=>x->x-1)
append = false
args = kwargs = ()
source_schema = DataStreams.Data.schema(source)
sink_schema, transforms2 = DataStreams.Data.transform(source_schema, transforms, true);
sinkstreamtype = DataStreams.Data.Field
sink = Si(sink_schema, sinkstreamtype, append, args...; kwargs...);
# NOTE(review): `columns = []` is an untyped Vector{Any}, and `filter` shadows
# Base.filter at top level — acceptable in scratch code, confusing elsewhere.
columns = []
filter = x->true
@code_warntype DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)
@time DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)

# Type-inference probe: extract the first seven fields of `t` and return them
# as a tuple (exercised with @code_warntype on a NamedTuple of columns below).
testt(t) = ntuple(i -> getfield(t, i), 7)
# Inference check on testt applied to a NamedTuple of Union-eltype columns.
# NOTE(review): `?Int` is the Nulls.jl shorthand for Union{Int, Null} used in
# this era — not valid syntax on modern Julia.
@code_warntype testt((i1=(?Int)[], i2=(?String)[], i3=(?String)[], i4=(?Float64)[], i5=(?Float64)[], i6=(?Date)[], i7=(?DateTime)[]))

# codegen + timing for the manually-expanded pipeline assembled above
@code_llvm DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)
@time DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)

@code_warntype @time CSV.parsefield(IOBuffer(), ?Int, CSV.Options(), 0, 0, CSV.STATE)

# 1M-element scratch buffer (0.6-style uninitialized constructor)
t = Vector{Int}(1000000)

# having CSV.parsefield(io, T) where T !>: Null decreases allocations by 1.00M
# inlining CSV.parsefield also dropped allocations
# making CSV.Options not have a type parameter also sped things up
#

using BenchmarkTools

# Identity below 5, sentinel -1 otherwise (helper for the store-loop experiment).
function g(x)
    if x < 5
        return x
    else
        return -1
    end
end
# 10-element Int workload for the baseline get_then_set benchmark
A = [i for i = 1:10]
"""
    get_then_set(A)

Overwrite each element of `A` with `g(i)` for its index `i` and return `A`.

The original hard-coded `1:10` under `@inbounds`, which silently writes out of
bounds (memory corruption) for `length(A) < 10`; iterating `eachindex(A)`
keeps the `@inbounds` provably safe for any length while behaving identically
for the 10-element workload used in this file.
"""
function get_then_set(A)
    @simd for i = eachindex(A)
        @inbounds A[i] = g(i)
    end
    return A
end
# Inference/timing diagnostics for the non-Union baseline
# (@benchmark comes from BenchmarkTools, loaded above).
@code_warntype g(1)
@code_warntype get_then_set(A)
@benchmark get_then_set(A) # 20ns

# Multi-branch Union{Int, Nothing}-returning experiment; g3 simply forwards to
# g2 so inlining across a call boundary can be inspected.
@inline g3(x) = g2(x)

@inline function g2(x)
    x < 20 && return x * 20
    # Everything below is only reached for x >= 20, so the next two guards
    # (x < 15, x < 12) are dead; they are kept to mirror the original branch
    # structure whose codegen is being benchmarked.
    x < 15 && return nothing
    x < 12 && return 2x
    x * 20 / 4 % 2 == 0 && return 1
    x < 0 && return nothing
    return nothing
end

# Same workload but with a Union eltype so g3's `nothing` returns are storable
# (`Void` is the Julia 0.6 name for what is now `Nothing`).
A = Union{Int, Void}[i for i = 1:10]
# Union-array variant of get_then_set: stores g3's Union{Int, Void} results.
# Both branches of the isa split store the same value; the split exists only
# to compare generated code for the type-narrowed vs. unnarrowed store.
@inline function get_then_set2(A)
    @simd for idx = 1:10
        result = g3(idx)
        if result isa Void
            @inbounds A[idx] = result
        else
            @inbounds A[idx] = result
        end
    end
    return A
end
# Drive get_then_set2 N times over a fresh Union-eltype buffer, for timing
# the store loop under repetition.
function run_lots(N)
    buffer = Union{Int, Void}[i for i = 1:10]
    for _ = 1:N
        get_then_set2(buffer)
    end
    return
end

# Inference/codegen/timing for the Union-array store path.
@code_warntype g2(1)
@code_warntype get_then_set2(A)
@code_llvm get_then_set2(A)
@benchmark get_then_set2(A) # 155ns


# Dispatch-based counter: 1 for Int entries, 0 for Void entries
# (`Void` is the Julia 0.6 name for what is now `Nothing`).
g4(x::Int) = 1
g4(x::Void) = 0

# Homogeneous Int baseline for get_sum
A = [i for i = 1:10]
# Sum g4 over a plain Int vector — the no-Union baseline for the reduction.
function get_sum(A)
    total = 0
    for elem in A
        total += g4(elem)
    end
    return total
end
# Diagnostics for the baseline reduction, then rebuild A with Union eltype and
# punch three Void holes into it.
@code_warntype get_sum(A)
@code_llvm get_sum(A)
@benchmark get_sum(A) # 24ns

A = Union{Int, Void}[i for i = 1:10]
# NOTE(review): scalar-to-multi-index assignment is 0.6-era; 1.0+ requires
# `A[[3, 5, 7]] .= nothing`.
A[[3, 5, 7]] = nothing
# Same reduction as get_sum but over the Union{Int, Void} array; per-element
# dispatch selects the matching g4 method.
function get_sum2(A)
    acc = 0
    for item in A
        acc += g4(item)
    end
    return acc
end
# Diagnostics for the Union-array reduction.
@code_warntype get_sum2(A)
# NOTE(review): was `@code_llvm get_sum(A)` — a copy-paste slip; this section
# inspects the Union-array variant, so get_sum2 is the intended target.
@code_llvm get_sum2(A)
@benchmark get_sum2(A) # 100ns


"""
    getstatic(t)

Return the first element of `t` (indexing probe for code inspection).

The original used the pre-1.0 parametric syntax `getstatic{T}(t::T)`, which
was removed in Julia 1.0; the type parameter played no role in dispatch, so an
unconstrained signature (valid on 0.6 and later) is equivalent.
"""
getstatic(t) = t[1]
54 changes: 33 additions & 21 deletions src/CSV.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
__precompile__(true)
module CSV

using Compat, DataStreams, DataFrames, WeakRefStrings
using DataStreams, WeakRefStrings, Nulls, DataFrames

export Data, DataFrame

immutable CSVError <: Exception
struct ParsingException <: Exception
msg::String
end

Expand All @@ -24,18 +22,19 @@ const ZERO = UInt8('0')
const TEN = UInt8('9')+UInt8(1)
Base.isascii(c::UInt8) = c < 0x80

@inline function unsafe_read(from::Base.AbstractIOBuffer, ::Type{UInt8}=UInt8)
readbyte(from::IO) = Base.read(from, UInt8)
peekbyte(from::IO) = Base.peek(from)

@inline function readbyte(from::IOBuffer)
@inbounds byte = from.data[from.ptr]
from.ptr = from.ptr + 1
return byte
end
unsafe_read(from::IO, T) = Base.read(from, T)

@inline function unsafe_peek(from::Base.AbstractIOBuffer)
@inline function peekbyte(from::IOBuffer)
@inbounds byte = from.data[from.ptr]
return byte
end
unsafe_peek(from::IO) = (mark(from); v = Base.read(from, UInt8); reset(from); return v)

"""
Represents the various configuration settings for delimited text file parsing.
Expand All @@ -48,30 +47,29 @@ Keyword Arguments:
* `null::String`; indicates how NULL values are represented in the dataset
* `dateformat::Union{AbstractString,Dates.DateFormat}`; how dates/datetimes are represented in the dataset
"""
type Options
mutable struct Options{D}
delim::UInt8
quotechar::UInt8
escapechar::UInt8
null::String
null::Vector{UInt8}
nullcheck::Bool
dateformat::Dates.DateFormat
datecheck::Bool
dateformat::D
# non-public for now
datarow::Int
rows::Int
header::Union{Integer,UnitRange{Int},Vector}
types::Union{Dict{Int,DataType},Dict{String,DataType},Vector{DataType}}
types
end

Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null=String(""), dateformat=Dates.ISODateFormat, datarow=-1, rows=0, header=1, types=DataType[]) =
Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null="", dateformat=Dates.ISODateTimeFormat, datarow=-1, rows=0, header=1, types=Type[]) =
Options(delim%UInt8, quotechar%UInt8, escapechar%UInt8,
ascii(null), null != "", isa(dateformat,Dates.DateFormat) ? dateformat : Dates.DateFormat(dateformat), dateformat == Dates.ISODateTimeFormat || dateformat == Dates.ISODateFormat, datarow, rows, header, types)
map(UInt8, collect(ascii(null))), null != "", isa(dateformat,Dates.DateFormat) ? dateformat : Dates.DateFormat(dateformat), datarow, rows, header, types)
function Base.show(io::IO,op::Options)
println(io, " CSV.Options:")
println(io, " delim: '", Char(op.delim), "'")
println(io, " quotechar: '", Char(op.quotechar), "'")
print(io, " escapechar: '"); escape_string(io, string(Char(op.escapechar)), "\\"); println(io, "'")
print(io, " null: \""); escape_string(io, op.null, "\\"); println(io, "\"")
print(io, " null: \""); escape_string(io, isempty(op.null) ? "" : String(collect(op.null)), "\\"); println(io, "\"")
print(io, " dateformat: ", op.dateformat)
end

Expand All @@ -96,10 +94,10 @@ CSV.reset!(source)
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
"""
type Source <: Data.Source
mutable struct Source{I, D} <: Data.Source
schema::Data.Schema
options::Options
io::IOBuffer
options::Options{D}
io::I
ptr::Int # pointer to underlying data buffer
fullpath::String
datapos::Int # the position in the IOBuffer where the rows of data begins
Expand All @@ -111,6 +109,16 @@ function Base.show(io::IO, f::Source)
show(io, f.schema)
end

# mutable struct TransposedSource{I, D} <: Data.Source
# schema::Data.Schema
# options::Options{D}
# io::I
# ptr::Int # pointer to underlying data buffer
# fullpath::String
# datapos::Int # the position in the IOBuffer where the rows of data begins
# columnpositions::Vector{Int}
# end

"""
A type that satisfies the `Data.Sink` interface in the `DataStreams.jl` package.

Expand All @@ -132,19 +140,23 @@ CSV.reset!(source)
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
"""
type Sink <: Data.Sink
options::Options
mutable struct Sink{D, B} <: Data.Sink
options::Options{D}
io::IOBuffer
fullpath::Union{String, IO}
datapos::Int # the position in the IOBuffer where the rows of data begins
header::Bool
colnames::Vector{String}
cols::Int
append::Bool
quotefields::B
end

include("parsefields.jl")
include("float.jl")
include("io.jl")
include("Source.jl")
# include("TransposedSource.jl")
include("Sink.jl")

end # module
Loading