Move to DataStreams 0.2.0 #95

Merged
merged 14 commits on Sep 7, 2017
3 changes: 0 additions & 3 deletions .travis.yml
@@ -7,7 +7,6 @@ os:
- osx

julia:
- 0.5
- 0.6
- nightly

@@ -18,5 +17,3 @@ after_success:
- julia -e 'cd(Pkg.dir("CSV")); Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())'
- julia -e 'Pkg.add("Documenter")'
- julia -e 'cd(Pkg.dir("CSV")); include(joinpath("docs", "make.jl"))'
- julia -e 'Pkg.add("PkgBenchmark")'
- julia -e 'using PkgBenchmark; benchmarkpkg("CSV"; promptsave=false, promptoverwrite=false)'
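With the CI benchmark step removed above, the same run can still be done locally; this is simply the call from the deleted lines, invoked by hand (PkgBenchmark must be installed):

julia> Pkg.add("PkgBenchmark")
julia> using PkgBenchmark
julia> benchmarkpkg("CSV"; promptsave=false, promptoverwrite=false)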
8 changes: 2 additions & 6 deletions README.md
@@ -5,7 +5,7 @@

| **Documentation** | **PackageEvaluator** | **Build Status** |
|:-------------------------------------------------------------------------------:|:---------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.4-img]][pkg-0.4-url] [![][pkg-0.5-img]][pkg-0.5-url] [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |
| [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url] | [![][pkg-0.6-img]][pkg-0.6-url] | [![][travis-img]][travis-url] [![][appveyor-img]][appveyor-url] [![][codecov-img]][codecov-url] |


## Installation
@@ -23,7 +23,7 @@ julia> Pkg.add("CSV")

## Project Status

The package is tested against Julia `0.4`, `0.5`, and *current* `0.6` on Linux, OS X, and Windows.
The package is tested against Julia `0.6` and nightly on Linux, OS X, and Windows.

## Contributing and Questions

@@ -49,9 +49,5 @@ Contributions are very welcome, as are feature requests and suggestions. Please

[issues-url]: https://github.com/JuliaData/CSV.jl/issues

[pkg-0.4-img]: http://pkg.julialang.org/badges/CSV_0.4.svg
[pkg-0.4-url]: http://pkg.julialang.org/?pkg=CSV
[pkg-0.5-img]: http://pkg.julialang.org/badges/CSV_0.5.svg
[pkg-0.5-url]: http://pkg.julialang.org/?pkg=CSV
[pkg-0.6-img]: http://pkg.julialang.org/badges/CSV_0.6.svg
[pkg-0.6-url]: http://pkg.julialang.org/?pkg=CSV
6 changes: 3 additions & 3 deletions REQUIRE
@@ -1,5 +1,5 @@
julia 0.5
julia 0.6
Compat 0.9.5
DataStreams 0.1.0 0.2.0
DataStreams 0.2.0
DataFrames
WeakRefStrings 0.1.3 0.3.0
WeakRefStrings 0.3.0
2 changes: 0 additions & 2 deletions appveyor.yml
@@ -1,7 +1,5 @@
environment:
matrix:
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
203 changes: 203 additions & 0 deletions benchmark/benchmarks.jl
@@ -56,3 +56,206 @@ TYPES = !is_windows() ? (Int, Float64, WeakRefString{UInt8}, String, Date, DateT
end

end




# generate single column files w/ 1M rows for each type
using WeakRefStrings

val = "hey"
for i in (1001, 100.1, WeakRefString{UInt8}(pointer(val), 3, 0), Date(2008, 1, 3), DateTime(2008, 3, 4))
open("/Users/jacobquinn/Downloads/randoms_$(typeof(i)).csv", "w") do f
for j = 1:1_000_000
write(f, string(i))
write(f, "\n")
end
end
end

using CSV, TextParse
for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
println("comparing for T = $T...")
# T == WeakRefStrings.WeakRefString{UInt8} && continue
@time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv"; nullable=true);
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$T.csv");
end

@time CSV.read("/Users/jacobquinn/Downloads/yellow_tripdata_2015-01.csv");

# Python/pandas comparison (run separately under Python with pandas installed):
#=
import time
import pandas

for T in ('Int64', 'Float64', 'WeakRefString{UInt8}', 'Date', 'DateTime'):
    start = time.time()
    table = pandas.read_csv("/Users/jacobquinn/Downloads/randoms_" + T + ".csv", delimiter=',')
    end = time.time()
    print(end - start)
=#

@time df = CSV.read("/Users/jacobquinn/Downloads/file.txt"; delim=' ');
@time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$(T).csv")
# julia> for T in (Int, Float64, WeakRefStrings.WeakRefString{UInt8}, Date, DateTime)
# println("comparing for T = $T...")
# @time CSV.read("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# @time TextParse.csvread("/Users/jacobquinn/Downloads/randoms_$(T).csv");
# end
# comparing for T = Int64...
# pre-allocating DataFrame w/ rows = 999999
# 0.043684 seconds (1.00 M allocations: 22.929 MiB, 31.61% gc time)
# 0.045556 seconds (460 allocations: 15.575 MiB, 3.20% gc time)
# comparing for T = Float64...
# pre-allocating DataFrame w/ rows = 999999
# 0.080026 seconds (1.00 M allocations: 22.974 MiB, 23.80% gc time)
# 0.082530 seconds (457 allocations: 16.528 MiB)
# comparing for T = WeakRefString{UInt8}...
# pre-allocating DataFrame w/ rows = 999999
# 0.058446 seconds (1.89 k allocations: 22.986 MiB, 8.53% gc time)
# 0.069034 seconds (595 allocations: 5.188 MiB)
# comparing for T = Date...
# pre-allocating DataFrame w/ rows = 999999
# 0.125229 seconds (2.00 M allocations: 53.504 MiB, 20.94% gc time)
# 0.120472 seconds (1.00 M allocations: 51.846 MiB, 6.73% gc time)
# comparing for T = DateTime...
# pre-allocating DataFrame w/ rows = 999999
# 0.175855 seconds (2.00 M allocations: 53.504 MiB, 23.30% gc time)
# 0.187619 seconds (1.00 M allocations: 60.516 MiB, 4.40% gc time)


T = Int64
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_$(T).csv";)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=true)
@time source = CSV.Source("/Users/jacobquinn/Downloads/randoms_small.csv"; nullable=false)
# source.schema = DataStreams.Data.Schema(DataStreams.Data.header(source.schema), (Int, String, String, Float64, Float64, Date, DateTime), 9)
# @time df = CSV.read(source, NamedTuple);
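# The block below drives the DataStreams 0.2 streaming pipeline by hand, roughly what
# CSV.read does internally: take the source schema, apply the column transforms, construct
# the sink, then Data.stream! field-by-field (Data.Field) into it.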
sink = Si = NamedTuple
transforms = Dict{Int,Function}(1=>x->x-1)
append = false
args = kwargs = ()
source_schema = DataStreams.Data.schema(source)
sink_schema, transforms2 = DataStreams.Data.transform(source_schema, transforms, true);
sinkstreamtype = DataStreams.Data.Field
sink = Si(sink_schema, sinkstreamtype, append, args...; kwargs...);
columns = []
filter = x->true
@code_warntype DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)
@time DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)

function testt(t)
a = getfield(t, 1)
b = getfield(t, 2)
c = getfield(t, 3)
d = getfield(t, 4)
e = getfield(t, 5)
f = getfield(t, 6)
g = getfield(t, 7)
return (a, b, c, d, e, f, g)
end
@code_warntype testt((i1=(?Int)[], i2=(?String)[], i3=(?String)[], i4=(?Float64)[], i5=(?Float64)[], i6=(?Date)[], i7=(?DateTime)[]))

@code_llvm DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)
@time DataStreams.Data.stream!(source, sinkstreamtype, sink, source_schema, sink_schema, transforms2, filter, columns)

@code_warntype @time CSV.parsefield(IOBuffer(), ?Int, CSV.Options(), 0, 0, CSV.STATE)

t = Vector{Int}(1000000)

# having CSV.parsefield(io, T) where T !>: Null decreases allocations by 1.00M
# inlining CSV.parsefield also dropped allocations
# making CSV.Options not have a type parameter also sped things up
#
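# A hypothetical sketch of the first note above (not CSV.jl's actual parsefield code):
# giving the non-nullable case its own method keeps the return type concrete instead of
# Union{T, Null}, and marking it @inline avoids the extra allocations mentioned.
using Nulls
@inline parseintfield(io::IO, ::Type{Int}) = parse(Int, readline(io))
@inline function parseintfield(io::IO, ::Type{Union{Int, Null}})
    s = readline(io)
    return isempty(s) ? null : parse(Int, s)   # empty field parses to null
end
# e.g. parseintfield(IOBuffer("42\n"), Int) == 42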

using BenchmarkTools

g(x) = x < 5 ? x : -1
A = [i for i = 1:10]
function get_then_set(A)
@simd for i = 1:10
@inbounds A[i] = g(i)
end
return A
end
@code_warntype g(1)
@code_warntype get_then_set(A)
@benchmark get_then_set(A) # 20ns

@inline g3(x) = g2(x)
@inline function g2(x)
if x < 20
return x * 20
end

if x < 15
return nothing
end

if x < 12
return 2x
end

if x * 20 / 4 % 2 == 0
return 1
end

if x < 0
return nothing
end
return nothing
end

A = Union{Int, Void}[i for i = 1:10]
@inline function get_then_set2(A)
@simd for i = 1:10
# Base.arrayset(A, g2(i), i)
val = g3(i)
if val isa Void
@inbounds A[i] = val#::Union{Int, Void}
else
@inbounds A[i] = val#::Union{Int, Void}
end
end
return A
end
function run_lots(N)
A = Union{Int, Void}[i for i = 1:10]
for i = 1:N
get_then_set2(A)
end
return
end

@code_warntype g2(1)
@code_warntype get_then_set2(A)
@code_llvm get_then_set2(A)
@benchmark get_then_set2(A) # 155ns


g4(x::Int) = 1
g4(x::Void) = 0

A = [i for i = 1:10]
function get_sum(A)
s = 0
for a in A
s += g4(a)
end
return s
end
@code_warntype get_sum(A)
@code_llvm get_sum(A)
@benchmark get_sum(A) # 24ns

A = Union{Int, Void}[i for i = 1:10]
A[[3, 5, 7]] = nothing
function get_sum2(A)
s = 0
for a in A
s += g4(a)
end
return s
end
@code_warntype get_sum2(A)
@code_llvm get_sum2(A)
@benchmark get_sum2(A) # 100ns


function getstatic{T}(t::T)
return t[1]
end
54 changes: 33 additions & 21 deletions src/CSV.jl
@@ -1,11 +1,9 @@
__precompile__(true)
module CSV

using Compat, DataStreams, DataFrames, WeakRefStrings
using DataStreams, WeakRefStrings, Nulls, DataFrames

export Data, DataFrame

immutable CSVError <: Exception
struct ParsingException <: Exception
msg::String
end

@@ -24,18 +22,19 @@ const ZERO = UInt8('0')
const TEN = UInt8('9')+UInt8(1)
Base.isascii(c::UInt8) = c < 0x80

@inline function unsafe_read(from::Base.AbstractIOBuffer, ::Type{UInt8}=UInt8)
readbyte(from::IO) = Base.read(from, UInt8)
peekbyte(from::IO) = Base.peek(from)

@inline function readbyte(from::IOBuffer)
@inbounds byte = from.data[from.ptr]
from.ptr = from.ptr + 1
return byte
end
unsafe_read(from::IO, T) = Base.read(from, T)

@inline function unsafe_peek(from::Base.AbstractIOBuffer)
@inline function peekbyte(from::IOBuffer)
@inbounds byte = from.data[from.ptr]
return byte
end
unsafe_peek(from::IO) = (mark(from); v = Base.read(from, UInt8); reset(from); return v)

"""
Represents the various configuration settings for delimited text file parsing.
@@ -48,30 +47,29 @@ Keyword Arguments:
* `null::String`; indicates how NULL values are represented in the dataset
* `dateformat::Union{AbstractString,Dates.DateFormat}`; how dates/datetimes are represented in the dataset
"""
type Options
mutable struct Options{D}
delim::UInt8
quotechar::UInt8
escapechar::UInt8
null::String
null::Vector{UInt8}
nullcheck::Bool
dateformat::Dates.DateFormat
datecheck::Bool
dateformat::D
# non-public for now
datarow::Int
rows::Int
header::Union{Integer,UnitRange{Int},Vector}
types::Union{Dict{Int,DataType},Dict{String,DataType},Vector{DataType}}
types
end

Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null=String(""), dateformat=Dates.ISODateFormat, datarow=-1, rows=0, header=1, types=DataType[]) =
Options(;delim=COMMA, quotechar=QUOTE, escapechar=ESCAPE, null="", dateformat=Dates.ISODateTimeFormat, datarow=-1, rows=0, header=1, types=Type[]) =
Options(delim%UInt8, quotechar%UInt8, escapechar%UInt8,
ascii(null), null != "", isa(dateformat,Dates.DateFormat) ? dateformat : Dates.DateFormat(dateformat), dateformat == Dates.ISODateTimeFormat || dateformat == Dates.ISODateFormat, datarow, rows, header, types)
map(UInt8, collect(ascii(null))), null != "", isa(dateformat,Dates.DateFormat) ? dateformat : Dates.DateFormat(dateformat), datarow, rows, header, types)
function Base.show(io::IO,op::Options)
println(io, " CSV.Options:")
println(io, " delim: '", Char(op.delim), "'")
println(io, " quotechar: '", Char(op.quotechar), "'")
print(io, " escapechar: '"); escape_string(io, string(Char(op.escapechar)), "\\"); println(io, "'")
print(io, " null: \""); escape_string(io, op.null, "\\"); println(io, "\"")
print(io, " null: \""); escape_string(io, isempty(op.null) ? "" : String(collect(op.null)), "\\"); println(io, "\"")
print(io, " dateformat: ", op.dateformat)
end
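A brief usage sketch of these options: the keyword names come from the constructor shown here, the values are only illustrative, and a string dateformat is converted to a Dates.DateFormat internally.

using CSV
opt = CSV.Options(delim='\t', null="NA", dateformat="yyyy-mm-dd")
show(STDOUT, opt)   # prints the delim, quote/escape chars, null string, and dateformat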

Expand All @@ -96,10 +94,10 @@ CSV.reset!(source)
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
"""
type Source <: Data.Source
mutable struct Source{I, D} <: Data.Source
schema::Data.Schema
options::Options
io::IOBuffer
options::Options{D}
io::I
ptr::Int # pointer to underlying data buffer
fullpath::String
datapos::Int # the position in the IOBuffer where the rows of data begins
@@ -111,6 +109,16 @@ function Base.show(io::IO, f::Source)
show(io, f.schema)
end

# mutable struct TransposedSource{I, D} <: Data.Source
# schema::Data.Schema
# options::Options{D}
# io::I
# ptr::Int # pointer to underlying data buffer
# fullpath::String
# datapos::Int # the position in the IOBuffer where the rows of data begins
# columnpositions::Vector{Int}
# end

"""
A type that satisfies the `Data.Sink` interface in the `DataStreams.jl` package.

@@ -132,19 +140,23 @@ CSV.reset!(source)
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
"""
type Sink <: Data.Sink
options::Options
mutable struct Sink{D, B} <: Data.Sink
options::Options{D}
io::IOBuffer
fullpath::Union{String, IO}
datapos::Int # the position in the IOBuffer where the rows of data begins
header::Bool
colnames::Vector{String}
cols::Int
append::Bool
quotefields::B
end

include("parsefields.jl")
include("float.jl")
include("io.jl")
include("Source.jl")
# include("TransposedSource.jl")
include("Sink.jl")

end # module
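For completeness, a hedged sketch of writing data back out through the Sink type defined above, using CSV.write as the convenience entry point (exact keyword support may differ in this version):

using CSV, DataFrames
df = DataFrame(a = 1:3, b = ["x", "y", "z"])
CSV.write("out.csv", df)                 # streams the DataFrame into a CSV.Sink
CSV.write("out.tsv", df; delim='\t')     # assumed keyword, forwarded to CSV.Options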