Commit
Fix a few allocation/inference issues (#885)
This was from an effort to play around with precompilation some more. There's a tricky balance: if we precompile too much of `CSV.File`, we run into the Base bug where missing Base method specializations cause painful allocations via `invoke` (JuliaLang/julia#35972). The state of precompilation for CSV.jl is thus pretty disappointing: precompiling currently takes ~7.5s (which is fine; that's the number we're fine to have keep growing), loading precompiled CSV.jl takes ~2.5s (seems a little high, but not terrible), but the first call to `CSV.File` takes ~6.5s :sad_face:. At this point I've spent too much time and don't understand the ins and outs of how precompilation interacts with later runtime code well enough to debug further, but hopefully we can recruit some experts to help take CSV.jl precompilation to the next level. As-is, it's workable, so let's go with this for now and get this darn release out.
quinnj authored Sep 8, 2021
1 parent 9d8da50 commit 8cb5893
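For context, a rough way the load-time and first-call numbers in the commit message might be measured in a fresh Julia session (a sketch; the tiny in-memory sample is only for illustration):

```julia
# Fresh session, precompile cache already built:
@time using CSV                          # package load time (~2.5s per the message above)

# The first call pays the remaining compilation cost ("time to first file"):
@time CSV.File(IOBuffer("a,b\n1,2\n"))   # ~6.5s per the message above

# Subsequent calls hit already-compiled code and are fast:
@time CSV.File(IOBuffer("a,b\n1,2\n"))
```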
Showing 4 changed files with 65 additions and 9 deletions.
3 changes: 2 additions & 1 deletion src/CSV.jl
@@ -70,7 +70,8 @@ include("precompile.jl")
_precompile_()

function __init__()
- # CSV.File(IOBuffer(PRECOMPILE_DATA))
+ CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA))
+ # CSV.File(IOBuffer(CSV.PRECOMPILE_DATA))
# foreach(row -> row, CSV.Rows(IOBuffer(PRECOMPILE_DATA)))
# CSV.File(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv"))
end
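The `__init__` change above has the package build a `CSV.Context` over the embedded `PRECOMPILE_DATA` sample at load time, exercising header and option handling without the full `CSV.File` path that the commit message says is problematic to precompile. A hypothetical REPL check (not part of the commit) of what that call produces:

```julia
using CSV

ctx = CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA))   # the same call __init__ now makes at load time
ctx.names   # => [:int, :float, :date, :datetime, :bool, :null, :str, :catg, :int_float]
ctx.cols    # => 9
```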
57 changes: 56 additions & 1 deletion src/context.jl
@@ -118,7 +118,7 @@ struct Context
names::Vector{Symbol}
rowsguess::Int64
cols::Int
- buf::AbstractVector{UInt8}
+ buf::Vector{UInt8}
datapos::Int64
len::Int
datarow::Int
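The single deletion in this file is the `buf` field, tightened from `AbstractVector{UInt8}` to the concrete `Vector{UInt8}`. A generic illustration (not CSV.jl code) of why concretely typed struct fields help inference and avoid allocations:

```julia
struct LooseBuf
    buf::AbstractVector{UInt8}   # abstract field type: the compiler only sees an abstract type at each access
end

struct TightBuf
    buf::Vector{UInt8}           # concrete field type: every access is fully inferred
end

firstbyte(x) = x.buf[1]
# firstbyte(::LooseBuf) may go through dynamic dispatch on getindex and box the result;
# firstbyte(::TightBuf) compiles down to a direct, allocation-free array load.
```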
@@ -141,6 +141,61 @@ struct Context
streaming::Bool
end

# user-facing function if just the context is desired
function Context(source::ValidSources;
# file options
# header can be a row number, range of rows, or actual string vector
header::Union{Integer, Vector{Symbol}, Vector{String}, AbstractVector{<:Integer}}=1,
normalizenames::Bool=false,
# by default, data starts immediately after header or start of file
datarow::Integer=-1,
skipto::Integer=-1,
footerskip::Integer=0,
transpose::Bool=false,
comment::Union{String, Nothing}=nothing,
ignoreemptyrows::Bool=true,
ignoreemptylines=nothing,
select=nothing,
drop=nothing,
limit::Union{Integer, Nothing}=nothing,
buffer_in_memory::Bool=false,
threaded::Union{Bool, Nothing}=nothing,
ntasks::Union{Nothing, Integer}=nothing,
tasks::Union{Nothing, Integer}=nothing,
rows_to_check::Integer=DEFAULT_ROWS_TO_CHECK,
lines_to_check=nothing,
# parsing options
missingstrings=String[],
missingstring="",
delim::Union{Nothing, Char, String}=nothing,
ignorerepeated::Bool=false,
quoted::Bool=true,
quotechar::Union{UInt8, Char}='"',
openquotechar::Union{UInt8, Char, Nothing}=nothing,
closequotechar::Union{UInt8, Char, Nothing}=nothing,
escapechar::Union{UInt8, Char}='"',
dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
dateformats=nothing,
decimal::Union{UInt8, Char}=UInt8('.'),
truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
# type options
type=nothing,
types=nothing,
typemap::Dict=Dict{Type, Type}(),
pool::Union{Bool, Real, AbstractVector, AbstractDict}=DEFAULT_POOL,
downcast::Bool=false,
lazystrings::Bool=false,
stringtype::StringTypes=DEFAULT_STRINGTYPE,
strict::Bool=false,
silencewarnings::Bool=false,
maxwarnings::Int=DEFAULT_MAX_WARNINGS,
debug::Bool=false,
parsingdebug::Bool=false
)
return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, false)
end
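The bulk of this file's 56 added lines is the user-facing `Context` constructor above, which mirrors the keyword arguments of `CSV.File`. A hedged usage sketch (the inline data and the inspected fields are chosen here purely for illustration):

```julia
using CSV

data = IOBuffer("name;age\nAlice;30\nBob;25\n")
ctx = CSV.Context(data; delim=';', normalizenames=true)

ctx.names      # => [:name, :age]
ctx.cols       # => 2
ctx.rowsguess  # estimated number of data rows
```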

@refargs function Context(source::ValidSources,
# file options
# header can be a row number, range of rows, or actual string vector
7 changes: 3 additions & 4 deletions src/file.jl
@@ -213,13 +213,13 @@ function File(source::ValidSources;
# select=nothing;drop=nothing;limit=nothing;threaded=nothing;ntasks=Threads.nthreads();tasks=nothing;rows_to_check=30;lines_to_check=nothing;missingstrings=String[];missingstring="";
# delim=nothing;ignorerepeated=false;quoted=true;quotechar='"';openquotechar=nothing;closequotechar=nothing;escapechar='"';dateformat=nothing;
# dateformats=nothing;decimal=UInt8('.');truestrings=nothing;falsestrings=nothing;type=nothing;types=nothing;typemap=Dict{Type,Type}();
- # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=true;parsingdebug=false;buffer_in_memory=false
+ # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=false;parsingdebug=false;buffer_in_memory=false
# @descend CSV.Context(CSV.Arg(source), CSV.Arg(header), CSV.Arg(normalizenames), CSV.Arg(datarow), CSV.Arg(skipto), CSV.Arg(footerskip), CSV.Arg(transpose), CSV.Arg(comment), CSV.Arg(ignoreemptyrows), CSV.Arg(ignoreemptylines), CSV.Arg(select), CSV.Arg(drop), CSV.Arg(limit), CSV.Arg(buffer_in_memory), CSV.Arg(threaded), CSV.Arg(ntasks), CSV.Arg(tasks), CSV.Arg(rows_to_check), CSV.Arg(lines_to_check), CSV.Arg(missingstrings), CSV.Arg(missingstring), CSV.Arg(delim), CSV.Arg(ignorerepeated), CSV.Arg(quoted), CSV.Arg(quotechar), CSV.Arg(openquotechar), CSV.Arg(closequotechar), CSV.Arg(escapechar), CSV.Arg(dateformat), CSV.Arg(dateformats), CSV.Arg(decimal), CSV.Arg(truestrings), CSV.Arg(falsestrings), CSV.Arg(type), CSV.Arg(types), CSV.Arg(typemap), CSV.Arg(pool), CSV.Arg(downcast), CSV.Arg(lazystrings), CSV.Arg(stringtype), CSV.Arg(strict), CSV.Arg(silencewarnings), CSV.Arg(maxwarnings), CSV.Arg(debug), CSV.Arg(parsingdebug), CSV.Arg(false))
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, false)
return File(ctx)
end

- function File(ctx::Context, chunking::Bool=false)
+ function File(ctx::Context, @nospecialize(chunking::Bool=false))
@inbounds begin
# we now do our parsing pass over the file, starting at datapos
if ctx.threaded
@@ -235,6 +235,7 @@ function File(ctx::Context, chunking::Bool=false)
rows = zeros(Int64, ntasks) # how many rows each parsing task ended up actually parsing
@sync for i = 1:ntasks
Threads.@spawn multithreadparse(ctx, pertaskcolumns, rowchunkguess, i, rows, wholecolumnslock)
+ # CSV.multithreadparse(ctx, pertaskcolumns, rowchunkguess, i, rows, wholecolumnslock)
end
finalrows = sum(rows)
if ctx.limit < finalrows
@@ -740,9 +741,7 @@ end
function detectcell(buf, pos, len, row, rowoffset, i, col, ctx, rowsguess)::Tuple{Int64, Int16}
# debug && println("detecting on task $(Threads.threadid())")
opts = col.options
- # stats = Base.gc_num()
code, tlen, x, xT = detect(pass, buf, pos, len, opts, false, ctx.downcast, rowoffset + row, i)
- # diff = GC_Diff(gc_num(), stats)
if x === missing
col.anymissing = true
@goto finaldone
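Among the file.jl changes above, `File(ctx::Context, chunking::Bool=false)` gains a `@nospecialize` on `chunking`, which asks the compiler not to specialize the large method body on that argument. A generic illustration of the annotation (not CSV.jl code):

```julia
# Without @nospecialize, Julia compiles a separate specialization of `summarize`
# for each concrete argument type; with it, one generically typed body is reused.
function summarize(@nospecialize(x))
    return string(typeof(x), ": ", sizeof(x), " bytes")
end

summarize(1)            # Int64
summarize(1.0)          # Float64
summarize(UInt8[1, 2])  # Vector{UInt8}; all three reuse the same compiled body
```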
7 changes: 4 additions & 3 deletions src/precompile.jl
@@ -1,7 +1,8 @@
const PRECOMPILE_DATA = "int,float,date,datetime,bool,null,str,catg,int_float\n1,3.14,2019-01-01,2019-01-01T01:02:03,true,,hey,abc,2\n2,NaN,2019-01-02,2019-01-03T01:02:03,false,,there,abc,3.14\n"
function _precompile_()
- # ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
- # CSV.File(IOBuffer(PRECOMPILE_DATA))
+ ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
+ while false; end
+ # CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA))
# foreach(row -> row, CSV.Rows(IOBuffer(PRECOMPILE_DATA)))
- # CSV.File(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv"))
+ CSV.Context(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv"))
end
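The now-active `ccall(:jl_generating_output, Cint, ())` check above is the standard guard for precompile workloads: it returns 1 only while Julia is writing a precompile cache file, so `_precompile_` is a no-op in ordinary sessions, and the `CSV.Context` call over the bundled `promotions.csv` test file is what actually gets compiled into the cache. A minimal sketch of the pattern, using a hypothetical package and warm-up call:

```julia
module MyPkg

greet(name::AbstractString) = "hello, $name"   # hypothetical entry point to warm up

function _precompile_()
    # Do nothing unless Julia is currently generating a precompile cache file.
    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
    # Exercise a representative code path so its compiled code lands in the cache.
    greet("world")
    return nothing
end

_precompile_()   # run at module top level, i.e. while the module is being precompiled

end # module
```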
