Commit
Fix a few allocation/inference issues (#885)
This was from an effort to play around with precompilation some more. There's a tricky balance: if we precompile too much of `CSV.File`, we run into the Base bug where missing Base method specializations cause painful allocations via `invoke` (JuliaLang/julia#35972). The state of precompilation for CSV.jl is thus pretty disappointing: precompiling currently takes ~7.5s (which is fine; that's the number we're fine to have keep growing), loading precompiled CSV.jl takes ~2.5s (seems a little high, but not terrible), but the first call to `CSV.File` takes ~6.5s :sad_face:. At this point I've spent too much time and don't understand the ins and outs of how precompilation interacts with later runtime code well enough to debug further, but hopefully we can recruit some experts to help take CSV.jl precompilation to the next level. As-is, it's workable, so let's go with this for now and get this darn release out.
quinnj authored Sep 8, 2021
1 parent 9d8da50 commit 8cb5893
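For context, a rough way the load-time and first-call numbers in the commit message might be measured in a fresh Julia session (a sketch; the tiny in-memory sample is only for illustration):

```julia
# Fresh session, precompile cache already built:
@time using CSV                          # package load time (~2.5s per the message above)

# The first call pays the remaining compilation cost ("time to first file"):
@time CSV.File(IOBuffer("a,b\n1,2\n"))   # ~6.5s per the message above

# Subsequent calls hit already-compiled code and are fast:
@time CSV.File(IOBuffer("a,b\n1,2\n"))
```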
Showing 4 changed files with 65 additions and 9 deletions.
3 changes: 2 additions & 1 deletion src/CSV.jl
@@ -70,7 +70,8 @@ include("precompile.jl")
_precompile_()

function __init__()
- # CSV.File(IOBuffer(PRECOMPILE_DATA))
+ CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA))
+ # CSV.File(IOBuffer(CSV.PRECOMPILE_DATA))
# foreach(row -> row, CSV.Rows(IOBuffer(PRECOMPILE_DATA)))
# CSV.File(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv"))
end
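The `__init__` change above has the package build a `CSV.Context` over the embedded `PRECOMPILE_DATA` sample at load time, exercising header and option handling without the full `CSV.File` path that the commit message says is problematic to precompile. A hypothetical REPL check (not part of the commit) of what that call produces:

```julia
using CSV

ctx = CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA))   # the same call __init__ now makes at load time
ctx.names   # => [:int, :float, :date, :datetime, :bool, :null, :str, :catg, :int_float]
ctx.cols    # => 9
```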
57 changes: 56 additions & 1 deletion src/context.jl
@@ -118,7 +118,7 @@ struct Context
names::Vector{Symbol}
rowsguess::Int64
cols::Int
- buf::AbstractVector{UInt8}
+ buf::Vector{UInt8}
datapos::Int64
len::Int
datarow::Int
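The single deletion in this file is the `buf` field, tightened from `AbstractVector{UInt8}` to the concrete `Vector{UInt8}`. A generic illustration (not CSV.jl code) of why concretely typed struct fields help inference and avoid allocations:

```julia
struct LooseBuf
    buf::AbstractVector{UInt8}   # abstract field type: the compiler only sees an abstract type at each access
end

struct TightBuf
    buf::Vector{UInt8}           # concrete field type: every access is fully inferred
end

firstbyte(x) = x.buf[1]
# firstbyte(::LooseBuf) may go through dynamic dispatch on getindex and box the result;
# firstbyte(::TightBuf) compiles down to a direct, allocation-free array load.
```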
@@ -141,6 +141,61 @@ struct Context
streaming::Bool
end

# user-facing function if just the context is desired
function Context(source::ValidSources;
# file options
# header can be a row number, range of rows, or actual string vector
header::Union{Integer, Vector{Symbol}, Vector{String}, AbstractVector{<:Integer}}=1,
normalizenames::Bool=false,
# by default, data starts immediately after header or start of file
datarow::Integer=-1,
skipto::Integer=-1,
footerskip::Integer=0,
transpose::Bool=false,
comment::Union{String, Nothing}=nothing,
ignoreemptyrows::Bool=true,
ignoreemptylines=nothing,
select=nothing,
drop=nothing,
limit::Union{Integer, Nothing}=nothing,
buffer_in_memory::Bool=false,
threaded::Union{Bool, Nothing}=nothing,
ntasks::Union{Nothing, Integer}=nothing,
tasks::Union{Nothing, Integer}=nothing,
rows_to_check::Integer=DEFAULT_ROWS_TO_CHECK,
lines_to_check=nothing,
# parsing options
missingstrings=String[],
missingstring="",
delim::Union{Nothing, Char, String}=nothing,
ignorerepeated::Bool=false,
quoted::Bool=true,
quotechar::Union{UInt8, Char}='"',
openquotechar::Union{UInt8, Char, Nothing}=nothing,
closequotechar::Union{UInt8, Char, Nothing}=nothing,
escapechar::Union{UInt8, Char}='"',
dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing,
dateformats=nothing,
decimal::Union{UInt8, Char}=UInt8('.'),
truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS,
falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS,
# type options
type=nothing,
types=nothing,
typemap::Dict=Dict{Type, Type}(),
pool::Union{Bool, Real, AbstractVector, AbstractDict}=DEFAULT_POOL,
downcast::Bool=false,
lazystrings::Bool=false,
stringtype::StringTypes=DEFAULT_STRINGTYPE,
strict::Bool=false,
silencewarnings::Bool=false,
maxwarnings::Int=DEFAULT_MAX_WARNINGS,
debug::Bool=false,
parsingdebug::Bool=false
)
return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, false)
end
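The bulk of this file's 56 added lines is the user-facing `Context` constructor above, which mirrors the keyword arguments of `CSV.File`. A hedged usage sketch (the inline data and the inspected fields are chosen here purely for illustration):

```julia
using CSV

data = IOBuffer("name;age\nAlice;30\nBob;25\n")
ctx = CSV.Context(data; delim=';', normalizenames=true)

ctx.names      # => [:name, :age]
ctx.cols       # => 2
ctx.rowsguess  # estimated number of data rows
```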

@refargs function Context(source::ValidSources,
# file options
# header can be a row number, range of rows, or actual string vector
7 changes: 3 additions & 4 deletions src/file.jl
@@ -213,13 +213,13 @@ function File(source::ValidSources;
# select=nothing;drop=nothing;limit=nothing;threaded=nothing;ntasks=Threads.nthreads();tasks=nothing;rows_to_check=30;lines_to_check=nothing;missingstrings=String[];missingstring="";
# delim=nothing;ignorerepeated=false;quoted=true;quotechar='"';openquotechar=nothing;closequotechar=nothing;escapechar='"';dateformat=nothing;
# dateformats=nothing;decimal=UInt8('.');truestrings=nothing;falsestrings=nothing;type=nothing;types=nothing;typemap=Dict{Type,Type}();
- # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=true;parsingdebug=false;buffer_in_memory=false
+ # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=false;parsingdebug=false;buffer_in_memory=false
# @descend CSV.Context(CSV.Arg(source), CSV.Arg(header), CSV.Arg(normalizenames), CSV.Arg(datarow), CSV.Arg(skipto), CSV.Arg(footerskip), CSV.Arg(transpose), CSV.Arg(comment), CSV.Arg(ignoreemptyrows), CSV.Arg(ignoreemptylines), CSV.Arg(select), CSV.Arg(drop), CSV.Arg(limit), CSV.Arg(buffer_in_memory), CSV.Arg(threaded), CSV.Arg(ntasks), CSV.Arg(tasks), CSV.Arg(rows_to_check), CSV.Arg(lines_to_check), CSV.Arg(missingstrings), CSV.Arg(missingstring), CSV.Arg(delim), CSV.Arg(ignorerepeated), CSV.Arg(quoted), CSV.Arg(quotechar), CSV.Arg(openquotechar), CSV.Arg(closequotechar), CSV.Arg(escapechar), CSV.Arg(dateformat), CSV.Arg(dateformats), CSV.Arg(decimal), CSV.Arg(truestrings), CSV.Arg(falsestrings), CSV.Arg(type), CSV.Arg(types), CSV.Arg(typemap), CSV.Arg(pool), CSV.Arg(downcast), CSV.Arg(lazystrings), CSV.Arg(stringtype), CSV.Arg(strict), CSV.Arg(silencewarnings), CSV.Arg(maxwarnings), CSV.Arg(debug), CSV.Arg(parsingdebug), CSV.Arg(false))
ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, false)
return File(ctx)
end

- function File(ctx::Context, chunking::Bool=false)
+ function File(ctx::Context, @nospecialize(chunking::Bool=false))
@inbounds begin
# we now do our parsing pass over the file, starting at datapos
if ctx.threaded
@@ -235,6 +235,7 @@ function File(ctx::Context, chunking::Bool=false)
rows = zeros(Int64, ntasks) # how many rows each parsing task ended up actually parsing
@sync for i = 1:ntasks
Threads.@spawn multithreadparse(ctx, pertaskcolumns, rowchunkguess, i, rows, wholecolumnslock)
+ # CSV.multithreadparse(ctx, pertaskcolumns, rowchunkguess, i, rows, wholecolumnslock)
end
finalrows = sum(rows)
if ctx.limit < finalrows
@@ -740,9 +741,7 @@ end
function detectcell(buf, pos, len, row, rowoffset, i, col, ctx, rowsguess)::Tuple{Int64, Int16}
# debug && println("detecting on task $(Threads.threadid())")
opts = col.options
- # stats = Base.gc_num()
code, tlen, x, xT = detect(pass, buf, pos, len, opts, false, ctx.downcast, rowoffset + row, i)
- # diff = GC_Diff(gc_num(), stats)
if x === missing
col.anymissing = true
@goto finaldone
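Among the file.jl changes above, `File(ctx::Context, chunking::Bool=false)` gains a `@nospecialize` on `chunking`, which asks the compiler not to specialize the large method body on that argument. A generic illustration of the annotation (not CSV.jl code):

```julia
# Without @nospecialize, Julia compiles a separate specialization of `summarize`
# for each concrete argument type; with it, one generically typed body is reused.
function summarize(@nospecialize(x))
    return string(typeof(x), ": ", sizeof(x), " bytes")
end

summarize(1)            # Int64
summarize(1.0)          # Float64
summarize(UInt8[1, 2])  # Vector{UInt8}; all three reuse the same compiled body
```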
7 changes: 4 additions & 3 deletions src/precompile.jl
@@ -1,7 +1,8 @@
const PRECOMPILE_DATA = "int,float,date,datetime,bool,null,str,catg,int_float\n1,3.14,2019-01-01,2019-01-01T01:02:03,true,,hey,abc,2\n2,NaN,2019-01-02,2019-01-03T01:02:03,false,,there,abc,3.14\n"
function _precompile_()
- # ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
- # CSV.File(IOBuffer(PRECOMPILE_DATA))
+ ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
+ while false; end
+ # CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA))
# foreach(row -> row, CSV.Rows(IOBuffer(PRECOMPILE_DATA)))
- # CSV.File(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv"))
+ CSV.Context(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv"))
end
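The now-active `ccall(:jl_generating_output, Cint, ())` check above is the standard guard for precompile workloads: it returns 1 only while Julia is writing a precompile cache file, so `_precompile_` is a no-op in ordinary sessions, and the `CSV.Context` call over the bundled `promotions.csv` test file is what actually gets compiled into the cache. A minimal sketch of the pattern, using a hypothetical package and warm-up call:

```julia
module MyPkg

greet(name::AbstractString) = "hello, $name"   # hypothetical entry point to warm up

function _precompile_()
    # Do nothing unless Julia is currently generating a precompile cache file.
    ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
    # Exercise a representative code path so its compiled code lands in the cache.
    greet("world")
    return nothing
end

_precompile_()   # run at module top level, i.e. while the module is being precompiled

end # module
```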
