diff --git a/src/CSV.jl b/src/CSV.jl index 617cdcf2..9ff52a09 100644 --- a/src/CSV.jl +++ b/src/CSV.jl @@ -70,7 +70,8 @@ include("precompile.jl") _precompile_() function __init__() - # CSV.File(IOBuffer(PRECOMPILE_DATA)) + CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA)) + # CSV.File(IOBuffer(CSV.PRECOMPILE_DATA)) # foreach(row -> row, CSV.Rows(IOBuffer(PRECOMPILE_DATA))) # CSV.File(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv")) end diff --git a/src/context.jl b/src/context.jl index 4a4a0bc5..e134a630 100644 --- a/src/context.jl +++ b/src/context.jl @@ -118,7 +118,7 @@ struct Context names::Vector{Symbol} rowsguess::Int64 cols::Int - buf::AbstractVector{UInt8} + buf::Vector{UInt8} datapos::Int64 len::Int datarow::Int @@ -141,6 +141,61 @@ struct Context streaming::Bool end +# user-facing function if just the context is desired +function Context(source::ValidSources; + # file options + # header can be a row number, range of rows, or actual string vector + header::Union{Integer, Vector{Symbol}, Vector{String}, AbstractVector{<:Integer}}=1, + normalizenames::Bool=false, + # by default, data starts immediately after header or start of file + datarow::Integer=-1, + skipto::Integer=-1, + footerskip::Integer=0, + transpose::Bool=false, + comment::Union{String, Nothing}=nothing, + ignoreemptyrows::Bool=true, + ignoreemptylines=nothing, + select=nothing, + drop=nothing, + limit::Union{Integer, Nothing}=nothing, + buffer_in_memory::Bool=false, + threaded::Union{Bool, Nothing}=nothing, + ntasks::Union{Nothing, Integer}=nothing, + tasks::Union{Nothing, Integer}=nothing, + rows_to_check::Integer=DEFAULT_ROWS_TO_CHECK, + lines_to_check=nothing, + # parsing options + missingstrings=String[], + missingstring="", + delim::Union{Nothing, Char, String}=nothing, + ignorerepeated::Bool=false, + quoted::Bool=true, + quotechar::Union{UInt8, Char}='"', + openquotechar::Union{UInt8, Char, Nothing}=nothing, + closequotechar::Union{UInt8, Char, Nothing}=nothing, + escapechar::Union{UInt8, Char}='"', + dateformat::Union{String, Dates.DateFormat, Nothing, AbstractDict}=nothing, + dateformats=nothing, + decimal::Union{UInt8, Char}=UInt8('.'), + truestrings::Union{Vector{String}, Nothing}=TRUE_STRINGS, + falsestrings::Union{Vector{String}, Nothing}=FALSE_STRINGS, + # type options + type=nothing, + types=nothing, + typemap::Dict=Dict{Type, Type}(), + pool::Union{Bool, Real, AbstractVector, AbstractDict}=DEFAULT_POOL, + downcast::Bool=false, + lazystrings::Bool=false, + stringtype::StringTypes=DEFAULT_STRINGTYPE, + strict::Bool=false, + silencewarnings::Bool=false, + maxwarnings::Int=DEFAULT_MAX_WARNINGS, + debug::Bool=false, + parsingdebug::Bool=false + ) + return @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, false) +end + @refargs function Context(source::ValidSources, # file options # header can be a row number, range of rows, or actual string vector diff --git a/src/file.jl b/src/file.jl index 7d3a6e36..0a8c09fd 100644 --- a/src/file.jl +++ b/src/file.jl @@ -213,13 +213,13 @@ function File(source::ValidSources; # select=nothing;drop=nothing;limit=nothing;threaded=nothing;ntasks=Threads.nthreads();tasks=nothing;rows_to_check=30;lines_to_check=nothing;missingstrings=String[];missingstring=""; # delim=nothing;ignorerepeated=false;quoted=true;quotechar='"';openquotechar=nothing;closequotechar=nothing;escapechar='"';dateformat=nothing; # dateformats=nothing;decimal=UInt8('.');truestrings=nothing;falsestrings=nothing;type=nothing;types=nothing;typemap=Dict{Type,Type}(); - # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=true;parsingdebug=false;buffer_in_memory=false + # pool=CSV.DEFAULT_POOL;downcast=false;lazystrings=false;stringtype=String;strict=false;silencewarnings=false;maxwarnings=100;debug=false;parsingdebug=false;buffer_in_memory=false # @descend CSV.Context(CSV.Arg(source), CSV.Arg(header), CSV.Arg(normalizenames), CSV.Arg(datarow), CSV.Arg(skipto), CSV.Arg(footerskip), CSV.Arg(transpose), CSV.Arg(comment), CSV.Arg(ignoreemptyrows), CSV.Arg(ignoreemptylines), CSV.Arg(select), CSV.Arg(drop), CSV.Arg(limit), CSV.Arg(buffer_in_memory), CSV.Arg(threaded), CSV.Arg(ntasks), CSV.Arg(tasks), CSV.Arg(rows_to_check), CSV.Arg(lines_to_check), CSV.Arg(missingstrings), CSV.Arg(missingstring), CSV.Arg(delim), CSV.Arg(ignorerepeated), CSV.Arg(quoted), CSV.Arg(quotechar), CSV.Arg(openquotechar), CSV.Arg(closequotechar), CSV.Arg(escapechar), CSV.Arg(dateformat), CSV.Arg(dateformats), CSV.Arg(decimal), CSV.Arg(truestrings), CSV.Arg(falsestrings), CSV.Arg(type), CSV.Arg(types), CSV.Arg(typemap), CSV.Arg(pool), CSV.Arg(downcast), CSV.Arg(lazystrings), CSV.Arg(stringtype), CSV.Arg(strict), CSV.Arg(silencewarnings), CSV.Arg(maxwarnings), CSV.Arg(debug), CSV.Arg(parsingdebug), CSV.Arg(false)) ctx = @refargs Context(source, header, normalizenames, datarow, skipto, footerskip, transpose, comment, ignoreemptyrows, ignoreemptylines, select, drop, limit, buffer_in_memory, threaded, ntasks, tasks, rows_to_check, lines_to_check, missingstrings, missingstring, delim, ignorerepeated, quoted, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, pool, downcast, lazystrings, stringtype, strict, silencewarnings, maxwarnings, debug, parsingdebug, false) return File(ctx) end -function File(ctx::Context, chunking::Bool=false) +function File(ctx::Context, @nospecialize(chunking::Bool=false)) @inbounds begin # we now do our parsing pass over the file, starting at datapos if ctx.threaded @@ -235,6 +235,7 @@ function File(ctx::Context, chunking::Bool=false) rows = zeros(Int64, ntasks) # how many rows each parsing task ended up actually parsing @sync for i = 1:ntasks Threads.@spawn multithreadparse(ctx, pertaskcolumns, rowchunkguess, i, rows, wholecolumnslock) + # CSV.multithreadparse(ctx, pertaskcolumns, rowchunkguess, i, rows, wholecolumnslock) end finalrows = sum(rows) if ctx.limit < finalrows @@ -740,9 +741,7 @@ end function detectcell(buf, pos, len, row, rowoffset, i, col, ctx, rowsguess)::Tuple{Int64, Int16} # debug && println("detecting on task $(Threads.threadid())") opts = col.options - # stats = Base.gc_num() code, tlen, x, xT = detect(pass, buf, pos, len, opts, false, ctx.downcast, rowoffset + row, i) - # diff = GC_Diff(gc_num(), stats) if x === missing col.anymissing = true @goto finaldone diff --git a/src/precompile.jl b/src/precompile.jl index 364b9c2a..07b7636a 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,7 +1,8 @@ const PRECOMPILE_DATA = "int,float,date,datetime,bool,null,str,catg,int_float\n1,3.14,2019-01-01,2019-01-01T01:02:03,true,,hey,abc,2\n2,NaN,2019-01-02,2019-01-03T01:02:03,false,,there,abc,3.14\n" function _precompile_() - # ccall(:jl_generating_output, Cint, ()) == 1 || return nothing - # CSV.File(IOBuffer(PRECOMPILE_DATA)) + ccall(:jl_generating_output, Cint, ()) == 1 || return nothing + while false; end + # CSV.Context(IOBuffer(CSV.PRECOMPILE_DATA)) # foreach(row -> row, CSV.Rows(IOBuffer(PRECOMPILE_DATA))) - # CSV.File(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv")) + CSV.Context(joinpath(dirname(pathof(CSV)), "..", "test", "testfiles", "promotions.csv")) end