diff --git a/docs/src/api.md b/docs/src/api.md index 045d6420..8b772387 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -38,6 +38,7 @@ JuliaSyntax.highlight JuliaSyntax.sourcetext JuliaSyntax.source_line JuliaSyntax.source_location +JuliaSyntax.source_line_range ``` ## Expression heads/kinds diff --git a/prototypes/simple_parser.jl b/prototypes/simple_parser.jl index 918f12ed..06a408a2 100644 --- a/prototypes/simple_parser.jl +++ b/prototypes/simple_parser.jl @@ -140,7 +140,7 @@ end function parse_and_show(production::Function, code) st = ParseStream(code) production(st) - t = JuliaSyntax.build_tree(GreenNode, st, wrap_toplevel_as_kind=K"error") + t = JuliaSyntax.build_tree(GreenNode, st) show(stdout, MIME"text/plain"(), t, code, show_trivia=true) if !isempty(st.diagnostics) println() diff --git a/src/expr.jl b/src/expr.jl index 09416874..845939ac 100644 --- a/src/expr.jl +++ b/src/expr.jl @@ -68,7 +68,7 @@ function _strip_parens(ex) end end -function _leaf_to_Expr(source, head, srcrange, node) +function _leaf_to_Expr(source, txtbuf, head, srcrange, node) k = kind(head) if k == K"core_@cmd" return GlobalRef(Core, Symbol("@cmd")) @@ -79,7 +79,7 @@ function _leaf_to_Expr(source, head, srcrange, node) Expr(:error) : Expr(:error, "$(_token_error_descriptions[k]): `$(source[srcrange])`") else - val = isnothing(node) ? parse_julia_literal(source, head, srcrange) : node.val + val = isnothing(node) ? parse_julia_literal(txtbuf, head, srcrange) : node.val if val isa Union{Int128,UInt128,BigInt} # Ignore the values of large integers and convert them back to # symbolic/textural form for compatibility with the Expr @@ -457,7 +457,9 @@ end function build_tree(::Type{Expr}, stream::ParseStream; filename=nothing, first_line=1, kws...) 
- source = SourceFile(sourcetext(stream), filename=filename, first_line=first_line) + source = SourceFile(sourcetext(stream), first_index=first_byte(stream), + filename=filename, first_line=first_line) + txtbuf = textbuf(stream) args = Any[] childranges = UnitRange{Int}[] childheads = SyntaxHead[] @@ -467,7 +469,7 @@ function build_tree(::Type{Expr}, stream::ParseStream; end k = kind(head) if isnothing(nodechildren) - ex = _leaf_to_Expr(source, head, srcrange, nothing) + ex = _leaf_to_Expr(source, txtbuf, head, srcrange, nothing) else resize!(childranges, length(nodechildren)) resize!(childheads, length(nodechildren)) @@ -487,7 +489,8 @@ end function _to_expr(node::SyntaxNode) if !haschildren(node) - return _leaf_to_Expr(node.source, head(node), range(node), node) + offset, txtbuf = _unsafe_wrap_substring(sourcetext(node.source)) + return _leaf_to_Expr(node.source, txtbuf, head(node), range(node) .+ offset, node) end cs = children(node) args = Any[_to_expr(c) for c in cs] diff --git a/src/hooks.jl b/src/hooks.jl index 63a44628..c50e89b9 100644 --- a/src/hooks.jl +++ b/src/hooks.jl @@ -157,10 +157,7 @@ function core_parser_hook(code, filename::String, lineno::Int, offset::Int, opti write(_debug_log[], code) end - io = IOBuffer(code) - seek(io, offset) - - stream = ParseStream(io) + stream = ParseStream(code, offset+1) if options === :statement || options === :atom # To copy the flisp parser driver: # * Parsing atoms consumes leading trivia @@ -179,9 +176,7 @@ function core_parser_hook(code, filename::String, lineno::Int, offset::Int, opti end if any_error(stream) - tree = build_tree(SyntaxNode, stream, - wrap_toplevel_as_kind=K"None", first_line=lineno, - filename=filename) + tree = build_tree(SyntaxNode, stream, first_line=lineno, filename=filename) tag = _incomplete_tag(tree, lastindex(code)) if _has_v1_10_hooks exc = ParseError(stream, filename=filename, first_line=lineno, @@ -233,13 +228,7 @@ function core_parser_hook(code, filename::String, lineno::Int, 
offset::Int, opti # # show_diagnostics(stdout, stream.diagnostics, code) # - ex = build_tree(Expr, stream; filename=filename, - wrap_toplevel_as_kind=K"None", first_line=lineno) - if @isexpr(ex, :None) - # The None wrapping is only to give somewhere for trivia to be - # attached; unwrap! - ex = only(ex.args) - end + ex = build_tree(Expr, stream; filename=filename, first_line=lineno) end # Note the next byte in 1-based indexing is `last_byte(stream) + 1` but @@ -291,15 +280,13 @@ else Base.Meta.ParseError(e::JuliaSyntax.ParseError) = e end -const _default_parser = _has_v1_6_hooks ? Core._parse : nothing - """ enable_in_core!([enable=true; freeze_world_age=true, debug_filename=nothing]) Connect the JuliaSyntax parser to the Julia runtime so that it replaces the flisp parser for all parsing work. That is, JuliaSyntax will be used for -`include()` `Meta.parse()`, the REPL, etc. To disable, set use -`enable_in_core!(false)`. +`include()`, `Meta.parse()`, the REPL, etc. To reset to the reference parser, +use `enable_in_core!(false)`. Keyword arguments: * `freeze_world_age` - Use a fixed world age for the parser to prevent @@ -322,7 +309,7 @@ function enable_in_core!(enable=true; freeze_world_age = true, world_age = freeze_world_age ? 
Base.get_world_counter() : typemax(UInt) _set_core_parse_hook(fix_world_age(core_parser_hook, world_age)) else - _set_core_parse_hook(_default_parser) + _set_core_parse_hook(Core.Compiler.fl_parse) end nothing end diff --git a/src/kinds.jl b/src/kinds.jl index 6dcc3557..54f37e88 100644 --- a/src/kinds.jl +++ b/src/kinds.jl @@ -914,6 +914,8 @@ const _kind_names = "cartesian_iterator" "comprehension" "typed_comprehension" + # Container for a single statement/atom plus any trivia and errors + "wrapper" "END_SYNTAX_KINDS" ] diff --git a/src/literal_parsing.jl b/src/literal_parsing.jl index 67a7af6a..5cd610ce 100644 --- a/src/literal_parsing.jl +++ b/src/literal_parsing.jl @@ -68,7 +68,7 @@ Like `Base.parse(Union{Float64,Float32}, str)`, but permits float underflow Parse a Float64. str[firstind:lastind] must be a valid floating point literal string. If the value is outside Float64 range. """ -function parse_float_literal(::Type{T}, str::String, +function parse_float_literal(::Type{T}, str::Union{String,SubString,Vector{UInt8}}, firstind::Integer, endind::Integer) where {T} # force specialize with where {T} strsize = endind - firstind bufsz = 50 @@ -172,70 +172,68 @@ end #------------------------------------------------------------------------------- -is_indentation(c) = c == ' ' || c == '\t' - """ Process Julia source code escape sequences for raw strings """ -function unescape_raw_string(io::IO, str::AbstractString, is_cmd::Bool) - delim = is_cmd ? '`' : '"' - i = firstindex(str) - lastidx = lastindex(str) - while i <= lastidx - c = str[i] - if c != '\\' - if c == '\r' +function unescape_raw_string(io::IO, txtbuf::Vector{UInt8}, + firstind, endind, is_cmd::Bool) + delim = is_cmd ? 
u8"`" : u8"\"" + i = firstind + while i < endind + c = txtbuf[i] + if c != u8"\\" + if c == u8"\r" # convert literal \r and \r\n in strings to \n (issue #11988) - if i+1 <= lastidx && str[i+1] == '\n' + if i+1 < endind && txtbuf[i+1] == u8"\n" i += 1 end - c = '\n' + c = u8"\n" end write(io, c) - i = nextind(str, i) + i += 1 continue end # Process \ escape sequences j = i - while j <= lastidx && str[j] == '\\' + while j < endind && txtbuf[j] == u8"\\" j += 1 end nbackslash = j - i - if (j <= lastidx && str[j] == delim) || j > lastidx + if (j < endind && txtbuf[j] == delim) || j >= endind # Backslashes before a delimiter must also be escaped nbackslash = div(nbackslash,2) end for k = 1:nbackslash - write(io, '\\') + write(io, u8"\\") end i = j - if i <= lastidx - write(io, str[i]) - i = nextind(str, i) + if i < endind + write(io, txtbuf[i]) + i += 1 end end end """ Process Julia source code escape sequences for non-raw strings. -`str` should be passed without delimiting quotes. +`txtbuf` should be passed without delimiting quotes. """ -function unescape_julia_string(io::IO, str::AbstractString, +function unescape_julia_string(io::IO, txtbuf::Vector{UInt8}, firstind, endind, diagnostics) had_error = false i = firstind while i < endind - c = str[i] - if c != '\\' - if c == '\r' + c = txtbuf[i] + if c != u8"\\" + if c == u8"\r" # convert literal \r and \r\n in strings to \n (issue #11988) - if i+1 < endind && str[i+1] == '\n' + if i+1 < endind && txtbuf[i+1] == u8"\n" i += 1 end - c = '\n' + c = u8"\n" end write(io, c) - i = nextind(str, i) + i = nextind(txtbuf, i) continue end # Process \ escape sequences. See also Base.unescape_string which some @@ -248,20 +246,20 @@ function unescape_julia_string(io::IO, str::AbstractString, had_error = true break end - c = str[i] - if c == 'x' || c == 'u' || c == 'U' + c = txtbuf[i] + if c == u8"x" || c == u8"u" || c == u8"U" n = k = 0 - m = c == 'x' ? 2 : - c == 'u' ? 4 : 8 + m = c == u8"x" ? 2 : + c == u8"u" ? 
4 : 8 while (k += 1) <= m && i+1 < endind - nc = str[i+1] - n = '0' <= nc <= '9' ? n<<4 + (nc-'0') : - 'a' <= nc <= 'f' ? n<<4 + (nc-'a'+10) : - 'A' <= nc <= 'F' ? n<<4 + (nc-'A'+10) : break + nc = txtbuf[i+1] + n = u8"0" <= nc <= u8"9" ? n<<4 + (nc-u8"0") : + u8"a" <= nc <= u8"f" ? n<<4 + (nc-u8"a"+10) : + u8"A" <= nc <= u8"F" ? n<<4 + (nc-u8"A"+10) : break i += 1 end if k == 1 || n > 0x10ffff - u = m == 4 ? 'u' : 'U' + u = m == 4 ? u8"u" : u8"U" msg = (m == 2) ? "invalid hex escape sequence" : "invalid unicode escape sequence" emit_diagnostic(diagnostics, escstart:i, error=msg) @@ -273,12 +271,12 @@ function unescape_julia_string(io::IO, str::AbstractString, print(io, Char(n)) end end - elseif '0' <= c <= '7' + elseif u8"0" <= c <= u8"7" k = 1 - n = c-'0' + n = Int(c - u8"0") while (k += 1) <= 3 && i+1 < endind - c = str[i+1] - n = ('0' <= c <= '7') ? n<<3 + c-'0' : break + c = txtbuf[i+1] + n = (u8"0" <= c <= u8"7") ? n<<3 + c-u8"0" : break i += 1 end if n > 255 @@ -290,20 +288,20 @@ function unescape_julia_string(io::IO, str::AbstractString, end else u = # C escapes - c == 'n' ? '\n' : - c == 't' ? '\t' : - c == 'r' ? '\r' : - c == 'e' ? '\e' : - c == 'b' ? '\b' : - c == 'f' ? '\f' : - c == 'v' ? '\v' : - c == 'a' ? '\a' : + c == u8"n" ? u8"\n" : + c == u8"t" ? u8"\t" : + c == u8"r" ? u8"\r" : + c == u8"e" ? u8"\e" : + c == u8"b" ? u8"\b" : + c == u8"f" ? u8"\f" : + c == u8"v" ? u8"\v" : + c == u8"a" ? u8"\a" : # Literal escapes allowed in Julia source - c == '\\' ? '\\' : - c == '\'' ? '\'' : - c == '"' ? '"' : - c == '$' ? '$' : - c == '`' ? '`' : + c == u8"\\" ? u8"\\" : + c == u8"'" ? u8"'" : + c == u8"\"" ? u8"\"" : + c == u8"$" ? u8"$" : + c == u8"`" ? 
u8"`" : nothing if isnothing(u) emit_diagnostic(diagnostics, escstart:i, @@ -313,7 +311,10 @@ function unescape_julia_string(io::IO, str::AbstractString, write(io, u) end end - i = nextind(str, i) + # For non-ascii characters we may now be in the middle of the UTF-8 + # encoding for that char, but this doesn't matter because unescaping + # only relies on the ascii subset. + i += 1 end return had_error end @@ -362,43 +363,56 @@ end #------------------------------------------------------------------------------- -function parse_julia_literal(source, head::SyntaxHead, srcrange) +function parse_julia_literal(txtbuf::Vector{UInt8}, head::SyntaxHead, srcrange) # Leaf node k = kind(head) - val_str = view(source, srcrange) # Any errors parsing literals are represented as ErrorVal() - this can # happen when the user sets `ignore_errors=true` during parsing. - val = if k == K"Integer" - parse_int_literal(val_str) - elseif k == K"Float" - v, code = parse_float_literal(Float64, source.code, first(srcrange), + if k == K"Float" + v, code = parse_float_literal(Float64, txtbuf, first(srcrange), last(srcrange)+1) - (code === :ok || code === :underflow) ? v : ErrorVal() + return (code === :ok || code === :underflow) ? v : ErrorVal() elseif k == K"Float32" - v, code = parse_float_literal(Float32, source.code, first(srcrange), + v, code = parse_float_literal(Float32, txtbuf, first(srcrange), last(srcrange)+1) - (code === :ok || code === :underflow) ? v : ErrorVal() - elseif k in KSet"BinInt OctInt HexInt" - parse_uint_literal(val_str, k) - elseif k == K"true" - true - elseif k == K"false" - false + return (code === :ok || code === :underflow) ? v : ErrorVal() elseif k == K"Char" io = IOBuffer() - had_error = unescape_julia_string(io, source.code, first(srcrange), + had_error = unescape_julia_string(io, txtbuf, first(srcrange), last(srcrange)+1, Diagnostic[]) if had_error - ErrorVal() + return ErrorVal() else seek(io, 0) c = read(io, Char) - eof(io) ? c : ErrorVal() + return eof(io) ? 
c : ErrorVal() + end + elseif k in KSet"String CmdString" + io = IOBuffer() + had_error = false + if has_flags(head, RAW_STRING_FLAG) + unescape_raw_string(io, txtbuf, first(srcrange), last(srcrange)+1, + k == K"CmdString") + else + had_error = unescape_julia_string(io, txtbuf, first(srcrange), + last(srcrange)+1, Diagnostic[]) end + return had_error ? ErrorVal() : String(take!(io)) + elseif k == K"true" + return true + elseif k == K"false" + return false + end + + val_str = String(txtbuf[srcrange]) + if k == K"Integer" + parse_int_literal(val_str) + elseif k in KSet"BinInt OctInt HexInt" + parse_uint_literal(val_str, k) elseif k == K"Identifier" if has_flags(head, RAW_STRING_FLAG) io = IOBuffer() - unescape_raw_string(io, val_str, false) + unescape_raw_string(io, txtbuf, first(srcrange), last(srcrange)+1, false) Symbol(normalize_identifier(String(take!(io)))) else Symbol(normalize_identifier(val_str)) @@ -406,16 +420,6 @@ function parse_julia_literal(source, head::SyntaxHead, srcrange) elseif is_keyword(k) # This should only happen for tokens nested inside errors Symbol(val_str) - elseif k in KSet"String CmdString" - io = IOBuffer() - had_error = false - if has_flags(head, RAW_STRING_FLAG) - unescape_raw_string(io, val_str, k == K"CmdString") - else - had_error = unescape_julia_string(io, source.code, first(srcrange), - last(srcrange)+1, Diagnostic[]) - end - had_error ? ErrorVal() : String(take!(io)) elseif is_operator(k) isempty(srcrange) ? 
Symbol(untokenize(k)) : # synthetic invisible tokens diff --git a/src/parse_stream.jl b/src/parse_stream.jl index f37a1f59..ac7f9d7f 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -905,7 +905,7 @@ end # ParseStream Post-processing function validate_tokens(stream::ParseStream) - text = sourcetext(stream) + txtbuf = textbuf(stream) toks = stream.tokens charbuf = IOBuffer() for i = 2:length(toks) @@ -922,13 +922,13 @@ function validate_tokens(stream::ParseStream) elseif k == K"Float" || k == K"Float32" underflow0 = false if k == K"Float" - x, code = parse_float_literal(Float64, text, fbyte, nbyte) + x, code = parse_float_literal(Float64, txtbuf, fbyte, nbyte) # jl_strtod_c can return "underflow" even for valid cases such # as `5e-324` where the source is an exact representation of # `x`. So only warn when underflowing to zero. underflow0 = code === :underflow && x == 0 else - x, code = parse_float_literal(Float32, text, fbyte, nbyte) + x, code = parse_float_literal(Float32, txtbuf, fbyte, nbyte) underflow0 = code === :underflow && x == 0 end if code === :ok @@ -944,7 +944,7 @@ function validate_tokens(stream::ParseStream) elseif k == K"Char" @assert fbyte < nbyte # Already handled in the parser truncate(charbuf, 0) - had_error = unescape_julia_string(charbuf, text, fbyte, + had_error = unescape_julia_string(charbuf, txtbuf, fbyte, nbyte, stream.diagnostics) if had_error error_kind = K"ErrorInvalidEscapeSequence" @@ -958,19 +958,18 @@ function validate_tokens(stream::ParseStream) end end elseif k == K"String" && !has_flags(t, RAW_STRING_FLAG) - had_error = unescape_julia_string(devnull, text, fbyte, + had_error = unescape_julia_string(devnull, txtbuf, fbyte, nbyte, stream.diagnostics) if had_error error_kind = K"ErrorInvalidEscapeSequence" end elseif is_error(k) && k != K"error" # Emit messages for non-generic token errors - # - textrange = fbyte:prevind(text, nbyte) + tokstr = String(txtbuf[tokrange]) msg = if k in KSet"ErrorInvisibleChar 
ErrorUnknownCharacter" - "$(_token_error_descriptions[k]) $(repr(text[fbyte]))" + "$(_token_error_descriptions[k]) $(repr(tokstr[1]))" elseif k in KSet"ErrorInvalidUTF8 ErrorBidiFormatting" - "$(_token_error_descriptions[k]) $(repr(text[textrange]))" + "$(_token_error_descriptions[k]) $(repr(tokstr))" else _token_error_descriptions[k] end @@ -990,8 +989,7 @@ end # API for extracting results from ParseStream """ - build_tree(make_node::Function, ::Type{StackEntry}, stream::ParseStream; - wrap_toplevel_as_kind=nothing, kws...) + build_tree(make_node::Function, ::Type{StackEntry}, stream::ParseStream; kws...) Construct a tree from a ParseStream using depth-first traversal. `make_node` must have the signature @@ -1002,14 +1000,13 @@ where `children` is either `nothing` for leaf nodes or an iterable of the children of type `StackEntry` for internal nodes. `StackEntry` may be a node type, but also may include other information required during building the tree. -A single node which covers the input is expected, but if the ParseStream has -multiple nodes at the top level, `wrap_toplevel_as_kind` may be used to wrap -them in a single node. +If the ParseStream has multiple nodes at the top level, `K"wrapper"` is used to +wrap them in a single node. The tree here is constructed depth-first in postorder. """ function build_tree(make_node::Function, ::Type{NodeType}, stream::ParseStream; - wrap_toplevel_as_kind=nothing, kws...) where NodeType + kws...) 
where NodeType stack = Vector{NamedTuple{(:first_token,:node),Tuple{Int,NodeType}}}() tokens = stream.tokens @@ -1068,15 +1065,11 @@ function build_tree(make_node::Function, ::Type{NodeType}, stream::ParseStream; end if length(stack) == 1 return only(stack).node - elseif !isnothing(wrap_toplevel_as_kind) - # Mostly for debugging + else srcrange = (stream.tokens[1].next_byte: stream.tokens[end].next_byte - 1) children = (x.node for x in stack) - return make_node(SyntaxHead(wrap_toplevel_as_kind, EMPTY_FLAGS), - srcrange, children) - else - error("Found multiple nodes at top level") + return make_node(SyntaxHead(K"wrapper", EMPTY_FLAGS), srcrange, children) end end @@ -1092,21 +1085,22 @@ state for further parsing. """ function sourcetext(stream::ParseStream; steal_textbuf=false) root = stream.text_root - # The following works for SubString but makes the return type of this - # method type unstable. + # The following kinda works but makes the return type of this method type + # unstable. (Also codeunit(root) == UInt8 doesn't imply UTF-8 encoding?) # if root isa AbstractString && codeunit(root) == UInt8 # return root - if root isa String - return root + str = if root isa String || root isa SubString + root elseif steal_textbuf - return String(stream.textbuf) + String(stream.textbuf) else # Safe default for other cases is to copy the buffer. 
Technically this # could possibly be avoided in some situations, but might have side # effects such as mutating stream.text_root or stealing the storage of # stream.textbuf - return String(copy(stream.textbuf)) + String(copy(stream.textbuf)) end + SubString(str, first_byte(stream), thisind(str, last_byte(stream))) end """ diff --git a/src/parser_api.jl b/src/parser_api.jl index ffc6538f..cc47dacc 100644 --- a/src/parser_api.jl +++ b/src/parser_api.jl @@ -74,7 +74,6 @@ function _parse(rule::Symbol, need_eof::Bool, ::Type{T}, text, index=1; version= stream = ParseStream(text, index; version=version) if ignore_trivia && rule != :all bump_trivia(stream, skip_newlines=true) - empty!(stream) end parse!(stream; rule=rule) if need_eof @@ -87,12 +86,7 @@ function _parse(rule::Symbol, need_eof::Bool, ::Type{T}, text, index=1; version= (!ignore_warnings && !isempty(stream.diagnostics)) throw(ParseError(stream, filename=filename, first_line=first_line)) end - # TODO: Figure out a more satisfying solution to the wrap_toplevel_as_kind - # mess that we've got here. - # * It's kind of required for GreenNode, as GreenNode only records spans, - # not absolute positions. - # * Dropping it would be ok for SyntaxNode and Expr... - tree = build_tree(T, stream; wrap_toplevel_as_kind=K"toplevel", filename=filename, first_line=first_line, kws...) + tree = build_tree(T, stream; filename=filename, first_line=first_line, kws...) 
tree, last_byte(stream) + 1 end diff --git a/src/precompile.jl b/src/precompile.jl index 6f8d010c..922be354 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -2,6 +2,7 @@ let filename = joinpath(@__DIR__, "literal_parsing.jl") text = read(filename, String) parseall(Expr, text) + parseall(SyntaxNode, text) if _has_v1_6_hooks enable_in_core!() Meta.parse("1 + 2") diff --git a/src/source_files.jl b/src/source_files.jl index e15a419c..283f6d1f 100644 --- a/src/source_files.jl +++ b/src/source_files.jl @@ -1,24 +1,30 @@ """ - SourceFile(code [; filename=nothing, first_line=1]) + SourceFile(code [; filename=nothing, first_line=1, first_index=1]) -A UTF-8 source code string with associated file name and line number. +UTF-8 source text with associated file name and line number, storing the +character indices of the start of each line. `first_line` and `first_index` +can be used to specify the line number and index of the first character of +`code` within a larger piece of source text. -`SourceFile` stores the character positions of line starts to facilitate indexing. +`SourceFile` may be indexed via `getindex` or `view` to get a string. Line +information for a byte offset can be looked up via the `source_line`, +`source_location` and `source_line_range` functions. """ struct SourceFile - # We use `code::String` for now but it could be some other UTF-8 based - # string data structure with byte-based indexing. - # - # For example a rope data structure may be good for incremental editing - # https://en.wikipedia.org/wiki/Rope_(data_structure) - code::String + # TODO: Rename SourceFile -> SourceText / SourceChunk / SourceIndex / SourceLineIndex ? + # See https://github.com/JuliaLang/JuliaSyntax.jl/issues/190 + code::SubString + # Offset of `code` within a larger chunk of source text + byte_offset::Int filename::Union{Nothing,String} + # first_column::Int ?? 
first_line::Int # String index of start of every line line_starts::Vector{Int} end -function SourceFile(code::AbstractString; filename=nothing, first_line=1) +function SourceFile(code::AbstractString; filename=nothing, first_line=1, + first_index=1) line_starts = Int[1] for i in eachindex(code) # The line is considered to start after the `\n` @@ -27,7 +33,7 @@ function SourceFile(code::AbstractString; filename=nothing, first_line=1) if isempty(code) || last(code) != '\n' push!(line_starts, ncodeunits(code)+1) end - SourceFile(code, filename, first_line, line_starts) + SourceFile(code, first_index-1, filename, first_line, line_starts) end function SourceFile(; filename, kwargs...) @@ -36,7 +42,7 @@ end # Get line number of the given byte within the code function _source_line_index(source::SourceFile, byte_index) - lineidx = searchsortedlast(source.line_starts, byte_index) + lineidx = searchsortedlast(source.line_starts, byte_index - source.byte_offset) return (lineidx < lastindex(source.line_starts)) ? lineidx : lineidx-1 end _source_line(source::SourceFile, lineidx) = lineidx + source.first_line - 1 @@ -44,7 +50,8 @@ _source_line(source::SourceFile, lineidx) = lineidx + source.first_line - 1 """ Get the line number at the given byte index. """ -source_line(source::SourceFile, byte_index) = _source_line(source, _source_line_index(source, byte_index)) +source_line(source::SourceFile, byte_index) = + _source_line(source, _source_line_index(source, byte_index)) """ Get line number and character within the line at the given byte index. 
@@ -53,7 +60,7 @@ function source_location(source::SourceFile, byte_index) lineidx = _source_line_index(source, byte_index) i = source.line_starts[lineidx] column = 1 - while i < byte_index + while i < byte_index - source.byte_offset i = nextind(source.code, i) column += 1 end @@ -92,32 +99,32 @@ function Base.show(io::IO, ::MIME"text/plain", source::SourceFile) end function Base.getindex(source::SourceFile, rng::AbstractUnitRange) - i = first(rng) + i = first(rng) - source.byte_offset # Convert byte range into unicode String character range. # Assumes valid unicode! (SubString doesn't give us a reliable way to opt # out of the valid unicode check. The SubString{String} inner constructor # has some @boundscheck, but using @inbounds depends on inlining choices.) - j = prevind(source.code, last(rng)+1) + j = prevind(source.code, last(rng) + 1 - source.byte_offset) source.code[i:j] end # TODO: Change view() here to `sourcetext` ? function Base.view(source::SourceFile, rng::AbstractUnitRange) - i = first(rng) - j = prevind(source.code, last(rng)+1) + i = first(rng) - source.byte_offset + j = prevind(source.code, last(rng) + 1 - source.byte_offset) SubString(source.code, i, j) end function Base.getindex(source::SourceFile, i::Int) - source.code[i] + source.code[i - source.byte_offset] end function Base.thisind(source::SourceFile, i::Int) - thisind(source.code, i) + thisind(source.code, i - source.byte_offset) end -Base.firstindex(source::SourceFile) = firstindex(source.code) -Base.lastindex(source::SourceFile) = lastindex(source.code) +Base.firstindex(source::SourceFile) = firstindex(source.code) + source.byte_offset +Base.lastindex(source::SourceFile) = lastindex(source.code) + source.byte_offset """ sourcetext(source::SourceFile) diff --git a/src/syntax_tree.jl b/src/syntax_tree.jl index 62fe8eb3..9614a279 100644 --- a/src/syntax_tree.jl +++ b/src/syntax_tree.jl @@ -61,15 +61,18 @@ Base.show(io::IO, ::ErrorVal) = printstyled(io, "✘", color=:light_red) function 
SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}; keep_parens=false, position::Integer=1) - _to_SyntaxNode(source, raw, convert(Int, position), keep_parens) + offset, txtbuf = _unsafe_wrap_substring(sourcetext(source)) + _to_SyntaxNode(source, txtbuf, offset, raw, convert(Int, position), keep_parens) end -function _to_SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, +function _to_SyntaxNode(source::SourceFile, txtbuf::Vector{UInt8}, offset::Int, + raw::GreenNode{SyntaxHead}, position::Int, keep_parens::Bool) if !haschildren(raw) && !(is_syntax_kind(raw) || is_keyword(raw)) # Here we parse the values eagerly rather than representing them as # strings. Maybe this is good. Maybe not. - val = parse_julia_literal(source, head(raw), position:position + span(raw) - 1) + valrange = position:position + span(raw) - 1 + val = parse_julia_literal(txtbuf, head(raw), valrange .+ offset) return SyntaxNode(nothing, nothing, SyntaxData(source, raw, position, val)) else cs = SyntaxNode[] @@ -77,13 +80,16 @@ function _to_SyntaxNode(source::SourceFile, raw::GreenNode{SyntaxHead}, for (i,rawchild) in enumerate(children(raw)) # FIXME: Allowing trivia is_error nodes here corrupts the tree layout. 
if !is_trivia(rawchild) || is_error(rawchild) - push!(cs, _to_SyntaxNode(source, rawchild, pos, keep_parens)) + push!(cs, _to_SyntaxNode(source, txtbuf, offset, rawchild, pos, keep_parens)) end pos += Int(rawchild.span) end if !keep_parens && kind(raw) == K"parens" && length(cs) == 1 return cs[1] end + if kind(raw) == K"wrapper" && length(cs) == 1 + return cs[1] + end node = SyntaxNode(nothing, cs, SyntaxData(source, raw, position, nothing)) for c in cs c.parent = node @@ -130,7 +136,8 @@ function interpolate_literal(node::SyntaxNode, val) SyntaxNode(node.source, node.raw, node.position, node.parent, true, val) end -function _show_syntax_node(io, current_filename, node::AbstractSyntaxNode, indent, show_byte_offsets) +function _show_syntax_node(io, current_filename, node::AbstractSyntaxNode, + indent, show_byte_offsets) fname = node.source.filename line, col = source_location(node.source, node.position) posstr = "$(lpad(line, 4)):$(rpad(col,3))│" @@ -216,7 +223,7 @@ function build_tree(::Type{SyntaxNode}, stream::ParseStream; filename=nothing, first_line=1, keep_parens=false, kws...) green_tree = build_tree(GreenNode, stream; kws...) 
source = SourceFile(sourcetext(stream), filename=filename, first_line=first_line) - SyntaxNode(source, green_tree, position=first_byte(stream), keep_parens=keep_parens) + SyntaxNode(source, green_tree, position=1, keep_parens=keep_parens) end #------------------------------------------------------------------------------- diff --git a/src/utils.jl b/src/utils.jl index f30f06f9..cee9689e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -3,7 +3,7 @@ if VERSION < v"1.1" isnothing(x) = x === nothing end if VERSION < v"1.4" - function only(x::AbstractVector) + function only(x::Union{AbstractVector,AbstractString}) if length(x) != 1 error("Collection must contain exactly 1 element") end @@ -14,6 +14,8 @@ if VERSION < v"1.5" import Base.peek end +_unsafe_wrap_substring(s) = (s.offset, unsafe_wrap(Vector{UInt8}, s.string)) + #-------------------------------------------------- # # Internal error, used as assertion failure for cases we expect can't happen. @@ -48,6 +50,12 @@ function remove_linenums!(ex::Expr) return ex end +# String macro to get the UInt8 code of an ascii character +macro u8_str(str) + c = str == "\\" ? 
'\\' : only(unescape_string(str)) + isascii(c) || error("Non-ascii character in u8_str") + codepoint(c) % UInt8 +end #------------------------------------------------------------------------------- # Text printing/display utils diff --git a/test/literal_parsing.jl b/test/literal_parsing.jl index 7c4d1012..42fcbc44 100644 --- a/test/literal_parsing.jl +++ b/test/literal_parsing.jl @@ -153,7 +153,7 @@ end function unesc(str, firstind=firstindex(str), endind=lastindex(str)+1; diagnostics=false) io = IOBuffer() ds = JuliaSyntax.Diagnostic[] - unescape_julia_string(io, str, firstind, endind, ds) + unescape_julia_string(io, Vector{UInt8}(str), firstind, endind, ds) if diagnostics ds else @@ -204,7 +204,8 @@ end function unesc_raw(str, is_cmd) io = IOBuffer() - JuliaSyntax.unescape_raw_string(io, str, is_cmd) + JuliaSyntax.unescape_raw_string(io, Vector{UInt8}(str), + firstindex(str), lastindex(str)+1, is_cmd) return String(take!(io)) end diff --git a/test/parser.jl b/test/parser.jl index ccad1d9a..161323fc 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -1,22 +1,14 @@ """ Parse string to SyntaxNode tree and show as an sexpression """ -function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6", expr=false) +function parse_to_sexpr_str(production, code::AbstractString; v=v"1.6") stream = ParseStream(code, version=v) production(ParseState(stream)) JuliaSyntax.validate_tokens(stream) - t = build_tree(GreenNode, stream, wrap_toplevel_as_kind=K"None") + t = build_tree(GreenNode, stream) source = SourceFile(code) s = SyntaxNode(source, t, keep_parens=true) - if expr - JuliaSyntax.remove_linenums!(Expr(s)) - else - if kind(s) == K"None" - join([sprint(show, MIME("text/x.sexpression"), c) for c in children(s)], ' ') - else - sprint(show, MIME("text/x.sexpression"), s) - end - end + return sprint(show, MIME("text/x.sexpression"), s) end function test_parse(production, input, output) @@ -55,7 +47,7 @@ tests = [ "a;;;b;;" => "(toplevel-; a b)" """ "x" a ; "y" b """ 
=> """(toplevel-; (doc (string "x") a) (doc (string "y") b))""" - "x y" => "x (error-t y)" + "x y" => "(wrapper x (error-t y))" ], JuliaSyntax.parse_eq => [ # parse_assignment @@ -411,7 +403,7 @@ tests = [ "A.@x a" => "(macrocall (. A (quote @x)) a)" "@A.B.@x a" => "(macrocall (. (. A (quote B)) (quote (error-t) @x)) a)" # .' discontinued - "f.'" => "f (error-t ')" + "f.'" => "(wrapper f (error-t '))" # Field/property syntax "f.x.y" => "(. (. f (quote x)) (quote y))" "x .y" => "(. x (error-t) (quote y))" @@ -812,6 +804,7 @@ tests = [ "`cmd`" => "(macrocall core_@cmd (cmdstring-r \"cmd\"))" "```cmd```" => "(macrocall core_@cmd (cmdstring-s-r \"cmd\"))" # literals + "true" => "true" "42" => "42" "1.0e-1000" => "0.0" "0x123456789abcdefp+0" => "8.19855292164869e16" @@ -975,7 +968,7 @@ parsestmt_test_specs = [ # The following are currently broken but at least the parser shouldn't # crash. - "x in' '" => "(call-i x in (char (error))) (error-t ')" + "x in' '" => "(wrapper (call-i x in (char (error))) (error-t '))" ] @testset "Parser does not crash on broken code" begin diff --git a/test/parser_api.jl b/test/parser_api.jl index 1a78000e..4ceb8b58 100644 --- a/test/parser_api.jl +++ b/test/parser_api.jl @@ -85,6 +85,16 @@ @test parsestmt(Expr, SubString("α+x\ny"), 1) == (:(α+x), 5) @test parseatom(Expr, SubString("x+y"), 1) == (:x, 2) @test parseatom(Expr, SubString("x+y"), 3) == (:y, 4) + + @test parseatom(Expr, SubString("x+1.0"), 3) == (1.0, 6) + @test parseatom(Expr, SubString("x+\"\n\""), 3) == ("\n", 6) + + # Line numbers are relative to the start of the string we're currently + # parsing + @test JuliaSyntax.parsestmt(Expr, "begin\na\nend\nbegin\nb\nend", 1) == + (Expr(:block, LineNumberNode(2), :a), 12) + @test JuliaSyntax.parsestmt(Expr, "begin\na\nend\nbegin\nb\nend", 12) == + (Expr(:block, LineNumberNode(3), :b), 24) end @testset "error/warning handling" begin diff --git a/test/source_files.jl b/test/source_files.jl index 5c113a73..c6329cfa 100644 --- 
a/test/source_files.jl +++ b/test/source_files.jl @@ -26,19 +26,37 @@ @test source_location(SourceFile(; filename=path), 1) == (1,1) @test source_location(SourceFile(; filename=path, first_line=7), 1) == (7,1) end + + # byte offset + @test source_location(SourceFile("a\nbb\nccc\ndddd", first_index=10), 13) == (2,2) + @test source_line(SourceFile("a\nbb\nccc\ndddd", first_index=10), 15) == 3 + + # source_line convenience function + @test source_line(SourceFile("a\nb\n"), 2) == 1 + @test source_line(SourceFile("a\nb\n"), 3) == 2 end @testset "SourceFile position indexing" begin @test SourceFile("a\nb\n")[1:2] == "a\n" @test SourceFile("a\nb\n")[3:end] == "b\n" - if Base.VERSION >= v"1.4" - # Protect the `[begin` from being viewed by the parser on older Julia versions - @test eval(Meta.parse("""SourceFile("a\nb\n")[begin:end]""")) == "a\nb\n" - end # unicode @test SourceFile("αβ")[1:2] == "α" @test SourceFile("αβ")[3] == 'β' + + # offsets + sf = SourceFile("abcd", first_index=10) + @test firstindex(sf) == 10 + @test lastindex(sf) == 13 + @test sf[10] == 'a' + @test sf[10:11] == "ab" + @test view(sf, 10:11) == "ab" + + if Base.VERSION >= v"1.4" + # Protect the `[begin` from being viewed by the parser on older Julia versions + @test eval(Meta.parse("SourceFile(\"a\nb\n\")[begin:end]")) == "a\nb\n" + @test eval(Meta.parse("SourceFile(\"abcd\", first_index=10)[begin+1:end-1]")) == "bc" + end end @testset "SourceFile printing and text extraction" begin diff --git a/test/syntax_tree.jl b/test/syntax_tree.jl index c0cbf547..32efac76 100644 --- a/test/syntax_tree.jl +++ b/test/syntax_tree.jl @@ -73,4 +73,11 @@ end 1:5 │ 5:5 │ b 2:3 │ 10:10 │ c """ + + t,_ = parsestmt(SyntaxNode, "begin a end\nbegin b end", 13) + @test sprint(show, MIME("text/plain"), t) == """ + line:col│ tree │ file_name + 1:1 │[block] + 1:7 │ b + """ end diff --git a/test/test_utils.jl b/test/test_utils.jl index e3228eaa..540086ff 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -8,6 +8,7 @@ 
using .JuliaSyntax: Diagnostic, SourceFile, source_location, + source_line, parse!, parsestmt, parseall,