diff --git a/src/ast.c b/src/ast.c
index 5dfd2107d6e3e..b80ac60e17bb7 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -112,6 +112,7 @@ JL_DLLEXPORT jl_sym_t *jl_acquire_sym;
 JL_DLLEXPORT jl_sym_t *jl_release_sym;
 JL_DLLEXPORT jl_sym_t *jl_acquire_release_sym;
 JL_DLLEXPORT jl_sym_t *jl_sequentially_consistent_sym;
+JL_DLLEXPORT jl_sym_t *jl_julia_char_sym;
 
 static const uint8_t flisp_system_image[] = {
@@ -366,6 +367,7 @@ void jl_init_common_symbols(void)
     jl_release_sym = jl_symbol("release");
     jl_acquire_release_sym = jl_symbol("acquire_release");
     jl_sequentially_consistent_sym = jl_symbol("sequentially_consistent");
+    jl_julia_char_sym = jl_symbol("julia_char");
 }
 
 JL_DLLEXPORT void jl_lisp_prompt(void)
@@ -575,6 +577,13 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m
                 ex = scm_to_julia_(fl_ctx, car_(e), mod);
                 temp = jl_new_struct(jl_quotenode_type, ex);
             }
+            else if (sym == jl_julia_char_sym) { // char literal, built by string.only-julia-char as (julia_char <uint32>)
+                value_t v = car_(e);
+                if (!(iscprim(v) && cp_class((cprim_t*)ptr(v)) == fl_ctx->uint32type)) // payload must be a flisp uint32
+                    jl_error("malformed julia char");
+                uint32_t c = *(uint32_t*)cp_data((cprim_t*)ptr(v));
+                temp = jl_box_char(c); // the 32-bit pattern is handed to jl_box_char unchanged
+            }
             if (temp) {
                 JL_GC_POP();
                 return temp;
diff --git a/src/flisp/flisp.c b/src/flisp/flisp.c
index 6f311fc7c8c1d..69b83964d82e0 100644
--- a/src/flisp/flisp.c
+++ b/src/flisp/flisp.c
@@ -2402,6 +2402,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
 #endif
 
     fl_ctx->jl_sym = symbol(fl_ctx, "julia_value");
+    fl_ctx->jl_char_sym = symbol(fl_ctx, "julia_char");
 
     fl_ctx->the_empty_vector = tagptr(alloc_words(fl_ctx, 1), TAG_VECTOR);
     vector_setsize(fl_ctx->the_empty_vector, 0);
diff --git a/src/flisp/flisp.h b/src/flisp/flisp.h
index 209a4f2d4fcdb..e77904a32d1f2 100644
--- a/src/flisp/flisp.h
+++ b/src/flisp/flisp.h
@@ -502,6 +502,7 @@ struct _fl_context_t {
     value_t apply_func, apply_v, apply_e;
 
     value_t jl_sym;
+    value_t jl_char_sym; // the symbol `julia_char`; head of the list returned by string.only-julia-char
     // persistent buffer (avoid repeated malloc/free)
     // for julia_extensions.c: normalize
     size_t jlbuflen;
diff --git a/src/flisp/julia_extensions.c b/src/flisp/julia_extensions.c
index 9fcd3e9789af4..4d043e7850420 100644
--- a/src/flisp/julia_extensions.c
+++ b/src/flisp/julia_extensions.c
@@ -361,6 +361,30 @@ value_t fl_string2normsymbol(fl_context_t *fl_ctx, value_t *args, uint32_t nargs
     return symbol(fl_ctx, normalize(fl_ctx, (char*)cvalue_data(args[0])));
 }
 
+// If the string argument is 1 to 4 bytes long, return the list `(julia_char u)`, where the uint32 `u`
+// holds those bytes packed from the most significant byte down (unused low bytes stay zero). Otherwise
+// return false. The bytes are NOT validated as UTF-8: any string of at most 4 bytes, e.g. "abcd", is accepted.
+value_t fl_string_only_julia_char(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) {
+    argcount(fl_ctx, "string.only-julia-char", nargs, 1);
+    if (!fl_isstring(fl_ctx, args[0]))
+        type_error(fl_ctx, "string.only-julia-char", "string", args[0]);
+    uint8_t *s = (uint8_t*)cvalue_data(args[0]);
+    size_t len = cv_len((cvalue_t*)ptr(args[0]));
+    if (!(0 < len && len <= 4)) // empty strings and strings longer than 4 bytes are rejected
+        return fl_ctx->F;
+
+    uint32_t u = (uint32_t)s[0] << 24; // first byte lands in bits 31..24
+    if (len == 1) goto ret;
+    u |= (uint32_t)s[1] << 16;
+    if (len == 2) goto ret;
+    u |= (uint32_t)s[2] << 8;
+    if (len == 3) goto ret;
+    u |= (uint32_t)s[3];
+
+ret:
+    return fl_list2(fl_ctx, fl_ctx->jl_char_sym, mk_uint32(fl_ctx, u));
+}
+
 static const builtinspec_t julia_flisp_func_info[] = {
     { "skip-ws", fl_skipws },
     { "accum-julia-symbol", fl_accum_julia_symbol },
@@ -371,6 +395,7 @@ static const builtinspec_t julia_flisp_func_info[] = {
     { "strip-op-suffix", fl_julia_strip_op_suffix },
     { "underscore-symbol?", fl_julia_underscore_symbolp },
     { "string->normsymbol", fl_string2normsymbol },
+    { "string.only-julia-char", fl_string_only_julia_char },
     { NULL, NULL }
 };
 
diff --git a/src/julia-parser.scm b/src/julia-parser.scm
index 97a11df701a37..5ac849950365e 100644
--- a/src/julia-parser.scm
+++ b/src/julia-parser.scm
@@ -2465,29 +2465,44 @@
     (cond ;; char literal
           ((eq? t '|'|)
            (take-token s)
-           (let ((firstch (read-char (ts:port s))))
-             (if (and (not (eqv? firstch #\\))
-                      (not (eof-object? firstch))
-                      (eqv? (peek-char (ts:port s)) #\'))
-                 ;; easy case: 1 character, no \
-                 (begin (read-char (ts:port s)) firstch)
-                 (let ((b (open-output-string)))
-                   (let loop ((c firstch))
-                     (if (not (eqv? c #\'))
-                         (begin (if (eqv? c #\") ;; issue 14683
-                                    (error "invalid character literal"))
-                                (write-char (not-eof-1 c) b)
-                                (if (eqv? c #\\)
-                                    (write-char (not-eof-1 (read-char (ts:port s)))
-                                                b))
-                                (loop (read-char (ts:port s))))))
-                   (let ((str (unescape-string (io.tostring! b))))
-                     (let ((len (string-length str)))
-                       (if (= len 1)
-                           (string.char str 0)
-                           (if (= len 0)
-                               (error "invalid empty character literal")
-                               (error "character literal contains multiple characters")))))))))
+           (let ((firstch (read-char (ts:port s)))
+                 (b (open-output-string)))
+             ;; need to account for escape codes: allowed-digits counts the digits the current code may still take.
+             ;; Several `\x12` or `\12` codes (tracked by only-raw?) may share one char literal to represent malformed chars
+             (let loop ((c firstch) (allowed-digits 0) (oct? #f) (first? #t) (only-raw? #t))
+               (cond
+                ((eof-object? c)
+                 (error "incomplete: invalid character syntax"))
+                ((= c #\')
+                 (and first? (eqv? (peek-char (ts:port s)) #\') ;; ''' denotes the quote character itself
+                      (write-char (read-char (ts:port s)) b)))
+                ((= c #\\)
+                 (let* ((c (not-eof-1 (read-char (ts:port s))))
+                        (only-raw? (and only-raw? (or (= c #\x) (char-oct? c))))
+                        (allowed-digits (case c (#\x 2)
+                                                (#\u 4)
+                                                (#\U 8)
+                                                (else (if (char-oct? c) 2 0)))))
+                   (or first? only-raw? ;; a later escape is only accepted while every escape so far is \x or octal
+                       (error "character literal contains multiple characters"))
+                   (write-char #\\ b)
+                   (write-char c b)
+                   (loop (read-char (ts:port s)) allowed-digits (char-oct? c) #f only-raw?)))
+                ((and (> allowed-digits 0) ((if oct? char-oct? char-hex?) c))
+                 (write-char c b)
+                 (loop (read-char (ts:port s)) (- allowed-digits 1) oct? #f only-raw?))
+                ;; only allow one char if it's not an escape code
+                (first?
+                 (if (= c #\") (write-char #\\ b)) ;; need to escape double quote
+                 (write-char c b)
+                 (loop (read-char (ts:port s)) 0 #f #f #f))
+                (else (error "character literal contains multiple characters"))))
+             (let* ((str (unescape-string (io.tostring! b)))
+                    (c (string.only-julia-char str)))
+               (or c
+                   (if (= (string-length str) 0)
+                       (error "invalid empty character literal")
+                       (error "character literal contains multiple characters"))))))
 
          ;; symbol/expression quote
          ((eq? t ':)
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 1faf8e2c25245..8e4958732e29c 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1476,6 +1476,7 @@ extern JL_DLLEXPORT jl_sym_t *jl_acquire_sym;
 extern JL_DLLEXPORT jl_sym_t *jl_release_sym;
 extern JL_DLLEXPORT jl_sym_t *jl_acquire_release_sym;
 extern JL_DLLEXPORT jl_sym_t *jl_sequentially_consistent_sym;
+extern JL_DLLEXPORT jl_sym_t *jl_julia_char_sym;
 
 JL_DLLEXPORT enum jl_memory_order jl_get_atomic_order(jl_sym_t *order, char loading, char storing);
 JL_DLLEXPORT enum jl_memory_order jl_get_atomic_order_checked(jl_sym_t *order, char loading, char storing);
diff --git a/test/syntax.jl b/test/syntax.jl
index 8793a3de83bf8..dd0742f243059 100644
--- a/test/syntax.jl
+++ b/test/syntax.jl
@@ -276,9 +276,6 @@ end
 @test Meta.parse("'\"'") == Meta.parse("'\\\"'") == '"' == "\""[1] == '\42'
 
 # issue #24558
-@test_throws ParseError Meta.parse("'\\xff'")
-@test_throws ParseError Meta.parse("'\\x80'")
-@test_throws ParseError Meta.parse("'ab'")
 @test '\u2200' == "\u2200"[1]
 
 @test_throws ParseError Meta.parse("f(2x for x=1:10, y")
@@ -317,19 +314,16 @@ let p = 15
     @test 2p+1 == 31 # not a hex float literal
 end
 
-function test_parseerror(str, msg)
-    try
-        Meta.parse(str)
-        @test false
-    catch e
-        @test isa(e,ParseError) && e.msg == msg
-    end
+macro test_parseerror(str, msg) # expands to `@test_throws ParseError(msg) Meta.parse(str)`
+    ex = :(@test_throws ParseError($(esc(msg))) Meta.parse($(esc(str))))
+    ex.args[2] = __source__ # point the inner @test_throws at the caller's source location
+    return ex
 end
-test_parseerror("0x", "invalid numeric constant \"0x\"")
-test_parseerror("0b", "invalid numeric constant \"0b\"")
-test_parseerror("0o", "invalid numeric constant \"0o\"")
-test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"")
-test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
+@test_parseerror("0x", "invalid numeric constant \"0x\"")
+@test_parseerror("0b", "invalid numeric constant \"0b\"")
+@test_parseerror("0o", "invalid numeric constant \"0o\"")
+@test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"")
+@test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
 
 # issue #15798
 @test Meta.lower(Main, Base.parse_input_line("""
@@ -345,8 +339,8 @@ test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
 """)::Expr) == 23341
 
 # issue #15763
-test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1")
-test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2")
+@test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1")
+@test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2")
 
 # issue #15828
 @test Meta.lower(Main, Meta.parse("x...")) == Expr(:error, "\"...\" expression outside call")
@@ -2054,8 +2048,8 @@ end == 1
 # issue #29982
 @test Meta.parse("'a'") == 'a'
 @test Meta.parse("'\U0061'") == 'a'
-test_parseerror("''", "invalid empty character literal")
-test_parseerror("'abc'", "character literal contains multiple characters")
+@test_parseerror("''", "invalid empty character literal")
+@test_parseerror("'abc'", "character literal contains multiple characters")
 
 # optional soft scope: #28789, #33864
 
@@ -3280,3 +3274,25 @@ end
 # issue 44723
 demo44723()::Any = Base.Experimental.@opaque () -> true ? 1 : 2
 @test demo44723()() == 1
+
+@testset "issue 25072" begin
+    @test '\xc0\x80' == reinterpret(Char, 0xc0800000)
+    @test '\x80' == reinterpret(Char, 0x80000000)
+    @test '\xff' == reinterpret(Char, 0xff000000)
+    @test '\xff\xff\xff\xff' == reinterpret(Char, 0xffffffff)
+    @test '\uffff' == Char(0xffff)
+    @test '\U00002014' == Char(0x2014)
+    @test '\100' == reinterpret(Char, UInt32(0o100) << 24)
+    @test '\100\42' == reinterpret(Char, (UInt32(0o100) << 24) | (UInt32(0o42) << 16))
+    @test_parseerror "''" "invalid empty character literal"
+    @test_parseerror "'\\xff\\xff\\xff\\xff\\xff'" "character literal contains multiple characters"
+    @test_parseerror "'abcd'" "character literal contains multiple characters"
+    @test_parseerror "'\\uff\\xff'" "character literal contains multiple characters"
+    @test_parseerror "'\\xff\\uff'" "character literal contains multiple characters"
+    @test_parseerror "'\\xffa'" "character literal contains multiple characters"
+    @test_parseerror "'\\uffffa'" "character literal contains multiple characters"
+    @test_parseerror "'\\U00002014a'" "character literal contains multiple characters"
+    @test_parseerror "'\\1000'" "character literal contains multiple characters"
+    @test Meta.isexpr(Meta.parse("'a"), :incomplete)
+    @test ''' == "'"[1]
+end