From b2f6c9bed542f174fcc32339a48704ee9a7a2db6 Mon Sep 17 00:00:00 2001 From: Jad Ghalayini Date: Tue, 23 Jul 2024 20:40:14 +0100 Subject: [PATCH] Added support for UTF-8 --- src/auxl.ml | 43 ++++++++++++ src/grammar_lexer.mll | 12 ++-- src/grammar_pp.ml | 47 ++++++++----- tests/test10_unicode.ott | 45 ++++++++++++ tests/test17.12_unicode.ott | 136 ++++++++++++++++++++++++++++++++++++ 5 files changed, 260 insertions(+), 23 deletions(-) create mode 100644 tests/test10_unicode.ott create mode 100644 tests/test17.12_unicode.ott diff --git a/src/auxl.ml b/src/auxl.ml index 88ee363..3e3905e 100644 --- a/src/auxl.ml +++ b/src/auxl.ml @@ -215,6 +215,49 @@ let string_of_char_list ts = List.iter (Buffer.add_char s) ts; Buffer.contents s +exception Not_UTF8 + +let utf8_cp_list_of_string s = + let n = String.length s in + let rec f i = if i=n then [] else + let c = int_of_char (String.get s i) in + if c land 0x80 = 0x80 then + if c land 0x40 = 0x40 then + if c land 0x20 = 0x20 then + if c land 0x10 = 0x10 then + (* First byte starts with 1111; codepoint is 4 bytes long *) + if i+3 >= n then + raise Not_UTF8 + else + let c = (c land 0x0F) lsl 18 + + (int_of_char (String.get s (i+1)) land 0x3F) lsl 12 + + (int_of_char (String.get s (i+2)) land 0x3F) lsl 6 + + (int_of_char (String.get s (i+3)) land 0x3F) in + c :: f (i+4) + else + (* First byte starts with 1110; codepoint is 3 bytes long *) + if i+2 >= n then + raise Not_UTF8 + else + let c = (c land 0x0F) lsl 12 + + (int_of_char (String.get s (i+1)) land 0x3F) lsl 6 + + (int_of_char (String.get s (i+2)) land 0x3F) in + c :: f (i+3) + else + (* First byte starts with 110; codepoint is 2 bytes long *) + if i+1 >= n then + raise Not_UTF8 + else + let c = (c land 0x1F) lsl 6 + (int_of_char (String.get s (i+1)) land 0x3F) in + c :: f (i+2) + else + (* First byte starts with 10: invalid UTF 8 since leading byte cannot start with 10 *) + raise Not_UTF8 + else + c :: f (i+1) + in + f 0 + (* tests to identify strings *) exception Not_alpha diff --git a/src/grammar_lexer.mll b/src/grammar_lexer.mll index 46b7a31..5d94130 100644 --- a/src/grammar_lexer.mll +++ b/src/grammar_lexer.mll @@ -199,22 +199,24 @@ let get_first_word_and_rest = let go_back n f lexbuf = lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - n; my_lexer_state := f - } +(* remember: these codepoints are in decimal, not octal *) let whitespace = (' ' | '\t' | '\010' | '\012' | '\013') let newline = ('\010' | '\013' | "\013\010") let non_newline = [^ '\010' '\013'] let non_newline_whitespace = (' ' | '\t' |'\012') -(* let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "-" | "'")* *) -let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")* -let maybe_quoted_ident = pre_ident | ("'" pre_ident "'" ) | "''" +(* Note: the bit at the end allows all Unicode which is not special ASCII characters *) +let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'" | ['\128'-'\255'])+ +let maybe_quoted_ident = pre_ident (* let pre_ident_allow_minus = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "-" | "'")* let maybe_quoted_ident_allow_minus = pre_ident_allow_minus | ("'" pre_ident_allow_minus "'" ) | "''" *) let string = [^ ' ' '\t' '\010' '\012' '\013']+ - +(* UNICODE NOTES: The above regexes will parse unicode whitespace as identifiers. This is the same +as what the "string" regex would do in some positions previously, and the "whitespace" regex is not +Unicode-aware, so we view this as not breaking anything new. Nonetheless, caveat emptor. *) (* We use context-dependent lexing, lexing (eg) the elements of a *) (* production w.r.t. a small language so that the user rarely has to *) diff --git a/src/grammar_pp.ml b/src/grammar_pp.ml index a852020..531600e 100644 --- a/src/grammar_pp.ml +++ b/src/grammar_pp.ml @@ -797,25 +797,36 @@ let pp_tex_COMP_NAME m = "\\"^pp_tex_NAME_PREFIX m^"comp" let pp_tex_COMP_U_NAME m = "\\"^pp_tex_NAME_PREFIX m^"compu" let pp_tex_COMP_LU_NAME m = "\\"^pp_tex_NAME_PREFIX m^"complu" +(* Note: as a hack, we represent unicode characters as base16 shifted forward by 16, i.e. 0, 1, 2, +3, 4, 5, 6, 7, 8, 9, 10, a, b, c, d, e, f become g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v*) +let utf8_tex_escape c = + if c <= 255 then + match char_of_int c with + | '_' ->"XX" + | '#' -> "YY" + | '\'' -> "PP" + | '0' -> "Zero" + | '1' -> "One" + | '2' -> "Two" + | '3' -> "Three" + | '4' -> "Four" + | '5' -> "Five" + | '6' -> "Six" + | '7' -> "Seven" + | '8' -> "Eight" + | '9' -> "Nine" + | c -> String.make 1 c + else + let rec shex s c = + let d = c mod 16 in + let c = c / 16 in + let s = (String.make 1 (char_of_int ((int_of_char 'g') + d))) ^ s in + if c = 0 then s else shex s c + in + shex "" c + let rec tex_command_escape s = - String.concat "" - (List.map - (fun c -> match c with - | '_' ->"XX" - | '#' -> "YY" - | '\'' -> "PP" - | '0' -> "Zero" - | '1' -> "One" - | '2' -> "Two" - | '3' -> "Three" - | '4' -> "Four" - | '5' -> "Five" - | '6' -> "Six" - | '7' -> "Seven" - | '8' -> "Eight" - | '9' -> "Nine" - | _ -> String.make 1 c) - (Auxl.char_list_of_string s)) + String.concat "" (List.map utf8_tex_escape (Auxl.utf8_cp_list_of_string s)) and tex_rule_name m ntr = "\\"^pp_tex_NAME_PREFIX m^tex_command_escape ntr and tex_drule_name m s = "\\"^pp_tex_NAME_PREFIX m^"drule"^tex_command_escape s diff --git a/tests/test10_unicode.ott b/tests/test10_unicode.ott new file mode 100644 index 0000000..707a87d --- /dev/null +++ b/tests/test10_unicode.ott @@ -0,0 +1,45 @@ +metavar var, x ::= {{ com term variable }} +{{ isa string}} {{ coq nat}} {{ hol string}} {{ lem string }} {{ coq-equality }} +{{ ocaml int}} {{ lex alphanum}} {{ tex \mathit{[[var]]} }} + +grammar +项 {{tex \tau}}, t :: 't_' ::= {{ com term }} + | x :: :: var {{ com variable}} + | λ x . t :: :: lam (+ bind x in t +) {{ com lambda }} + | t t' :: :: app {{ com app }} + | ( t ) :: S :: paren {{ icho [[t]] }} {{ lem [[t]] }} + | { t / x } t' :: M :: sub + {{ icho (tsubst_term [[t]] [[x]] [[t']])}} + {{ lem (tsubst_term [[t]] [[x]] [[t']]) }} +val,v :: 'v_' ::= {{ com value }} + | λ x . t :: :: lam {{ com lambda }} + +terminals :: 'terminals_' ::= + | λ :: :: lambda {{ tex \lambda }} + | --> :: :: red {{ tex \longrightarrow }} + +subrules + val <:: 项 + +substitutions + single 项 var :: tsubst + +defns +Jop :: '' ::= + + defn + t1 --> t2 :: ::reduce::'' {{ com $[[t1]]$ reduces to $[[t2]]$ }} {{ lemwcf witness type reduce_witness; check reduce_check; eval : input -> output }} by + + + -------------------------- :: ax_app + (λx.t1) v2 --> {v2/x}t1 + + t1 --> t1' + -------------- :: ctx_app_fun + t1 t --> t1' t + + t1 --> t1' + -------------- :: ctx_app_arg + v t1 --> v t1' + + diff --git a/tests/test17.12_unicode.ott b/tests/test17.12_unicode.ott new file mode 100644 index 0000000..5cea56e --- /dev/null +++ b/tests/test17.12_unicode.ott @@ -0,0 +1,136 @@ +% test17.12.ott sundry list form test + +embed +{{ coq +(** some concrete labels *) +Definition LA : nat := 1. +Definition LB : nat := 2. +Definition LC : nat := 3. +}} + +metavar typevar, X ::= + {{ isa string }} {{ coq nat }} {{ coq-equality }} {{ hol string }} {{ lex Alphanum }} + {{ tex \mathit{[[typevar]]} }} {{ com type variable }} + {{ isavar ''[[typevar]]'' }} {{ texvar \mathrm{[[typevar]]} }} + +metavar termvar, x ::= + {{ isa string }} {{ coq nat }} {{ hol string }} {{ coq-equality }} {{ lex alphanum }} + {{ tex \mathit{[[termvar]]} }} {{ com term variable }} + {{ isavar ''[[termvar]]'' }} {{ texvar \mathrm{[[termvar]]} }} + +metavar label, l, k ::= + {{ isa string }} {{ coq nat }} {{ hol string }} {{ lex alphanum }} {{ tex \mathit{[[label]]} }} {{ isavar ''[[label]]'' }} {{ holvar "[[label]]" }} + {{ com field label }} + +indexvar index, i, j, n, m ::= {{ isa nat }} {{ coq nat }} {{ hol num }} {{ lex numeral }} + {{ com indices }} + + +grammar + +T {{ hol Typ }}, S, U :: 'T_' ::= {{ com type }} + | X :: :: Var {{ com type variable }} + | { l1 : T1 , .. , ln : Tn } :: :: Rec {{ com record }} + | [ l0 : T0 , .. , ln-1 : Tn-1 ] :: :: ZRec {{ com record }} + | int :: :: int + | bool :: :: bool + +t :: 't_' ::= {{ com term }} + | x :: :: Var {{ com variable }} +% | { l1 = t1 , .. , ln = tn } :: :: Rec {{ com record --- dots }} +% | { } :: :: Rec_comp_none {{ com record --- comp }} +% | { } :: :: Rec_comp_some {{ com record --- comp with terminal }} +% | { } :: :: Rec_comp_u_none {{ com record --- compu }} +% | { } :: :: Rec_comp_u_some {{ com record --- compu with terminal }} +% | { } :: :: Rec_comp_lu_none{{ com record --- complu }} + | { } :: :: Rec_comp_lu_some {{ com record --- complu with terminal }} + | [ ] :: :: ZRec_comp_lu_some {{ com record --- complu with terminal }} + | t . l :: :: Proj {{ com projection }} + | true :: :: True {{ com True }} + | false :: :: False {{ com False }} + +Γ {{ tex \Gamma }}, Δ {{ tex \Delta }} :: 'G_' ::= {{ com type environment }} + | empty :: :: empty + | Γ , X <: T :: :: type + | Γ , x : T :: :: term +% | Γ , Γ' :: M :: comma {{ ich TODO }} +% | Γ1 , .. , Γn :: M :: dots {{ ich TODO }} + + +formula :: formula_ ::= + | judgement :: :: judgement + | formula1 .. formulan :: :: dots +%{{ isa (List.list_all (\ b . b) ( [[ formula1 .. formulan ]] ) ) }} + +%formula :: formula_ ::= +% | preformula1 .. preformulan :: :: realdots {{ isa (List.list_all (\ b . b) ( [[ preformula1 .. preformulan ]] ) ) }} + +terminals :: terminals_ ::= + | |- :: :: turnstile {{ tex \vdash }} + | <: :: :: subtype {{ tex <: }} + +defns + Jtype :: '' ::= + +defn +Γ |- t : T :: :: Ty :: Ty_ {{ com term $[[t]]$ has type $[[T]]$ }} by + + Γ|-t0:T0 .. Γ|- tn-1 : Tn-1 + ------------------------------------- :: Rcd_dotform + Γ|- {l0=t0,..,ln-1=tn-1}:{l0:T0,..,ln-1:Tn-1} + + Γ|- t:{l0:T0,..,ln-1:Tn-1} + ----------------------- :: Proj_dotform + Γ|- t.lj : Tj + + + + -------------------------------------- :: Rcd_comp + Γ|- { }:{ } + + Γ|- t: { } + -------------------------------------- :: Proj_comp + Γ|- t.lj : Tj + + + -------------------------------------------------- :: Rcd_comp_u + Γ|- { }:{ } + + Γ|- t: { } + -------------------------------------------------- :: Proj_comp_u + Γ|- t.lj : Tj + + + ------------------------------------------------------- :: Rcd_comp_lu + Γ|- { }:{ } + + Γ|- t: { } + ------------------------------------------------------- :: Proj_comp_lu + Γ|- t.lj : Tj + + + Γ |- x : { l_0:T_0 ,.., l_n-1:T_n-1 } + --------------- :: foo + Γ |- x : { li0:Ti0 ,.., lin-1:Tin-1 } + + +defn +|- T :: :: Tyfoo :: Tyfoo_ by + +|- { } +|- { LA:int } +|- { LA:int, LB:bool } +|- { LA:int, LB:bool, LC : bool } +|- { l1:T1,..,lm:Tm } +|- { l:T , l1:T1,..,lm:Tm } +|- { l1:T1,..,lm:Tm, l:T } +|- { l1:T1,..,lm:Tm,l:T,l1':T1',..,ln':Tn' } +--------------------------------------------- :: 1 + |- int + +|- { } +|- { l:T , } +|- { , l:T } +|- { ,l:T, } +-------------------------------- :: 2 +|- int