From b2f6c9bed542f174fcc32339a48704ee9a7a2db6 Mon Sep 17 00:00:00 2001
From: Jad Ghalayini <jad.ghalayini@protonmail.com>
Date: Tue, 23 Jul 2024 20:40:14 +0100
Subject: [PATCH] Added support for UTF-8

---
 src/auxl.ml                 |  43 ++++++++++++
 src/grammar_lexer.mll       |  12 ++--
 src/grammar_pp.ml           |  47 ++++++++-----
 tests/test10_unicode.ott    |  45 ++++++++++++
 tests/test17.12_unicode.ott | 136 ++++++++++++++++++++++++++++++++++++
 5 files changed, 260 insertions(+), 23 deletions(-)
 create mode 100644 tests/test10_unicode.ott
 create mode 100644 tests/test17.12_unicode.ott

diff --git a/src/auxl.ml b/src/auxl.ml
index 88ee363..3e3905e 100644
--- a/src/auxl.ml
+++ b/src/auxl.ml
@@ -215,6 +215,49 @@ let string_of_char_list ts =
   List.iter (Buffer.add_char s) ts;
   Buffer.contents s
 
+exception Not_UTF8
+
+let utf8_cp_list_of_string s =
+  let n = String.length s in
+  let rec f i = if i=n then [] else 
+    let c = int_of_char (String.get s i) in
+    if c land 0x80 = 0x80 then
+      if c land 0x40 = 0x40 then
+        if c land 0x20 = 0x20 then
+          if c land 0x10 = 0x10 then
+            (* First byte starts with 1111; codepoint is 4 bytes long *)
+            if i+3 >= n then
+              raise Not_UTF8
+            else
+              let c = (c land 0x0F) lsl 18 
+                + (int_of_char (String.get s (i+1)) land 0x3F) lsl 12 
+                + (int_of_char (String.get s (i+2)) land 0x3F) lsl 6 
+                + (int_of_char (String.get s (i+3)) land 0x3F) in
+              c :: f (i+4)
+          else
+            (* First byte starts with 1110; codepoint is 3 bytes long *)
+            if i+2 >= n then
+              raise Not_UTF8
+            else
+              let c = (c land 0x0F) lsl 12 
+                + (int_of_char (String.get s (i+1)) land 0x3F) lsl 6 
+                + (int_of_char (String.get s (i+2)) land 0x3F) in
+              c :: f (i+3)
+        else
+          (* First byte starts with 110; codepoint is 2 bytes long *)
+          if i+1 >= n then
+            raise Not_UTF8
+          else
+            let c = (c land 0x1F) lsl 6 + (int_of_char (String.get s (i+1)) land 0x3F) in
+            c :: f (i+2)
+      else
+        (* First byte starts with 10: invalid UTF 8 since leading byte cannot start with 10 *)
+        raise Not_UTF8
+    else
+      c :: f (i+1) 
+    in
+  f 0
+
 (* tests to identify strings *)
 
 exception Not_alpha
diff --git a/src/grammar_lexer.mll b/src/grammar_lexer.mll
index 46b7a31..5d94130 100644
--- a/src/grammar_lexer.mll
+++ b/src/grammar_lexer.mll
@@ -199,22 +199,24 @@ let get_first_word_and_rest =
 let go_back n f lexbuf =
   lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - n;
   my_lexer_state := f
-
 } 
 
+(* remember: these codepoints are in decimal, not octal *)
 let whitespace = (' ' | '\t' | '\010' | '\012' | '\013')
 let newline = ('\010' | '\013' | "\013\010")
 let non_newline = [^ '\010' '\013']
 let non_newline_whitespace = (' ' | '\t' |'\012')
-(* let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "-" | "'")* *)
-let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")*
-let maybe_quoted_ident = pre_ident | ("'" pre_ident "'" ) | "''"
+(* Note: the bit at the end allows all Unicode which is not special ASCII characters *)
+let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'" | ['\128'-'\255'])+
+let maybe_quoted_ident = pre_ident
 (*
 let pre_ident_allow_minus = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "-" | "'")*
 let maybe_quoted_ident_allow_minus = pre_ident_allow_minus | ("'" pre_ident_allow_minus "'" ) | "''"
 *)
 let string = [^ ' '  '\t'  '\010'  '\012'  '\013']+
-
+(* UNICODE NOTES: The above regexes will parse unicode whitespace as identifiers. This is the same
+as what the "string" regex would do in some positions previously, and the "whitespace" regex is not
+Unicode-aware, so we view this as not breaking anything new. Nonetheless, caveat emptor. *)
 
 (* We use context-dependent lexing, lexing (eg) the elements of a *)
 (* production w.r.t. a small language so that the user rarely has to *)
diff --git a/src/grammar_pp.ml b/src/grammar_pp.ml
index a852020..531600e 100644
--- a/src/grammar_pp.ml
+++ b/src/grammar_pp.ml
@@ -797,25 +797,36 @@ let pp_tex_COMP_NAME m = "\\"^pp_tex_NAME_PREFIX m^"comp"
 let pp_tex_COMP_U_NAME m = "\\"^pp_tex_NAME_PREFIX m^"compu"
 let pp_tex_COMP_LU_NAME m = "\\"^pp_tex_NAME_PREFIX m^"complu"
 
+(* Note: as a hack, we represent unicode characters as base16 shifted forward by 16, i.e. 0, 1, 2,
+3, 4, 5, 6, 7, 8, 9, 10, a, b, c, d, e, f become  g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v*)
+let utf8_tex_escape c = 
+  if c <= 255 then
+    match char_of_int c with
+    | '_' ->"XX" 
+    | '#' -> "YY" 
+    | '\'' -> "PP" 
+    | '0' -> "Zero"
+    | '1' -> "One"
+    | '2' -> "Two"
+    | '3' -> "Three"
+    | '4' -> "Four"
+    | '5' -> "Five"
+    | '6' -> "Six"
+    | '7' -> "Seven"
+    | '8' -> "Eight"
+    | '9' -> "Nine"
+    | c -> String.make 1 c
+  else
+    let rec shex s c = 
+      let d = c mod 16 in
+      let c = c / 16 in
+      let s = (String.make 1 (char_of_int ((int_of_char 'g') + d))) ^ s in
+      if c = 0 then s else shex s c
+    in
+      shex "" c
+
 let rec tex_command_escape s = 
-  String.concat "" 
-    (List.map 
-       (fun c -> match c with
-       | '_' ->"XX" 
-       | '#' -> "YY" 
-       | '\'' -> "PP" 
-       | '0' -> "Zero"
-       | '1' -> "One"
-       | '2' -> "Two"
-       | '3' -> "Three"
-       | '4' -> "Four"
-       | '5' -> "Five"
-       | '6' -> "Six"
-       | '7' -> "Seven"
-       | '8' -> "Eight"
-       | '9' -> "Nine"
-       | _ -> String.make 1 c) 
-       (Auxl.char_list_of_string s))
+  String.concat "" (List.map utf8_tex_escape (Auxl.utf8_cp_list_of_string s))
 
 and tex_rule_name m ntr = "\\"^pp_tex_NAME_PREFIX m^tex_command_escape ntr 
 and tex_drule_name m s = "\\"^pp_tex_NAME_PREFIX m^"drule"^tex_command_escape s
diff --git a/tests/test10_unicode.ott b/tests/test10_unicode.ott
new file mode 100644
index 0000000..707a87d
--- /dev/null
+++ b/tests/test10_unicode.ott
@@ -0,0 +1,45 @@
+metavar var, x ::=   {{ com  term variable }} 
+{{ isa string}} {{ coq nat}} {{ hol string}} {{ lem string }} {{ coq-equality }}
+{{ ocaml int}} {{ lex alphanum}} {{ tex \mathit{[[var]]} }}
+
+grammar
+项 {{tex \tau}}, t :: 't_' ::=                               {{ com term    }}
+  | x            ::   :: var                   {{ com variable}}
+  | λ x . t      ::   :: lam (+ bind x in t +) {{ com lambda  }}
+  | t t'         ::   :: app                   {{ com app     }}
+  | ( t )        :: S :: paren                 {{ icho [[t]]  }}  {{ lem [[t]] }}
+  | { t / x } t' :: M :: sub  
+                         {{ icho (tsubst_term [[t]] [[x]] [[t']])}}
+  {{ lem (tsubst_term [[t]] [[x]] [[t']]) }}
+val,v :: 'v_' ::=                            {{ com value   }}
+  | λ x . t      ::   :: lam                   {{ com lambda  }}
+
+terminals :: 'terminals_' ::=
+  | λ            ::   :: lambda  {{ tex \lambda }}
+  | -->          ::   :: red     {{ tex \longrightarrow }}
+
+subrules
+  val <:: 项
+
+substitutions
+  single 项 var :: tsubst 
+
+defns
+Jop :: '' ::=
+
+ defn
+ t1 --> t2 :: ::reduce::'' {{ com $[[t1]]$ reduces to $[[t2]]$ }} {{ lemwcf  witness type reduce_witness; check reduce_check; eval : input -> output }} by
+
+
+    --------------------------  :: ax_app
+    (λx.t1) v2 -->  {v2/x}t1
+
+    t1 --> t1'
+    -------------- :: ctx_app_fun
+    t1 t --> t1' t
+
+    t1 --> t1'
+    -------------- :: ctx_app_arg
+    v t1 --> v t1'
+
+
diff --git a/tests/test17.12_unicode.ott b/tests/test17.12_unicode.ott
new file mode 100644
index 0000000..5cea56e
--- /dev/null
+++ b/tests/test17.12_unicode.ott
@@ -0,0 +1,136 @@
+% test17.12.ott  sundry list form test
+
+embed
+{{ coq 
+(** some concrete labels *)
+Definition LA : nat := 1.
+Definition LB : nat := 2.
+Definition LC : nat := 3.
+}}
+
+metavar typevar, X ::=
+ {{ isa string }} {{ coq nat }} {{ coq-equality }} {{ hol string }} {{ lex Alphanum }}  
+ {{ tex \mathit{[[typevar]]} }} {{ com  type variable  }}  
+ {{ isavar ''[[typevar]]'' }} {{ texvar \mathrm{[[typevar]]} }} 
+
+metavar termvar, x ::=
+ {{ isa string }} {{ coq nat }} {{ hol string }}  {{ coq-equality }} {{ lex alphanum }}  
+ {{ tex \mathit{[[termvar]]} }} {{ com  term variable  }} 
+ {{ isavar ''[[termvar]]'' }} {{ texvar \mathrm{[[termvar]]} }} 
+
+metavar label, l, k ::=
+ {{ isa string }} {{ coq nat }} {{ hol string }} {{ lex alphanum }}  {{ tex \mathit{[[label]]} }}  {{ isavar ''[[label]]'' }} {{ holvar "[[label]]" }}
+ {{ com  field label  }}
+
+indexvar index, i, j, n, m  ::= {{ isa nat }} {{ coq nat }} {{ hol num }} {{ lex numeral }}
+  {{ com indices }}
+
+
+grammar
+
+T {{ hol Typ }}, S, U :: 'T_' ::=                    {{ com type  }}
+  | X                                :: :: Var          {{ com type variable }}  
+  | { l1 : T1 , .. , ln : Tn }      :: :: Rec           {{ com record }}           
+  | [ l0 : T0 , .. , ln-1 : Tn-1 ]      :: :: ZRec           {{ com record }}           
+  | int :: :: int
+  | bool :: :: bool
+
+t :: 't_' ::=                                                      {{ com  term  }}
+  | x                                     :: :: Var                 {{ com variable }}         
+%  | { l1 = t1 ,  .. , ln = tn }           :: :: Rec             {{ com record  --- dots }}
+%  | { </ li = ti // i /> }                :: :: Rec_comp_none   {{ com record --- comp }}
+%  | { </ li = ti // , // i /> }           :: :: Rec_comp_some   {{ com record --- comp with terminal }}
+%  | { </ li = ti // i IN n /> }           :: :: Rec_comp_u_none {{ com record --- compu }} 
+%  | { </ li = ti // , // i IN n /> }      :: :: Rec_comp_u_some {{ com record --- compu with terminal }} 
+%  | { </ li = ti // i IN 1 .. n /> }      :: :: Rec_comp_lu_none{{ com record --- complu }}  
+  | { </ li = ti // , // i IN 1 .. n /> } :: :: Rec_comp_lu_some {{ com record --- complu with terminal }}  
+  | [ </ li = ti // , // i IN 0 .. n-1 /> ] :: :: ZRec_comp_lu_some {{ com record --- complu with terminal }}  
+  | t . l                             :: :: Proj                     {{ com projection }} 
+  | true                              :: :: True {{ com True }}
+  | false                             :: :: False {{ com False }}
+
+Γ {{ tex \Gamma }}, Δ {{ tex \Delta }} :: 'G_' ::= {{ com type environment }}
+  | empty                            ::   :: empty       
+  | Γ , X <: T                       ::   :: type        
+  | Γ , x : T                        ::    :: term
+%  | Γ , Γ'                           :: M :: comma    {{ ich TODO }}
+%  | Γ1 , .. , Γn                     :: M :: dots {{ ich TODO }}
+
+
+formula :: formula_ ::=          
+  | judgement              :: :: judgement
+  | formula1 .. formulan   :: :: dots 
+%{{ isa (List.list_all (\<lambda> b . b) ( [[ formula1 .. formulan ]] ) ) }}
+
+%formula :: formula_ ::=          
+%  | preformula1 .. preformulan   :: :: realdots {{ isa (List.list_all (\<lambda> b . b) ( [[ preformula1 .. preformulan ]] ) ) }}
+
+terminals :: terminals_ ::=
+  | |-                     ::   :: turnstile {{ tex \vdash }}
+  | <:                     ::   :: subtype   {{ tex <: }}
+
+defns
+  Jtype :: '' ::= 
+
+defn
+Γ |- t : T ::  :: Ty :: Ty_  {{ com term $[[t]]$ has type $[[T]]$ }} by 
+
+ Γ|-t0:T0 .. Γ|- tn-1 : Tn-1
+ ------------------------------------- :: Rcd_dotform
+ Γ|- {l0=t0,..,ln-1=tn-1}:{l0:T0,..,ln-1:Tn-1}
+ 
+ Γ|- t:{l0:T0,..,ln-1:Tn-1}
+ ----------------------- :: Proj_dotform
+ Γ|- t.lj : Tj
+
+
+ </ Γ|-ti:Ti //i/>
+ -------------------------------------- :: Rcd_comp
+ Γ|- { </li=ti//i/> }:{ </ li:Ti //i/> }
+
+ Γ|- t: { </ li:Ti // i/> }
+ -------------------------------------- :: Proj_comp
+ Γ|- t.lj : Tj
+
+ </ Γ|-ti:Ti //i IN n/>
+ -------------------------------------------------- :: Rcd_comp_u
+ Γ|- { </li=ti//i IN n/> }:{ </ li:Ti //i IN n/> }
+
+ Γ|- t: { </ li:Ti // i IN n/> }
+ -------------------------------------------------- :: Proj_comp_u
+ Γ|- t.lj : Tj
+
+ </ Γ|-ti:Ti //i IN 0..n-1/>
+ ------------------------------------------------------- :: Rcd_comp_lu
+ Γ|- { </li=ti//i IN 0..n-1/> }:{ </ li:Ti //i IN 0..n-1/> }
+
+ Γ|- t: { </ li:Ti // i IN 0..n-1/> }
+ ------------------------------------------------------- :: Proj_comp_lu
+ Γ|- t.lj : Tj
+
+
+ Γ |- x : { l_0:T_0 ,.., l_n-1:T_n-1 }
+ --------------- :: foo
+ Γ |- x : { li0:Ti0 ,.., lin-1:Tin-1 }
+
+
+defn
+|- T ::  :: Tyfoo :: Tyfoo_ by 
+
+|- { } 
+|- { LA:int }
+|- { LA:int, LB:bool }
+|- { LA:int, LB:bool, LC : bool }
+|- { l1:T1,..,lm:Tm }
+|- { l:T , l1:T1,..,lm:Tm }
+|- { l1:T1,..,lm:Tm, l:T }
+|- { l1:T1,..,lm:Tm,l:T,l1':T1',..,ln':Tn' }
+--------------------------------------------- :: 1
+ |- int
+
+|- { </ li:Ti // i IN 1..m /> }
+|- { l:T , </ li:Ti // i IN 1..m /> }
+|- { </ li:Ti // i IN 1..m /> , l:T }
+|- { </ li:Ti // i IN 1..m /> ,l:T, </ l'j:T'j // j IN 1..n /> }
+-------------------------------- :: 2
+|- int