Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for UTF-8 #110

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/auxl.ml
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,49 @@ let string_of_char_list ts =
List.iter (Buffer.add_char s) ts;
Buffer.contents s

exception Not_UTF8

let utf8_cp_list_of_string s =
let n = String.length s in
let rec f i = if i=n then [] else
let c = int_of_char (String.get s i) in
if c land 0x80 = 0x80 then
if c land 0x40 = 0x40 then
if c land 0x20 = 0x20 then
if c land 0x10 = 0x10 then
(* First byte starts with 1111; codepoint is 4 bytes long *)
if i+3 >= n then
raise Not_UTF8
else
let c = (c land 0x0F) lsl 18
+ (int_of_char (String.get s (i+1)) land 0x3F) lsl 12
+ (int_of_char (String.get s (i+2)) land 0x3F) lsl 6
+ (int_of_char (String.get s (i+3)) land 0x3F) in
c :: f (i+4)
else
(* First byte starts with 1110; codepoint is 3 bytes long *)
if i+2 >= n then
raise Not_UTF8
else
let c = (c land 0x0F) lsl 12
+ (int_of_char (String.get s (i+1)) land 0x3F) lsl 6
+ (int_of_char (String.get s (i+2)) land 0x3F) in
c :: f (i+3)
else
(* First byte starts with 110; codepoint is 2 bytes long *)
if i+1 >= n then
raise Not_UTF8
else
let c = (c land 0x1F) lsl 6 + (int_of_char (String.get s (i+1)) land 0x3F) in
c :: f (i+2)
else
(* First byte starts with 10: invalid UTF 8 since leading byte cannot start with 10 *)
raise Not_UTF8
else
c :: f (i+1)
in
f 0

(* tests to identify strings *)

exception Not_alpha
Expand Down
12 changes: 7 additions & 5 deletions src/grammar_lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -199,22 +199,24 @@ let get_first_word_and_rest =
let go_back n f lexbuf =
lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - n;
my_lexer_state := f

}

(* remember: these codepoints are in decimal, not octal *)
let whitespace = (' ' | '\t' | '\010' | '\012' | '\013')
let newline = ('\010' | '\013' | "\013\010")
let non_newline = [^ '\010' '\013']
let non_newline_whitespace = (' ' | '\t' |'\012')
(* let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "-" | "'")* *)
let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")*
let maybe_quoted_ident = pre_ident | ("'" pre_ident "'" ) | "''"
(* Note: the bit at the end allows all Unicode which is not special ASCII characters *)
let pre_ident = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'" | ['\128'-'\255'])+
let maybe_quoted_ident = pre_ident
(*
let pre_ident_allow_minus = (['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "'")(['A'-'Z'] | ['a'-'z'] | ['0'-'9'] | "_" | "-" | "'")*
let maybe_quoted_ident_allow_minus = pre_ident_allow_minus | ("'" pre_ident_allow_minus "'" ) | "''"
*)
let string = [^ ' ' '\t' '\010' '\012' '\013']+

(* UNICODE NOTES: The above regexes will parse unicode whitespace as identifiers. This is the same
as what the "string" regex would do in some positions previously, and the "whitespace" regex is not
Unicode-aware, so we view this as not breaking anything new. Nonetheless, caveat emptor. *)

(* We use context-dependent lexing, lexing (eg) the elements of a *)
(* production w.r.t. a small language so that the user rarely has to *)
Expand Down
47 changes: 29 additions & 18 deletions src/grammar_pp.ml
Original file line number Diff line number Diff line change
Expand Up @@ -797,25 +797,36 @@ let pp_tex_COMP_NAME m = "\\"^pp_tex_NAME_PREFIX m^"comp"
let pp_tex_COMP_U_NAME m = "\\"^pp_tex_NAME_PREFIX m^"compu"
let pp_tex_COMP_LU_NAME m = "\\"^pp_tex_NAME_PREFIX m^"complu"

(* Note: as a hack, we represent unicode characters as base16 shifted forward by 16, i.e. 0, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, a, b, c, d, e, f become g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v*)
let utf8_tex_escape c =
if c <= 255 then
match char_of_int c with
| '_' ->"XX"
| '#' -> "YY"
| '\'' -> "PP"
| '0' -> "Zero"
| '1' -> "One"
| '2' -> "Two"
| '3' -> "Three"
| '4' -> "Four"
| '5' -> "Five"
| '6' -> "Six"
| '7' -> "Seven"
| '8' -> "Eight"
| '9' -> "Nine"
| c -> String.make 1 c
else
let rec shex s c =
let d = c mod 16 in
let c = c / 16 in
let s = (String.make 1 (char_of_int ((int_of_char 'g') + d))) ^ s in
if c = 0 then s else shex s c
in
shex "" c

let rec tex_command_escape s =
String.concat ""
(List.map
(fun c -> match c with
| '_' ->"XX"
| '#' -> "YY"
| '\'' -> "PP"
| '0' -> "Zero"
| '1' -> "One"
| '2' -> "Two"
| '3' -> "Three"
| '4' -> "Four"
| '5' -> "Five"
| '6' -> "Six"
| '7' -> "Seven"
| '8' -> "Eight"
| '9' -> "Nine"
| _ -> String.make 1 c)
(Auxl.char_list_of_string s))
String.concat "" (List.map utf8_tex_escape (Auxl.utf8_cp_list_of_string s))

and tex_rule_name m ntr = "\\"^pp_tex_NAME_PREFIX m^tex_command_escape ntr
and tex_drule_name m s = "\\"^pp_tex_NAME_PREFIX m^"drule"^tex_command_escape s
Expand Down
45 changes: 45 additions & 0 deletions tests/test10_unicode.ott
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
metavar var, x ::= {{ com term variable }}
{{ isa string}} {{ coq nat}} {{ hol string}} {{ lem string }} {{ coq-equality }}
{{ ocaml int}} {{ lex alphanum}} {{ tex \mathit{[[var]]} }}

grammar
项 {{tex \tau}}, t :: 't_' ::= {{ com term }}
| x :: :: var {{ com variable}}
| λ x . t :: :: lam (+ bind x in t +) {{ com lambda }}
| t t' :: :: app {{ com app }}
| ( t ) :: S :: paren {{ icho [[t]] }} {{ lem [[t]] }}
| { t / x } t' :: M :: sub
{{ icho (tsubst_term [[t]] [[x]] [[t']])}}
{{ lem (tsubst_term [[t]] [[x]] [[t']]) }}
val,v :: 'v_' ::= {{ com value }}
| λ x . t :: :: lam {{ com lambda }}

terminals :: 'terminals_' ::=
| λ :: :: lambda {{ tex \lambda }}
| --> :: :: red {{ tex \longrightarrow }}

subrules
val <:: 项

substitutions
single 项 var :: tsubst

defns
Jop :: '' ::=

defn
t1 --> t2 :: ::reduce::'' {{ com $[[t1]]$ reduces to $[[t2]]$ }} {{ lemwcf witness type reduce_witness; check reduce_check; eval : input -> output }} by


-------------------------- :: ax_app
(λx.t1) v2 --> {v2/x}t1

t1 --> t1'
-------------- :: ctx_app_fun
t1 t --> t1' t

t1 --> t1'
-------------- :: ctx_app_arg
v t1 --> v t1'


136 changes: 136 additions & 0 deletions tests/test17.12_unicode.ott
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
% test17.12.ott sundry list form test

embed
{{ coq
(** some concrete labels *)
Definition LA : nat := 1.
Definition LB : nat := 2.
Definition LC : nat := 3.
}}

metavar typevar, X ::=
{{ isa string }} {{ coq nat }} {{ coq-equality }} {{ hol string }} {{ lex Alphanum }}
{{ tex \mathit{[[typevar]]} }} {{ com type variable }}
{{ isavar ''[[typevar]]'' }} {{ texvar \mathrm{[[typevar]]} }}

metavar termvar, x ::=
{{ isa string }} {{ coq nat }} {{ hol string }} {{ coq-equality }} {{ lex alphanum }}
{{ tex \mathit{[[termvar]]} }} {{ com term variable }}
{{ isavar ''[[termvar]]'' }} {{ texvar \mathrm{[[termvar]]} }}

metavar label, l, k ::=
{{ isa string }} {{ coq nat }} {{ hol string }} {{ lex alphanum }} {{ tex \mathit{[[label]]} }} {{ isavar ''[[label]]'' }} {{ holvar "[[label]]" }}
{{ com field label }}

indexvar index, i, j, n, m ::= {{ isa nat }} {{ coq nat }} {{ hol num }} {{ lex numeral }}
{{ com indices }}


grammar

T {{ hol Typ }}, S, U :: 'T_' ::= {{ com type }}
| X :: :: Var {{ com type variable }}
| { l1 : T1 , .. , ln : Tn } :: :: Rec {{ com record }}
| [ l0 : T0 , .. , ln-1 : Tn-1 ] :: :: ZRec {{ com record }}
| int :: :: int
| bool :: :: bool

t :: 't_' ::= {{ com term }}
| x :: :: Var {{ com variable }}
% | { l1 = t1 , .. , ln = tn } :: :: Rec {{ com record --- dots }}
% | { </ li = ti // i /> } :: :: Rec_comp_none {{ com record --- comp }}
% | { </ li = ti // , // i /> } :: :: Rec_comp_some {{ com record --- comp with terminal }}
% | { </ li = ti // i IN n /> } :: :: Rec_comp_u_none {{ com record --- compu }}
% | { </ li = ti // , // i IN n /> } :: :: Rec_comp_u_some {{ com record --- compu with terminal }}
% | { </ li = ti // i IN 1 .. n /> } :: :: Rec_comp_lu_none{{ com record --- complu }}
| { </ li = ti // , // i IN 1 .. n /> } :: :: Rec_comp_lu_some {{ com record --- complu with terminal }}
| [ </ li = ti // , // i IN 0 .. n-1 /> ] :: :: ZRec_comp_lu_some {{ com record --- complu with terminal }}
| t . l :: :: Proj {{ com projection }}
| true :: :: True {{ com True }}
| false :: :: False {{ com False }}

Γ {{ tex \Gamma }}, Δ {{ tex \Delta }} :: 'G_' ::= {{ com type environment }}
| empty :: :: empty
| Γ , X <: T :: :: type
| Γ , x : T :: :: term
% | Γ , Γ' :: M :: comma {{ ich TODO }}
% | Γ1 , .. , Γn :: M :: dots {{ ich TODO }}


formula :: formula_ ::=
| judgement :: :: judgement
| formula1 .. formulan :: :: dots
%{{ isa (List.list_all (\<lambda> b . b) ( [[ formula1 .. formulan ]] ) ) }}

%formula :: formula_ ::=
% | preformula1 .. preformulan :: :: realdots {{ isa (List.list_all (\<lambda> b . b) ( [[ preformula1 .. preformulan ]] ) ) }}

terminals :: terminals_ ::=
| |- :: :: turnstile {{ tex \vdash }}
| <: :: :: subtype {{ tex <: }}

defns
Jtype :: '' ::=

defn
Γ |- t : T :: :: Ty :: Ty_ {{ com term $[[t]]$ has type $[[T]]$ }} by

Γ|-t0:T0 .. Γ|- tn-1 : Tn-1
------------------------------------- :: Rcd_dotform
Γ|- {l0=t0,..,ln-1=tn-1}:{l0:T0,..,ln-1:Tn-1}

Γ|- t:{l0:T0,..,ln-1:Tn-1}
----------------------- :: Proj_dotform
Γ|- t.lj : Tj


</ Γ|-ti:Ti //i/>
-------------------------------------- :: Rcd_comp
Γ|- { </li=ti//i/> }:{ </ li:Ti //i/> }

Γ|- t: { </ li:Ti // i/> }
-------------------------------------- :: Proj_comp
Γ|- t.lj : Tj

</ Γ|-ti:Ti //i IN n/>
-------------------------------------------------- :: Rcd_comp_u
Γ|- { </li=ti//i IN n/> }:{ </ li:Ti //i IN n/> }

Γ|- t: { </ li:Ti // i IN n/> }
-------------------------------------------------- :: Proj_comp_u
Γ|- t.lj : Tj

</ Γ|-ti:Ti //i IN 0..n-1/>
------------------------------------------------------- :: Rcd_comp_lu
Γ|- { </li=ti//i IN 0..n-1/> }:{ </ li:Ti //i IN 0..n-1/> }

Γ|- t: { </ li:Ti // i IN 0..n-1/> }
------------------------------------------------------- :: Proj_comp_lu
Γ|- t.lj : Tj


Γ |- x : { l_0:T_0 ,.., l_n-1:T_n-1 }
--------------- :: foo
Γ |- x : { li0:Ti0 ,.., lin-1:Tin-1 }


defn
|- T :: :: Tyfoo :: Tyfoo_ by

|- { }
|- { LA:int }
|- { LA:int, LB:bool }
|- { LA:int, LB:bool, LC : bool }
|- { l1:T1,..,lm:Tm }
|- { l:T , l1:T1,..,lm:Tm }
|- { l1:T1,..,lm:Tm, l:T }
|- { l1:T1,..,lm:Tm,l:T,l1':T1',..,ln':Tn' }
--------------------------------------------- :: 1
|- int

|- { </ li:Ti // i IN 1..m /> }
|- { l:T , </ li:Ti // i IN 1..m /> }
|- { </ li:Ti // i IN 1..m /> , l:T }
|- { </ li:Ti // i IN 1..m /> ,l:T, </ l'j:T'j // j IN 1..n /> }
-------------------------------- :: 2
|- int