Skip to content

Commit 4f41b7d

Browse files
authored
unicode link label normalization (fix test 539) (#277)
* unicode link label normalization (fix test 539) * address comments * use record instead of tuple * add extra tests for label normalization
1 parent 191414e commit 4f41b7d

File tree

7 files changed

+91
-12
lines changed

7 files changed

+91
-12
lines changed

dune-project

+3
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,7 @@ extension mechanism, and some other features. Note that the opam
2525
package installs both the OMD library and the command line tool `omd`.")
2626
(tags (org:ocamllabs org:mirage))
2727
(depends (ocaml (>= 4.05))
28+
uutf
29+
uucp
30+
uunf
2831
(dune-build-info (>= 2.7))))

omd.opam

+3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ bug-reports: "https://github.com/ocaml/omd/issues"
2323
depends: [
2424
"dune" {>= "2.7"}
2525
"ocaml" {>= "4.05"}
26+
"uutf"
27+
"uucp"
28+
"uunf"
2629
"dune-build-info" {>= "2.7"}
2730
"odoc" {with-doc}
2831
]

src/dune

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
(library
22
(name omd)
33
(public_name omd)
4+
(libraries uutf uucp uunf)
45
(flags :standard -w -30))
56

67
(rule

src/parser.ml

+51-11
Original file line numberDiff line numberDiff line change
@@ -1085,19 +1085,59 @@ let link_label allow_balanced_brackets st =
10851085
in
10861086
loop 0 false
10871087

1088+
type add_uchar_result =
1089+
{ start : bool
1090+
; seen_ws : bool
1091+
}
1092+
1093+
(* based on https://erratique.ch/software/uucp/doc/Uucp/Case/index.html#caselesseq *)
10881094
let normalize s =
1089-
let buf = Buffer.create (String.length s) in
1090-
let rec loop ~start ~seen_ws i =
1091-
if i >= String.length s then Buffer.contents buf
1092-
else
1093-
match s.[i] with
1094-
| ' ' | '\t' | '\010' .. '\013' -> loop ~start ~seen_ws:true (succ i)
1095-
| _ as c ->
1096-
if (not start) && seen_ws then Buffer.add_char buf ' ';
1097-
Buffer.add_char buf (Char.lowercase_ascii c);
1098-
loop ~start:false ~seen_ws:false (succ i)
1095+
let canonical_caseless_key s =
1096+
let b = Buffer.create (String.length s * 2) in
1097+
let to_nfd_and_utf_8 =
1098+
let n = Uunf.create `NFD in
1099+
let rec add v =
1100+
match Uunf.add n v with
1101+
| `Await | `End -> ()
1102+
| `Uchar u ->
1103+
Uutf.Buffer.add_utf_8 b u;
1104+
add `Await
1105+
in
1106+
add
1107+
in
1108+
let add_nfd =
1109+
let n = Uunf.create `NFD in
1110+
let rec add v =
1111+
match Uunf.add n v with
1112+
| `Await | `End -> ()
1113+
| `Uchar u ->
1114+
(match Uucp.Case.Fold.fold u with
1115+
| `Self -> to_nfd_and_utf_8 (`Uchar u)
1116+
| `Uchars us -> List.iter (fun u -> to_nfd_and_utf_8 (`Uchar u)) us);
1117+
add `Await
1118+
in
1119+
add
1120+
in
1121+
let uspace = `Uchar (Uchar.of_char ' ') in
1122+
let add_uchar { start; seen_ws } _ = function
1123+
| `Malformed _ ->
1124+
add_nfd (`Uchar Uutf.u_rep);
1125+
{ start = false; seen_ws = false }
1126+
| `Uchar u as uchar ->
1127+
if Uucp.White.is_white_space u then { start; seen_ws = true }
1128+
else (
1129+
if (not start) && seen_ws then add_nfd uspace;
1130+
add_nfd uchar;
1131+
{ start = false; seen_ws = false })
1132+
in
1133+
let (_ : add_uchar_result) =
1134+
Uutf.String.fold_utf_8 add_uchar { start = true; seen_ws = false } s
1135+
in
1136+
add_nfd `End;
1137+
to_nfd_and_utf_8 `End;
1138+
Buffer.contents b
10991139
in
1100-
loop ~start:true ~seen_ws:false 0
1140+
canonical_caseless_key s
11011141

11021142
let tag_name st =
11031143
match peek_exn st with

tests/blackbox/normalize-label.t

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Case insensitive comparison
2+
$ omd << "MD"
3+
> [ΑΓΩ]: /url
4+
>
5+
> [αγω]
6+
> MD
7+
<p><a href="/url">αγω</a></p>
8+
9+
Collapse consecutive internal spaces, tabs
10+
$ omd << "MD"
11+
> [ΑΓ Ω]: /url
12+
>
13+
> [αγ ω]
14+
> MD
15+
<p><a href="/url">αγ ω</a></p>
16+
17+
Strip leading and trailing spaces, tabs
18+
$ omd << "MD"
19+
> [ ΑΓΩ ]: /url
20+
>
21+
> [αγω]
22+
> MD
23+
<p><a href="/url">αγω</a></p>
24+
25+
Doesn't match due to the internal space
26+
$ omd << "MD"
27+
> [ΑΓΩ]: /url
28+
>
29+
> [α γω]
30+
> MD
31+
<p>[α γω]</p>

tests/dune.inc

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/extract_tests.ml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ let protect ~finally f =
88
finally ();
99
r
1010

11-
let disabled = [ 206; 215; 216; 519; 539 ]
11+
let disabled = [ 206; 215; 216; 519 ]
1212

1313
let with_open_in fn f =
1414
let ic = open_in fn in

0 commit comments

Comments
 (0)