Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unicode link label normalization (fix test 539) #277

Merged
merged 4 commits into from
Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,7 @@ extension mechanism, and some other features. Note that the opam
package installs both the OMD library and the command line tool `omd`.")
(tags (org:ocamllabs org:mirage))
(depends (ocaml (>= 4.05))
uutf
uucp
uunf
(dune-build-info (>= 2.7))))
3 changes: 3 additions & 0 deletions omd.opam
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ bug-reports: "https://github.com/ocaml/omd/issues"
depends: [
"dune" {>= "2.7"}
"ocaml" {>= "4.05"}
"uutf"
"uucp"
"uunf"
"dune-build-info" {>= "2.7"}
"odoc" {with-doc}
]
Expand Down
1 change: 1 addition & 0 deletions src/dune
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
(library
(name omd)
(public_name omd)
(libraries uutf uucp uunf)
(flags :standard -w -30))

(rule
Expand Down
62 changes: 51 additions & 11 deletions src/parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1085,19 +1085,59 @@ let link_label allow_balanced_brackets st =
in
loop 0 false

type add_uchar_result =
{ start : bool
; seen_ws : bool
}

(* based on https://erratique.ch/software/uucp/doc/Uucp/Case/index.html#caselesseq *)
let normalize s =
let buf = Buffer.create (String.length s) in
let rec loop ~start ~seen_ws i =
if i >= String.length s then Buffer.contents buf
else
match s.[i] with
| ' ' | '\t' | '\010' .. '\013' -> loop ~start ~seen_ws:true (succ i)
| _ as c ->
if (not start) && seen_ws then Buffer.add_char buf ' ';
Buffer.add_char buf (Char.lowercase_ascii c);
loop ~start:false ~seen_ws:false (succ i)
let canonical_caseless_key s =
let b = Buffer.create (String.length s * 2) in
let to_nfd_and_utf_8 =
let n = Uunf.create `NFD in
let rec add v =
match Uunf.add n v with
| `Await | `End -> ()
| `Uchar u ->
Uutf.Buffer.add_utf_8 b u;
add `Await
in
add
in
let add_nfd =
let n = Uunf.create `NFD in
let rec add v =
match Uunf.add n v with
| `Await | `End -> ()
| `Uchar u ->
(match Uucp.Case.Fold.fold u with
| `Self -> to_nfd_and_utf_8 (`Uchar u)
| `Uchars us -> List.iter (fun u -> to_nfd_and_utf_8 (`Uchar u)) us);
add `Await
in
add
in
let uspace = `Uchar (Uchar.of_char ' ') in
let add_uchar { start; seen_ws } _ = function
| `Malformed _ ->
add_nfd (`Uchar Uutf.u_rep);
{ start = false; seen_ws = false }
| `Uchar u as uchar ->
if Uucp.White.is_white_space u then { start; seen_ws = true }
else (
if (not start) && seen_ws then add_nfd uspace;
add_nfd uchar;
{ start = false; seen_ws = false })
in
let (_ : add_uchar_result) =
Uutf.String.fold_utf_8 add_uchar { start = true; seen_ws = false } s
in
add_nfd `End;
to_nfd_and_utf_8 `End;
Buffer.contents b
in
loop ~start:true ~seen_ws:false 0
canonical_caseless_key s

let tag_name st =
match peek_exn st with
Expand Down
31 changes: 31 additions & 0 deletions tests/blackbox/normalize-label.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Case insensitive comparison
$ omd << "MD"
> [ΑΓΩ]: /url
>
> [αγω]
> MD
<p><a href="/url">αγω</a></p>

Collapse consecutive internal spaces, tabs
$ omd << "MD"
> [ΑΓ Ω]: /url
>
> [αγ ω]
> MD
<p><a href="/url">αγ ω</a></p>

Strip leading and trailing spaces, tabs
$ omd << "MD"
> [ ΑΓΩ ]: /url
>
> [αγω]
> MD
<p><a href="/url">αγω</a></p>

Doesn't match due to the internal space
$ omd << "MD"
> [ΑΓΩ]: /url
>
> [α γω]
> MD
<p>[α γω]</p>
1 change: 1 addition & 0 deletions tests/dune.inc

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/extract_tests.ml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ let protect ~finally f =
finally ();
r

let disabled = [ 206; 215; 216; 519; 539 ]
let disabled = [ 206; 215; 216; 519 ]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outstanding progress. You're demolishing these deviations. Thanks so much @tatchi! 🎉


let with_open_in fn f =
let ic = open_in fn in
Expand Down