-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathuunf.mli
200 lines (163 loc) · 7.33 KB
/
uunf.mli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
(*---------------------------------------------------------------------------
Copyright (c) 2012 The uunf programmers. All rights reserved.
SPDX-License-Identifier: ISC
---------------------------------------------------------------------------*)
(** Unicode text normalization.
[Uunf] normalizes Unicode text. It supports all Unicode
normalization forms. The module is independent from any IO
mechanism or Unicode text data structure and it can process text
without a complete in-memory representation of the data.
The supported Unicode version is determined by the {!unicode_version}
value.
Consult the {{!basics}basics}, {{!limits}limitations} and
{{!examples}examples} of use.
{3 References}
{ul
{- The Unicode Consortium.
{e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}.
(latest version)}
{- Mark Davis.
{e {{:http://www.unicode.org/reports/tr15/}UAX #15 Unicode Normalization
Forms}}. (latest version)}
{- The Unicode Consortium.
{e {{:http://www.unicode.org/charts/normalization/}Normalization charts}}.
}} *)
(** {1 Normalize} *)
type form = [ `NFD | `NFC | `NFKD | `NFKC ]
(** The type for normalization forms. The [D] forms only decompose,
the [C] forms decompose and then recompose; the [K] forms use the
compatibility decomposition instead of the canonical one.
{ul
{- [`NFD] {{:http://www.unicode.org/glossary/#normalization_form_d}
normalization form D}, canonical decomposition.}
{- [`NFC] {{:http://www.unicode.org/glossary/#normalization_form_c}
normalization form C}, canonical decomposition followed by
canonical composition
({{:http://www.w3.org/TR/charmod-norm/}recommended} for the www).}
{- [`NFKD] {{:http://www.unicode.org/glossary/#normalization_form_kd}
normalization form KD}, compatibility decomposition.}
{- [`NFKC] {{:http://www.unicode.org/glossary/#normalization_form_kc}
normalization form KC}, compatibility decomposition,
followed by canonical composition.}} *)
type t
(** The type for Unicode text normalizers. A normalizer is a stateful
filter (see the {{!basics}basics}): it is obtained with {!create}, fed
with {!add} and can be reinitialized with {!reset} or duplicated with
{!copy}. *)
type ret = [ `Uchar of Uchar.t | `End | `Await ]
(** The type for normalizer results, as returned by every call to
{!add}. See {!add} for the meaning of the cases and for the protocol
the client must follow after each result. *)
val create : [< form ] -> t
(** [create nf] is a Unicode text normalizer for the normal form [nf].
Characters are then given to the normalizer with {!add}. *)
val form : t -> form
(** [form n] is the normalization form of [n], that is the form [n]
normalizes text to. *)
val add : t -> [ `Uchar of Uchar.t | `Await | `End ] -> ret
(** [add n v] is:
{ul
{- [`Uchar u] if [u] is the next character in the normalized
sequence. The client must then call [add] with [`Await]
until [`Await] is returned.}
{- [`Await] when the normalizer is ready to add a new
[`Uchar] or [`End].}}

For [v] use [`Uchar u] to add a new character to the sequence
to normalize and [`End] to signal the end of sequence. After
adding one of these two values, always call [add] with [`Await]
until [`Await] is returned.

{b Note.} The result type {!ret} also has an [`End] case which is
not described above. The {{!examples}examples} of this module handle
it like [`Await], i.e. they stop calling [add] with [`Await] when it
is returned. Presumably it is returned once [`End] was added and the
whole normalized sequence was returned (TODO confirm against the
implementation).

{b Raises.} [Invalid_argument] if [`Uchar] or [`End] is
added directly after a [`Uchar] was returned by the normalizer
or if a [`Uchar] is added after [`End] was added. *)
val reset : t -> unit
(** [reset n] resets the normalizer [n] to a state equivalent to the
state of a newly created normalizer of the same form, i.e. to the
state of [Uunf.create (Uunf.form n)]. *)
val copy : t -> t
(** [copy n] is a new normalizer that is a copy of [n] in its current
state. Subsequent {!add}s on [n] do not affect the copy. *)
val pp_ret : Format.formatter -> ret -> unit
(** [pp_ret ppf v] prints an unspecified representation of [v] on [ppf].
Since the representation is unspecified, it should not be parsed or
otherwise relied upon. *)
(** {1:props Normalization properties}
These properties are used internally to implement the normalizers.
They are not needed to use the module but are exposed as they may
be useful to implement other algorithms. *)
val unicode_version : string
(** [unicode_version] is the version of the Unicode standard supported
by the module. *)
val ccc : Uchar.t -> int
(** [ccc u] is the value of [u]'s
{{:http://www.unicode.org/glossary/#combining_class}canonical combining
class}. *)
val decomp : Uchar.t -> int array
(** [decomp u] is [u]'s
{{:http://www.unicode.org/glossary/#decomposition_mapping}decomposition
mapping}. If the empty array is returned, [u] decomposes to itself.

The first number in the array contains additional information and
cannot be used directly as a {!Uchar.t}. Use {!d_uchar} on that number
to get the actual character and {!d_compatibility} to find out if this
is a compatibility decomposition. All other numbers of the array
are guaranteed to be convertible using {!Uchar.of_int}.

{b Warning.} Do {b not} mutate the array. *)
val d_uchar : int -> Uchar.t
(** [d_uchar i] is the actual character represented by [i], where [i]
is the first number of a non-empty array returned by {!decomp}.
See {!decomp}. *)
val d_compatibility : int -> bool
(** [d_compatibility i] tells whether the decomposition is a
compatibility decomposition, where [i] is the first number of a
non-empty array returned by {!decomp}. See {!decomp}. *)
val composite : Uchar.t -> Uchar.t -> Uchar.t option
(** [composite u1 u2] is [Some c] with [c] the
{{:http://www.unicode.org/glossary/#primary_composite}primary composite}
canonically equivalent to the sequence [<u1,u2>], if any, and [None]
otherwise. *)
(** {1:limits Limitations}
A [Uunf] normalizer consumes only a small bounded amount of
memory on ordinary, {e meaningful} text. However, on legal but {e
degenerate} text like a
{{:http://www.unicode.org/glossary/#starter}starter} followed by
10'000 combining
{{:http://www.unicode.org/glossary/#nonspacing_mark}non-spacing
marks} it will have to buffer all the marks (a workaround is
to first convert your input to
{{:http://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format}stream-safe
text format}). *)
(** {1:basics Basics}
A normalizer is a stateful filter that inputs a sequence of
characters and outputs an equivalent sequence in the requested
normal form.
The function {!create} returns a new normalizer for a given normal
form:
{[
let nfd = Uunf.create `NFD
]}
To add characters to the sequence to normalize, call {!add} on
[nfd] with [`Uchar _]. To end the sequence, call {!add} on [nfd]
with [`End]. The normalized sequence of characters is returned,
character by character, by the successive calls to {!add}.
The client and the normalizer must wait on each other to limit
internal buffering: each time the client adds to the sequence by
calling {!add} with [`Uchar] or [`End] it must continue to call
{!add} with [`Await] until the normalizer returns [`Await]. In
practice this leads to the following kind of control flow:
{[
let rec add acc v = match Uunf.add nfd v with
| `Uchar u -> add (u :: acc) `Await
| `Await | `End -> acc
]}
For example, to normalize the character [U+00E9] (é) with [nfd] to a list
of characters, we can write:
{[
let e_acute = Uchar.of_int 0x00E9
let e_acute_nfd = List.rev (add (add [] (`Uchar e_acute)) `End)
]}
The next section has more examples.
*)
(** {1:examples Examples}
{2:utf8 UTF-8 normalization}
[utf_8_normalize nf s] is the UTF-8 encoded normal form [nf] of
the UTF-8 encoded string [s].
{[
let utf_8_normalize nf s =
let rec add buf normalizer v = match Uunf.add normalizer v with
| `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await
| `Await | `End -> ()
in
let rec loop buf s i max normalizer =
if i > max then (add buf normalizer `End; Buffer.contents buf) else
let dec = String.get_utf_8_uchar s i in
add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec));
loop buf s (i + Uchar.utf_decode_length dec) max normalizer
in
let buf = Buffer.create (String.length s * 3) in
let normalizer = Uunf.create nf in
loop buf s 0 (String.length s - 1) normalizer
]}
Note that this functionality is available directly through
{!Uunf_string.normalize_utf_8}.
*)