@@ -4,96 +4,75 @@ Module: char
4
4
Utilities for manipulating the char type
5
5
*/
6
6
7
+ /*
8
+ Lu Uppercase_Letter an uppercase letter
9
+ Ll Lowercase_Letter a lowercase letter
10
+ Lt Titlecase_Letter a digraphic character, with first part uppercase
11
+ Lm Modifier_Letter a modifier letter
12
+ Lo Other_Letter other letters, including syllables and ideographs
13
+ Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
14
+ Mc Spacing_Mark a spacing combining mark (positive advance width)
15
+ Me Enclosing_Mark an enclosing combining mark
16
+ Nd Decimal_Number a decimal digit
17
+ Nl Letter_Number a letterlike numeric character
18
+ No Other_Number a numeric character of other type
19
+ Pc Connector_Punctuation a connecting punctuation mark, like a tie
20
+ Pd Dash_Punctuation a dash or hyphen punctuation mark
21
+ Ps Open_Punctuation an opening punctuation mark (of a pair)
22
+ Pe Close_Punctuation a closing punctuation mark (of a pair)
23
+ Pi Initial_Punctuation an initial quotation mark
24
+ Pf Final_Punctuation a final quotation mark
25
+ Po Other_Punctuation a punctuation mark of other type
26
+ Sm Math_Symbol a symbol of primarily mathematical use
27
+ Sc Currency_Symbol a currency sign
28
+ Sk Modifier_Symbol a non-letterlike modifier symbol
29
+ So Other_Symbol a symbol of other type
30
+ Zs Space_Separator a space character (of various non-zero widths)
31
+ Zl Line_Separator U+2028 LINE SEPARATOR only
32
+ Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
33
+ Cc Control a C0 or C1 control code
34
+ Cf Format a format control character
35
+ Cs Surrogate a surrogate code point
36
+ Co Private_Use a private-use character
37
+ Cn Unassigned a reserved unassigned code point or a noncharacter
38
+ */
39
+
40
+ import is_alphabetic = unicode:: derived_property:: Alphabetic ;
41
+ import is_XID_start = unicode:: derived_property:: XID_Start ;
42
+ import is_XID_continue = unicode:: derived_property:: XID_Continue ;
43
+
7
44
/*
8
45
Function: is_whitespace
9
46
10
- Indicates whether a character is whitespace.
47
+ Indicates whether a character is whitespace, defined in terms of
48
+ the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
49
+ 'Cc'-category control codes in the range [0x09, 0x0d].
11
50
12
- Whitespace characters include space (U+0020), tab (U+0009), line feed
13
- (U+000A), carriage return (U+000D), and a number of less common
14
- ASCII and unicode characters.
15
51
*/
16
52
pure fn is_whitespace ( c : char ) -> bool {
17
- const ch_space: char = '\u0020' ;
18
- const ch_ogham_space_mark: char = '\u1680' ;
19
- const ch_mongolian_vowel_sep: char = '\u180e' ;
20
- const ch_en_quad: char = '\u2000' ;
21
- const ch_em_quad: char = '\u2001' ;
22
- const ch_en_space: char = '\u2002' ;
23
- const ch_em_space: char = '\u2003' ;
24
- const ch_three_per_em_space: char = '\u2004' ;
25
- const ch_four_per_em_space: char = '\u2005' ;
26
- const ch_six_per_em_space: char = '\u2006' ;
27
- const ch_figure_space: char = '\u2007' ;
28
- const ch_punctuation_space: char = '\u2008' ;
29
- const ch_thin_space: char = '\u2009' ;
30
- const ch_hair_space: char = '\u200a' ;
31
- const ch_narrow_no_break_space: char = '\u202f' ;
32
- const ch_medium_mathematical_space: char = '\u205f' ;
33
- const ch_ideographic_space: char = '\u3000' ;
34
- const ch_line_separator: char = '\u2028' ;
35
- const ch_paragraph_separator: char = '\u2029' ;
36
- const ch_character_tabulation: char = '\u0009' ;
37
- const ch_line_feed: char = '\u000a' ;
38
- const ch_line_tabulation: char = '\u000b' ;
39
- const ch_form_feed: char = '\u000c' ;
40
- const ch_carriage_return: char = '\u000d' ;
41
- const ch_next_line: char = '\u0085' ;
42
- const ch_no_break_space: char = '\u00a0' ;
43
-
44
- if c == ch_space {
45
- true
46
- } else if c == ch_ogham_space_mark {
47
- true
48
- } else if c == ch_mongolian_vowel_sep {
49
- true
50
- } else if c == ch_en_quad {
51
- true
52
- } else if c == ch_em_quad {
53
- true
54
- } else if c == ch_en_space {
55
- true
56
- } else if c == ch_em_space {
57
- true
58
- } else if c == ch_three_per_em_space {
59
- true
60
- } else if c == ch_four_per_em_space {
61
- true
62
- } else if c == ch_six_per_em_space {
63
- true
64
- } else if c == ch_figure_space {
65
- true
66
- } else if c == ch_punctuation_space {
67
- true
68
- } else if c == ch_thin_space {
69
- true
70
- } else if c == ch_hair_space {
71
- true
72
- } else if c == ch_narrow_no_break_space {
73
- true
74
- } else if c == ch_medium_mathematical_space {
75
- true
76
- } else if c == ch_ideographic_space {
77
- true
78
- } else if c == ch_line_tabulation {
79
- true
80
- } else if c == ch_paragraph_separator {
81
- true
82
- } else if c == ch_character_tabulation {
83
- true
84
- } else if c == ch_line_feed {
85
- true
86
- } else if c == ch_line_tabulation {
87
- true
88
- } else if c == ch_form_feed {
89
- true
90
- } else if c == ch_carriage_return {
91
- true
92
- } else if c == ch_next_line {
93
- true
94
- } else if c == ch_no_break_space { true } else { false }
53
+ ret ( '\x09' <= c && c <= '\x0d' )
54
+ || unicode:: general_category:: Zs ( c)
55
+ || unicode:: general_category:: Zl ( c)
56
+ || unicode:: general_category:: Zp ( c) ;
57
+ }
58
+
59
+ /*
60
+ Function: is_alphanumeric
61
+
62
+ Indicates whether a character is alphanumeric, defined in terms of
63
+ the Unicode General Categories 'Nd', 'Nl', 'No' and the Derived
64
+ Core Property 'Alphabetic'.
65
+
66
+ */
67
+
68
+ pure fn is_alphanumeric ( c : char ) -> bool {
69
+ ret unicode:: derived_property:: Alphabetic ( c) ||
70
+ unicode:: general_category:: Nd ( c) ||
71
+ unicode:: general_category:: Nl ( c) ||
72
+ unicode:: general_category:: No ( c) ;
95
73
}
96
74
75
+
97
76
/*
98
77
Function: to_digit
99
78
0 commit comments