@@ -16,21 +16,82 @@ mod prim_bool { }
16
16
17
17
#[ doc( primitive = "char" ) ]
18
18
//
19
- /// A Unicode scalar value .
19
+ /// A character type .
20
20
///
21
- /// A `char` represents a
22
- /// *[Unicode scalar
23
- /// value](http://www.unicode.org/glossary/#unicode_scalar_value)*, as it can
24
- /// contain any Unicode code point except high-surrogate and low-surrogate code
25
- /// points.
21
+ /// The `char` type represents a single character. More specifically, since
22
+ /// 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
23
+ /// scalar value]', which is similar to, but not the same as, a '[Unicode code
24
+ /// point]'.
26
25
///
27
- /// As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
28
- /// (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
29
- /// however the converse is not always true due to the above range limits
30
- /// and, as such, should be performed via the `from_u32` function.
26
+ /// [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value
27
+ /// [Unicode code point]: http://www.unicode.org/glossary/#code_point
31
28
///
32
- /// *[See also the `std::char` module](char/index.html).*
29
+ /// This documentation describes a number of methods and trait implementations on the
30
+ /// `char` type. For technical reasons, there is additional, separate
31
+ /// documentation in [the `std::char` module](char/index.html) as well.
33
32
///
33
+ /// # Representation
34
+ ///
35
+ /// `char` is always four bytes in size. This is a different representation than
36
+ /// a given character would have as part of a [`String`]. For example:
37
+ ///
38
+ /// ```
39
+ /// let v = vec!['h', 'e', 'l', 'l', 'o'];
40
+ ///
41
+ /// // five elements times four bytes for each element
42
+ /// assert_eq!(20, v.len() * std::mem::size_of::<char>());
43
+ ///
44
+ /// let s = String::from("hello");
45
+ ///
46
+ /// // five elements times one byte per element
47
+ /// assert_eq!(5, s.len() * std::mem::size_of::<u8>());
48
+ /// ```
49
+ ///
50
+ /// [`String`]: string/struct.String.html
51
+ ///
52
+ /// As always, remember that a human intuition for 'character' may not map to
53
+ /// Unicode's definitions. For example, emoji symbols such as '❤️' are more than
54
+ /// one byte when encoded as UTF-8; ❤️ in particular is six bytes:
55
+ ///
56
+ /// ```
57
+ /// let s = String::from("❤️");
58
+ ///
59
+ /// // six elements (bytes) times one byte for each element
60
+ /// assert_eq!(6, s.len() * std::mem::size_of::<u8>());
61
+ /// ```
62
+ ///
63
+ /// Because ❤️ is made up of two Unicode scalar values, it won't fit into a single
64
+ /// `char`, and so trying to create a literal with `let heart = '❤️';` gives an error:
65
+ ///
66
+ /// ```text
67
+ /// error: character literal may only contain one codepoint: '❤
68
+ /// let heart = '❤️';
69
+ /// ^~
70
+ /// ```
71
+ ///
72
+ /// Another implication of this is that if you want to do per-`char`acter
73
+ /// processing, it can end up using a lot more memory:
74
+ ///
75
+ /// ```
76
+ /// let s = String::from("love: ❤️");
77
+ /// let v: Vec<char> = s.chars().collect();
78
+ ///
79
+ /// assert_eq!(12, s.len() * std::mem::size_of::<u8>());
80
+ /// assert_eq!(32, v.len() * std::mem::size_of::<char>());
81
+ /// ```
82
+ ///
83
+ /// It may also give you results you do not expect:
84
+ ///
85
+ /// ```
86
+ /// let s = String::from("❤️");
87
+ ///
88
+ /// let mut iter = s.chars();
89
+ ///
90
+ /// // we get two chars out of a single ❤️
91
+ /// assert_eq!(Some('\u{2764}'), iter.next());
92
+ /// assert_eq!(Some('\u{fe0f}'), iter.next());
93
+ /// assert_eq!(None, iter.next());
94
+ /// ```
34
95
mod prim_char { }
35
96
36
97
#[ doc( primitive = "unit" ) ]
0 commit comments