|
5 | 5 |
|
6 | 6 | //! Parsing of escape sequences |
7 | 7 |
|
| 8 | +use crate::format::FormatError; |
| 9 | + |
8 | 10 | #[derive(Debug)] |
9 | 11 | pub enum EscapedChar { |
10 | 12 | /// A single byte |
@@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> { |
90 | 92 |
|
91 | 93 | // spell-checker:disable-next |
92 | 94 | /// Parse `\uHHHH` and `\UHHHHHHHH` |
93 | | -// TODO: This should print warnings and possibly halt execution when it fails to parse |
94 | | -// TODO: If the character cannot be converted to u32, the input should be printed. |
95 | | -fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> { |
96 | | - let (c, rest) = input.split_first()?; |
97 | | - let mut ret = Base::Hex.convert_digit(*c)? as u32; |
98 | | - *input = rest; |
99 | | - |
100 | | - for _ in 1..digits { |
101 | | - let (c, rest) = input.split_first()?; |
102 | | - let n = Base::Hex.convert_digit(*c)?; |
103 | | - ret = ret |
104 | | - .wrapping_mul(Base::Hex.as_base() as u32) |
105 | | - .wrapping_add(n as u32); |
| 95 | +fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> { |
| 96 | + if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) { |
106 | 97 | *input = rest; |
| 98 | + let ret = new_digits |
| 99 | + .iter() |
| 100 | + .map(|c| Base::Hex.convert_digit(*c)) |
| 101 | + .collect::<Option<Vec<u8>>>() |
| 102 | + .ok_or(EscapeError::MissingHexadecimalNumber)? |
| 103 | + .iter() |
| 104 | + .map(|n| *n as u32) |
| 105 | + .reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n)) |
| 106 | + .expect("must have multiple digits in unicode string"); |
| 107 | + char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec())) |
| 108 | + } else { |
| 109 | + Err(EscapeError::MissingHexadecimalNumber) |
107 | 110 | } |
108 | | - |
109 | | - char::from_u32(ret) |
110 | 111 | } |
111 | 112 |
|
/// Represents an invalid escape sequence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EscapeError {
    /// The hexadecimal digits were well-formed, but the number they spell is
    /// not a valid `char`. Carries the raw digit bytes as they appeared in
    /// the input so the caller can report them.
    InvalidCharacters(Vec<u8>),
    /// The escape was not followed by the required hexadecimal digits: either
    /// too few bytes remained or one of them was not a hexadecimal digit.
    MissingHexadecimalNumber,
}
115 | 119 |
|
116 | 120 | /// Parse an escape sequence, like `\n` or `\xff`, etc. |
117 | 121 | pub fn parse_escape_code( |
118 | 122 | rest: &mut &[u8], |
119 | 123 | zero_octal_parsing: OctalParsing, |
120 | | -) -> Result<EscapedChar, EscapeError> { |
| 124 | +) -> Result<EscapedChar, FormatError> { |
121 | 125 | if let [c, new_rest @ ..] = rest { |
122 | 126 | // This is for the \NNN syntax for octal sequences. |
123 | 127 | // Note that '0' is intentionally omitted because that |
@@ -145,17 +149,89 @@ pub fn parse_escape_code( |
145 | 149 | if let Some(c) = parse_code(rest, Base::Hex) { |
146 | 150 | Ok(EscapedChar::Byte(c)) |
147 | 151 | } else { |
148 | | - Err(EscapeError {}) |
| 152 | + Err(FormatError::MissingHex) |
149 | 153 | } |
150 | 154 | } |
151 | 155 | b'0' => Ok(EscapedChar::Byte( |
152 | 156 | parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'), |
153 | 157 | )), |
154 | | - b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))), |
155 | | - b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))), |
| 158 | + b'u' => match parse_unicode(rest, 4) { |
| 159 | + Ok(c) => Ok(EscapedChar::Char(c)), |
| 160 | + Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex), |
| 161 | + Err(EscapeError::InvalidCharacters(chars)) => { |
| 162 | + Err(FormatError::InvalidCharacter('u', chars)) |
| 163 | + } |
| 164 | + }, |
| 165 | + b'U' => match parse_unicode(rest, 8) { |
| 166 | + Ok(c) => Ok(EscapedChar::Char(c)), |
| 167 | + Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex), |
| 168 | + Err(EscapeError::InvalidCharacters(chars)) => { |
| 169 | + Err(FormatError::InvalidCharacter('U', chars)) |
| 170 | + } |
| 171 | + }, |
156 | 172 | c => Ok(EscapedChar::Backslash(*c)), |
157 | 173 | } |
158 | 174 | } else { |
159 | 175 | Ok(EscapedChar::Byte(b'\\')) |
160 | 176 | } |
161 | 177 | } |
| 178 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests for the hexadecimal digit parser behind `\u` and `\U`.
    mod parse_unicode {
        use super::*;

        /// Runs `parse_unicode` over `bytes`, asking for `digits` digits.
        fn parse(bytes: &[u8], digits: u8) -> Result<char, EscapeError> {
            let mut cursor = bytes;
            parse_unicode(&mut cursor, digits)
        }

        #[test]
        fn parse_ascii() {
            // Lower- and upper-case digits, with and without leading zeros.
            assert_eq!(parse(b"2a", 2), Ok('*'));
            assert_eq!(parse(b"002A", 4), Ok('*'));
        }

        #[test]
        fn parse_emoji_codepoint() {
            assert_eq!(parse(b"0001F60A", 8), Ok('\u{1F60A}'));
        }

        #[test]
        fn no_characters() {
            let outcome = parse(b"", 8);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        #[test]
        fn incomplete_hexadecimal_number() {
            // Three bytes available, four requested.
            let outcome = parse(b"123", 4);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        #[test]
        fn invalid_hex() {
            // Right length, but not every byte is a hexadecimal digit.
            let outcome = parse(b"duck", 4);
            assert_eq!(outcome, Err(EscapeError::MissingHexadecimalNumber));
        }

        #[test]
        fn surrogate_code_point() {
            // Valid hexadecimal, but surrogates are not valid `char`s.
            let outcome = parse(b"d800", 4);
            let expected = EscapeError::InvalidCharacters(b"d800".to_vec());
            assert_eq!(outcome, Err(expected));
        }
    }
}
0 commit comments