Skip to content

Commit 0fa652a

Browse files
committed
printf: add error handling to escaped unicode characters
1 parent 04fe62b commit 0fa652a

File tree

3 files changed

+108
-28
lines changed

3 files changed

+108
-28
lines changed

src/uucore/src/lib/features/format/escape.rs

Lines changed: 97 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
//! Parsing of escape sequences
77
8+
use crate::format::FormatError;
9+
810
#[derive(Debug)]
911
pub enum EscapedChar {
1012
/// A single byte
@@ -90,34 +92,36 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
9092

9193
// spell-checker:disable-next
9294
/// Parse `\uHHHH` and `\UHHHHHHHH`
93-
// TODO: This should print warnings and possibly halt execution when it fails to parse
94-
// TODO: If the character cannot be converted to u32, the input should be printed.
95-
fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
96-
let (c, rest) = input.split_first()?;
97-
let mut ret = Base::Hex.convert_digit(*c)? as u32;
98-
*input = rest;
99-
100-
for _ in 1..digits {
101-
let (c, rest) = input.split_first()?;
102-
let n = Base::Hex.convert_digit(*c)?;
103-
ret = ret
104-
.wrapping_mul(Base::Hex.as_base() as u32)
105-
.wrapping_add(n as u32);
95+
fn parse_unicode(input: &mut &[u8], digits: u8) -> Result<char, EscapeError> {
96+
if let Some((new_digits, rest)) = input.split_at_checked(digits as usize) {
10697
*input = rest;
98+
let ret = new_digits
99+
.iter()
100+
.map(|c| Base::Hex.convert_digit(*c))
101+
.collect::<Option<Vec<u8>>>()
102+
.ok_or(EscapeError::MissingHexadecimalNumber)?
103+
.iter()
104+
.map(|n| *n as u32)
105+
.reduce(|ret, n| ret.wrapping_mul(Base::Hex.as_base() as u32).wrapping_add(n))
106+
.expect("must have multiple digits in unicode string");
107+
char::from_u32(ret).ok_or_else(|| EscapeError::InvalidCharacters(new_digits.to_vec()))
108+
} else {
109+
Err(EscapeError::MissingHexadecimalNumber)
107110
}
108-
109-
char::from_u32(ret)
110111
}
111112

112113
/// Represents an invalid escape sequence.
113-
#[derive(Debug)]
114-
pub struct EscapeError {}
114+
#[derive(Debug, PartialEq)]
115+
pub enum EscapeError {
116+
InvalidCharacters(Vec<u8>),
117+
MissingHexadecimalNumber,
118+
}
115119

116120
/// Parse an escape sequence, like `\n` or `\xff`, etc.
117121
pub fn parse_escape_code(
118122
rest: &mut &[u8],
119123
zero_octal_parsing: OctalParsing,
120-
) -> Result<EscapedChar, EscapeError> {
124+
) -> Result<EscapedChar, FormatError> {
121125
if let [c, new_rest @ ..] = rest {
122126
// This is for the \NNN syntax for octal sequences.
123127
// Note that '0' is intentionally omitted because that
@@ -145,17 +149,89 @@ pub fn parse_escape_code(
145149
if let Some(c) = parse_code(rest, Base::Hex) {
146150
Ok(EscapedChar::Byte(c))
147151
} else {
148-
Err(EscapeError {})
152+
Err(FormatError::MissingHex)
149153
}
150154
}
151155
b'0' => Ok(EscapedChar::Byte(
152156
parse_code(rest, Base::Oct(zero_octal_parsing)).unwrap_or(b'\0'),
153157
)),
154-
b'u' => Ok(EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0'))),
155-
b'U' => Ok(EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0'))),
158+
b'u' => match parse_unicode(rest, 4) {
159+
Ok(c) => Ok(EscapedChar::Char(c)),
160+
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
161+
Err(EscapeError::InvalidCharacters(chars)) => {
162+
Err(FormatError::InvalidCharacter('u', chars))
163+
}
164+
},
165+
b'U' => match parse_unicode(rest, 8) {
166+
Ok(c) => Ok(EscapedChar::Char(c)),
167+
Err(EscapeError::MissingHexadecimalNumber) => Err(FormatError::MissingHex),
168+
Err(EscapeError::InvalidCharacters(chars)) => {
169+
Err(FormatError::InvalidCharacter('U', chars))
170+
}
171+
},
156172
c => Ok(EscapedChar::Backslash(*c)),
157173
}
158174
} else {
159175
Ok(EscapedChar::Byte(b'\\'))
160176
}
161177
}
178+
179+
#[cfg(test)]
180+
mod tests {
181+
use super::*;
182+
183+
mod parse_unicode {
184+
use super::*;
185+
186+
#[test]
187+
fn parse_ascii() {
188+
let input = b"2a";
189+
assert_eq!(parse_unicode(&mut &input[..], 2), Ok('*'));
190+
191+
let input = b"002A";
192+
assert_eq!(parse_unicode(&mut &input[..], 4), Ok('*'));
193+
}
194+
195+
#[test]
196+
fn parse_emoji_codepoint() {
197+
let input = b"0001F60A";
198+
assert_eq!(parse_unicode(&mut &input[..], 8), Ok('😊'));
199+
}
200+
201+
#[test]
202+
fn no_characters() {
203+
let input = b"";
204+
assert_eq!(
205+
parse_unicode(&mut &input[..], 8),
206+
Err(EscapeError::MissingHexadecimalNumber)
207+
);
208+
}
209+
210+
#[test]
211+
fn incomplete_hexadecimal_number() {
212+
let input = b"123";
213+
assert_eq!(
214+
parse_unicode(&mut &input[..], 4),
215+
Err(EscapeError::MissingHexadecimalNumber)
216+
);
217+
}
218+
219+
#[test]
220+
fn invalid_hex() {
221+
let input = b"duck";
222+
assert_eq!(
223+
parse_unicode(&mut &input[..], 4),
224+
Err(EscapeError::MissingHexadecimalNumber)
225+
);
226+
}
227+
228+
#[test]
229+
fn surrogate_code_point() {
230+
let input = b"d800";
231+
assert_eq!(
232+
parse_unicode(&mut &input[..], 4),
233+
Err(EscapeError::InvalidCharacters(Vec::from(b"d800")))
234+
);
235+
}
236+
}
237+
}

src/uucore/src/lib/features/format/mod.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ pub enum FormatError {
7171
EndsWithPercent(Vec<u8>),
7272
/// The escape sequence `\x` appears without a literal hexadecimal value.
7373
MissingHex,
74+
/// The hexadecimal characters represent a code point that cannot represent a
75+
/// Unicode character (e.g., a surrogate code point)
76+
InvalidCharacter(char, Vec<u8>),
7477
}
7578

7679
impl Error for FormatError {}
@@ -110,6 +113,12 @@ impl Display for FormatError {
110113
Self::NoMoreArguments => write!(f, "no more arguments"),
111114
Self::InvalidArgument(_) => write!(f, "invalid argument"),
112115
Self::MissingHex => write!(f, "missing hexadecimal number in escape"),
116+
Self::InvalidCharacter(escape_char, digits) => write!(
117+
f,
118+
"invalid universal character name \\{}{}",
119+
escape_char,
120+
String::from_utf8_lossy(digits)
121+
),
113122
}
114123
}
115124
}
@@ -186,12 +195,7 @@ pub fn parse_spec_and_escape(
186195
}
187196
[b'\\', rest @ ..] => {
188197
current = rest;
189-
Some(
190-
match parse_escape_code(&mut current, OctalParsing::default()) {
191-
Ok(c) => Ok(FormatItem::Char(c)),
192-
Err(_) => Err(FormatError::MissingHex),
193-
},
194-
)
198+
Some(parse_escape_code(&mut current, OctalParsing::default()).map(FormatItem::Char))
195199
}
196200
[c, rest @ ..] => {
197201
current = rest;

src/uucore/src/lib/features/parser/num_parser.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ fn parse(
474474

475475
let ebd_result = construct_extended_big_decimal(digits, negative, base, scale, exponent);
476476

477-
// Return what has been parsed so far. It there are extra characters, mark the
477+
// Return what has been parsed so far. If there are extra characters, mark the
478478
// parsing as a partial match.
479479
if let Some((first_unparsed, _)) = chars.next() {
480480
Err(ExtendedParserError::PartialMatch(

0 commit comments

Comments
 (0)