Skip to content

Commit

Permalink
Apply review
Browse files Browse the repository at this point in the history
  • Loading branch information
raskad committed Jan 5, 2024
1 parent fe46677 commit c4cb92e
Showing 1 changed file with 5 additions and 10 deletions.
15 changes: 5 additions & 10 deletions core/parser/src/source/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,26 @@ impl<R: Read> ReadChar for UTF8Input<R> {
fn next_char(&mut self) -> io::Result<Option<u32>> {
// Decode UTF-8
let x = match self.next_byte()? {
Some(b) if b < 128 => return Ok(Some(u32::from(b))),
Some(b) => b,
None => return Ok(None),
Some(b) if b >= 128 => b, // UTF-8 codepoint
b => return Ok(b.map(u32::from)), // ASCII or None
};

// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(self.next_byte()?);
let y = self.next_byte()?.unwrap_or(0);
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(self.next_byte()?);
let z = self.next_byte()?.unwrap_or(0);
let y_z = utf8_acc_cont_byte(u32::from(y & CONT_MASK), z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(self.next_byte()?);
let w = self.next_byte()?.unwrap_or(0);
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
};
Expand All @@ -71,7 +70,3 @@ fn utf8_first_byte(byte: u8, width: u32) -> u32 {
/// Folds one more byte into the accumulated value `ch`.
///
/// The accumulator is shifted left by six bits and the bits of `byte`
/// selected by `CONT_MASK` are OR-ed into the vacated low positions.
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    // Presumably `CONT_MASK` keeps the payload bits of a UTF-8 continuation
    // byte; its definition is not visible here, so verify against the constant.
    let payload = u32::from(byte & CONT_MASK);
    let shifted = ch << 6;
    shifted | payload
}

/// Returns the byte held in `opt`, or `0` when `opt` is `None`.
fn unwrap_or_0(opt: Option<u8>) -> u8 {
    // `u8::default()` is 0, so this matches an explicit `unwrap_or(0)`.
    opt.unwrap_or_default()
}

0 comments on commit c4cb92e

Please sign in to comment.