
Fix crash on multibyte characters
Previously, parsing proceeded one byte at a time, which would crash if the
source contained multibyte characters. Instead, try all the regular
expression patterns and jump to the nearest match.
Wilfred committed Jul 19, 2021
1 parent 841dba8 commit 28d5e51
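For context, a minimal sketch of the failure mode (not difftastic's actual code): Rust string slices must start and end on character boundaries, so advancing a byte index one step at a time panics as soon as the index lands inside a multibyte character.

    fn main() {
        let src = "é + 1"; // 'é' occupies two bytes in UTF-8
        let mut i = 0;
        while i < src.len() {
            // Slicing requires `i` to be a char boundary. On the second
            // iteration i == 1, which is inside 'é', so this line panics
            // with "byte index 1 is not a char boundary".
            let _rest = &src[i..];
            i += 1; // the pre-fix lexer advanced a byte at a time
        }
    }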
Showing 2 changed files with 97 additions and 60 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Parsing
 
+Fixed a crash on parsing non-ASCII source files.
+
 Improved parsing for Rust punctuation.
 
 Improved parsing for OCaml punctuation.
155 changes: 95 additions & 60 deletions src/parse.rs
@@ -80,11 +80,7 @@ fn as_regex_vec(v: &Value) -> Vec<Regex> {
 }
 
 fn as_regex(s: &str) -> Regex {
-    let mut pattern = String::with_capacity(1 + s.len());
-    pattern.push('^');
-    pattern.push_str(s);
-
-    Regex::new(&pattern).unwrap()
+    Regex::new(&s).unwrap()
 }
 
 fn lang_from_value(name: &str, v: &Value) -> Language {
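The `^` anchor is dropped above because patterns no longer need to match exactly at the current position: the `assert_eq!(m.start(), 0)` checks disappear below, and the lexer uses the match's own byte offsets instead. A small sketch of the difference, assuming the `regex` crate (the `é123` input is hypothetical):

    use regex::Regex;

    fn main() {
        // Anchored: only matches at the very start of the haystack.
        let anchored = Regex::new(r"^[0-9]+").unwrap();
        assert!(anchored.find("é123").is_none());

        // Unanchored: find() returns the nearest match anywhere ahead,
        // together with its byte offsets, so the caller knows how far
        // to jump.
        let unanchored = Regex::new(r"[0-9]+").unwrap();
        let m = unanchored.find("é123").unwrap();
        assert_eq!((m.start(), m.end()), (2, 5)); // 'é' is two bytes
        assert_eq!(m.as_str(), "123");
    }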
Expand Down Expand Up @@ -138,6 +134,13 @@ pub fn parse<'a>(arena: &'a Arena<Syntax<'a>>, s: &str, lang: &Language) -> Vec<
parse_from(arena, s, &nl_pos, lang, &mut ParseState::new())
}

enum LexKind {
Comment,
Atom,
OpenDelimiter,
CloseDelimiter,
}

fn parse_from<'a>(
arena: &'a Arena<Syntax<'a>>,
s: &str,
@@ -147,81 +150,113 @@ fn parse_from(
 ) -> Vec<&'a Syntax<'a>> {
     let mut result: Vec<&'a Syntax<'a>> = vec![];
 
-    'outer: while state.str_i < s.len() {
+    while state.str_i < s.len() {
+        let mut current_match: Option<(LexKind, regex::Match)> = None;
+
         for pattern in &lang.comment_patterns {
             if let Some(m) = pattern.find(&s[state.str_i..]) {
-                assert_eq!(m.start(), 0);
-                let atom = Syntax::new_comment(
-                    arena,
-                    nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
-                    m.as_str(),
-                );
-                result.push(atom);
-                state.str_i += m.end();
-                continue 'outer;
+                match current_match {
+                    Some((_, prev_m)) if prev_m.start() <= m.start() => {}
+                    _ => {
+                        current_match = Some((LexKind::Comment, m));
+                    }
+                }
             }
         }
 
         for pattern in &lang.atom_patterns {
             if let Some(m) = pattern.find(&s[state.str_i..]) {
-                assert_eq!(m.start(), 0);
-                let atom = Syntax::new_atom(
-                    arena,
-                    nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
-                    m.as_str(),
-                );
-                result.push(atom);
-                state.str_i += m.end();
-                continue 'outer;
+                match current_match {
+                    Some((_, prev_m)) if prev_m.start() <= m.start() => {}
+                    _ => {
+                        current_match = Some((LexKind::Atom, m));
+                    }
+                }
             }
         }
 
         // TODO: fix duplication with previous loop
         for pattern in &lang.string_patterns {
             if let Some(m) = pattern.find(&s[state.str_i..]) {
-                assert_eq!(m.start(), 0);
-                let atom = Syntax::new_atom(
-                    arena,
-                    nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
-                    m.as_str(),
-                );
-                result.push(atom);
-                state.str_i += m.end();
-                continue 'outer;
+                match current_match {
+                    Some((_, prev_m)) if prev_m.start() <= m.start() => {}
+                    _ => {
+                        current_match = Some((LexKind::Atom, m));
+                    }
+                }
             }
         }
 
         if let Some(m) = lang.open_delimiter_pattern.find(&s[state.str_i..]) {
-            let start = state.str_i;
-
-            state.str_i += m.end();
-            let children = parse_from(arena, s, nl_pos, lang, state);
-            let (close_brace, close_pos) = state.close_brace.take().unwrap_or((
-                "UNCLOSED".into(),
-                nl_pos.from_offsets(state.str_i, state.str_i + 1),
-            ));
-
-            let open_pos = nl_pos.from_offsets(start, start + m.end());
-            let items = Syntax::new_list(
-                arena,
-                m.as_str(),
-                open_pos,
-                children,
-                &close_brace,
-                close_pos,
-            );
-            result.push(items);
-            continue;
+            match current_match {
+                Some((_, prev_m)) if prev_m.start() <= m.start() => {}
+                _ => {
+                    current_match = Some((LexKind::OpenDelimiter, m));
+                }
+            }
         };
+
         if let Some(m) = lang.close_delimiter_pattern.find(&s[state.str_i..]) {
-            state.close_brace = Some((
-                m.as_str().into(),
-                nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
-            ));
-            state.str_i += m.end();
-            return result;
+            match current_match {
+                Some((_, prev_m)) if prev_m.start() <= m.start() => {}
+                _ => {
+                    current_match = Some((LexKind::CloseDelimiter, m));
+                }
+            }
         };
+
+        match current_match {
+            Some((match_kind, m)) => match match_kind {
+                LexKind::Comment => {
+                    let atom = Syntax::new_comment(
+                        arena,
+                        nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
+                        m.as_str(),
+                    );
+                    result.push(atom);
+                    state.str_i += m.end();
+                }
+                LexKind::Atom => {
+                    let atom = Syntax::new_atom(
+                        arena,
+                        nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
+                        m.as_str(),
+                    );
+                    result.push(atom);
+                    state.str_i += m.end();
+                }
+                LexKind::OpenDelimiter => {
+                    let start = state.str_i;
+
+                    state.str_i += m.end();
+                    let children = parse_from(arena, s, nl_pos, lang, state);
+                    let (close_brace, close_pos) = state.close_brace.take().unwrap_or((
+                        "UNCLOSED".into(),
+                        nl_pos.from_offsets(state.str_i, state.str_i + 1),
+                    ));
+
+                    let open_pos = nl_pos.from_offsets(start + m.start(), start + m.end());
+                    let items = Syntax::new_list(
+                        arena,
+                        m.as_str(),
+                        open_pos,
+                        children,
+                        &close_brace,
+                        close_pos,
+                    );
+                    result.push(items);
+                }
+                LexKind::CloseDelimiter => {
+                    state.close_brace = Some((
+                        m.as_str().into(),
+                        nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
+                    ));
+                    state.str_i += m.end();
+                    return result;
+                }
+            },
+            None => break,
+        };
-        state.str_i += 1;
     }
 
     result
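Taken together, the new loop implements a "nearest match wins" scan: collect each pattern family's next match, keep the one that starts earliest, and jump straight past it. A condensed, self-contained sketch of the idea (hypothetical patterns and `lex` helper, not difftastic's real API):

    use regex::Regex;

    // Find every pattern's next match, keep the one that starts earliest,
    // and jump past it. Unmatched bytes -- including multibyte characters --
    // are skipped over without ever slicing mid-character.
    fn lex(src: &str, patterns: &[Regex]) -> Vec<String> {
        let mut tokens = vec![];
        let mut i = 0;
        while i < src.len() {
            let nearest = patterns
                .iter()
                .filter_map(|p| p.find(&src[i..]))
                .min_by_key(|m| m.start());
            match nearest {
                Some(m) => {
                    tokens.push(m.as_str().to_string());
                    // i stays on a char boundary: matches end on boundaries.
                    i += m.end();
                }
                None => break, // nothing matches in the rest of the input
            }
        }
        tokens
    }

    fn main() {
        let patterns = [
            Regex::new(r"[a-z_]+").unwrap(),
            Regex::new(r"[0-9]+").unwrap(),
        ];
        // 'λ' is two bytes; it is jumped over, never sliced through.
        assert_eq!(lex("λ foo 42", &patterns), vec!["foo", "42"]);
    }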
