Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Kinda-working fancy-regex support #34

Closed
wants to merge 10 commits into from
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ exclude = [

[dependencies]
yaml-rust = { version = "0.4", optional = true }
onig = { version = "3.2.1", optional = true }
walkdir = "2.0"
regex-syntax = { version = "0.4", optional = true }
lazy_static = "1.0"
Expand All @@ -24,14 +23,15 @@ plist = "0.2"
bincode = { version = "1.0", optional = true }
flate2 = { version = "1.0", optional = true, default-features = false }
fnv = { version = "1.0", optional = true }
regex = "*"
fancy-regex = { git = "https://github.com/google/fancy-regex.git", optional = true }
serde = { version = "1.0", features = ["rc"] }
serde_derive = "1.0"
serde_json = "1.0"

[dev-dependencies]
criterion = "0.2"
rayon = "1.0.0"
regex = "0.2"
getopts = "0.2"
pretty_assertions = "0.5.0"

Expand All @@ -50,7 +50,7 @@ dump-create = ["flate2/default", "bincode"]
# Pure Rust dump creation, worse compressor so produces larger dumps than dump-create
dump-create-rs = ["flate2/rust_backend", "bincode"]

parsing = ["onig", "regex-syntax", "fnv"]
parsing = ["fancy-regex", "regex-syntax", "fnv"]
# The `assets` feature enables inclusion of the default theme and syntax packages.
# For `assets` to do anything, it requires one of `dump-load-rs` or `dump-load` to be set.
assets = []
Expand Down
8 changes: 6 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@

#[cfg(feature = "yaml-load")]
extern crate yaml_rust;
#[cfg(feature = "parsing")]
extern crate onig;
// extern crate onig;
extern crate walkdir;
#[cfg(feature = "parsing")]
extern crate regex_syntax;
Expand All @@ -42,6 +41,11 @@ extern crate serde_json;
#[cfg(test)]
#[macro_use]
extern crate pretty_assertions;
#[cfg(feature = "parsing")]
extern crate regex;
#[cfg(feature = "parsing")]
extern crate fancy_regex;

pub mod highlighting;
pub mod parsing;
pub mod util;
Expand Down
2 changes: 2 additions & 0 deletions src/parsing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
//! The most important struct here is `SyntaxSet`, check out the docs for that.
#[cfg(feature = "parsing")]
pub mod syntax_definition;
#[cfg(feature = "parsing")]
mod util;
#[cfg(all( feature = "parsing", feature = "yaml-load"))]
mod yaml_load;
#[cfg(feature = "parsing")]
Expand Down
178 changes: 150 additions & 28 deletions src/parsing/parser.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::syntax_definition::*;
use super::scope::*;
use onig::{MatchParam, Region, SearchOptions};
use super::util::Region;
use std::usize;
use std::collections::HashMap;
use std::i32;
Expand Down Expand Up @@ -204,7 +204,6 @@ impl ParseState {
self.first_line = false;
}

let mut regions = Region::with_capacity(8);
let fnv = BuildHasherDefault::<FnvHasher>::default();
let mut search_cache: SearchCache = HashMap::with_capacity_and_hasher(128, fnv);
// Used for detecting loops with push/pop, see long comment above.
Expand All @@ -213,7 +212,6 @@ impl ParseState {
while self.parse_next_token(line,
&mut match_start,
&mut search_cache,
&mut regions,
&mut non_consuming_push_at,
&mut res) {
}
Expand All @@ -225,7 +223,6 @@ impl ParseState {
line: &str,
start: &mut usize,
search_cache: &mut SearchCache,
regions: &mut Region,
non_consuming_push_at: &mut (usize, usize),
ops: &mut Vec<(usize, ScopeStackOp)>)
-> bool {
Expand All @@ -239,7 +236,7 @@ impl ParseState {
self.proto_starts.pop();
}

let best_match = self.find_best_match(line, *start, search_cache, regions, check_pop_loop);
let best_match = self.find_best_match(line, *start, search_cache, check_pop_loop);

if let Some(reg_match) = best_match {
if reg_match.would_loop {
Expand All @@ -253,13 +250,17 @@ impl ParseState {

// println!("pop_would_loop for match {:?}, start {}", reg_match, *start);

if *start == line.len() {
// nth(1) gets the next character if there is one. Need to do
// this instead of just += 1 because we have byte indices and
// unicode characters can be more than 1 byte.
if let Some((i, _)) = line[*start..].char_indices().nth(1) {
*start += i;
return true;
} else {
// End of line, no character to advance and no point trying
// any more patterns.
return false;
}
*start += 1;
return true;
}

let match_end = reg_match.regions.pos(0).unwrap().1;
Expand Down Expand Up @@ -298,7 +299,6 @@ impl ParseState {
line: &str,
start: usize,
search_cache: &mut SearchCache,
regions: &mut Region,
check_pop_loop: bool)
-> Option<RegexMatch> {
let cur_level = &self.stack[self.stack.len() - 1];
Expand Down Expand Up @@ -330,7 +330,7 @@ impl ParseState {
let match_pat = pat_context.match_at_mut(pat_index);

if let Some(match_region) = self.search(
line, start, match_pat, captures, search_cache, regions
line, start, match_pat, captures, search_cache
) {
let (match_start, match_end) = match_region.pos(0).unwrap();

Expand Down Expand Up @@ -376,8 +376,7 @@ impl ParseState {
start: usize,
match_pat: &mut MatchPattern,
captures: Option<&(Region, String)>,
search_cache: &mut SearchCache,
regions: &mut Region)
search_cache: &mut SearchCache)
-> Option<Region> {
// println!("{} - {:?} - {:?}", match_pat.regex_str, match_pat.has_captures, cur_level.captures.is_some());
let match_ptr = match_pat as *const MatchPattern;
Expand Down Expand Up @@ -408,18 +407,14 @@ impl ParseState {
} else {
match_pat.regex.as_ref().unwrap()
};
let matched = regex.search_with_param(line,
start,
line.len(),
SearchOptions::SEARCH_OPTION_NONE,
Some(regions),
MatchParam::default());

// If there's an error during search, treat it as non-matching.
// For example, in case of catastrophic backtracking, onig should
// fail with a "retry-limit-in-match over" error eventually.
if let Ok(Some(match_start)) = matched {
let match_end = regions.pos(0).unwrap().1;
// TODO: don't panic on regex error
// TODO: avoid catastrophic backtracking
let matched = regex.captures_from_pos(line, start).unwrap();
if let Some(captures) = matched {
let match_start = captures.pos(0).unwrap().0;
let match_end = captures.pos(0).unwrap().1;
let regions = Region::from_captures(&captures);

// this is necessary to avoid infinite looping on dumb patterns
let does_something = match match_pat.operation {
MatchOperation::None => match_start != match_end,
Expand Down Expand Up @@ -686,6 +681,26 @@ mod tests {
assert_eq!(&ops2[0..test_ops2.len()], &test_ops2[..]);
}

#[test]
/// Parses one `key: value` line with the YAML syntax loaded from
/// `testdata/Packages` and checks the exact scope stack operations emitted.
fn can_parse_yaml() {
    // Shorthand for turning a dotted scope name into a `Scope`.
    let scope = |name: &str| Scope::new(name).unwrap();

    let syntax_set = SyntaxSet::load_from_folder("testdata/Packages").unwrap();
    let yaml = syntax_set.find_syntax_by_name("YAML").unwrap();
    let mut state = ParseState::new(yaml);

    let actual = ops("key: value\n", &mut state);
    let expected = vec![
        (0, Push(scope("source.yaml"))),
        (0, Push(scope("string.unquoted.plain.out.yaml"))),
        (0, Push(scope("entity.name.tag.yaml"))),
        (3, Pop(2)),
        (3, Push(scope("punctuation.separator.key-value.mapping.yaml"))),
        (4, Pop(1)),
        (5, Push(scope("string.unquoted.plain.out.yaml"))),
        (10, Pop(1)),
    ];
    assert_eq!(actual, expected);
}

#[test]
fn can_parse_includes() {
let ps = SyntaxSet::load_from_folder("testdata/Packages").unwrap();
Expand Down Expand Up @@ -1229,6 +1244,109 @@ contexts:
expect_scope_stacks(&line, &expect, syntax);
}

#[test]
/// A pattern that combines the end-of-line anchor with an explicit newline
/// (`foo$\n`) must still apply its scope to the line `foo`.
fn can_parse_syntax_with_eol_and_newline() {
    // The YAML nesting is significant: `main` must be a child of `contexts`
    // and `scope` must belong to the same list item as `match`.
    let syntax = r#"
name: test
scope: source.test
contexts:
  main:
    - match: foo$\n
      scope: foo.newline
"#;

    let line = "foo";
    let expect = ["<source.test>, <foo.newline>"];
    expect_scope_stacks(line, &expect, syntax);
}

#[test]
/// A pattern that ends with only the end-of-line anchor (`foo$`) must apply
/// its scope to the line `foo`.
fn can_parse_syntax_with_eol_only() {
    // The YAML nesting is significant: `main` must be a child of `contexts`
    // and `scope` must belong to the same list item as `match`.
    let syntax = r#"
name: test
scope: source.test
contexts:
  main:
    - match: foo$
      scope: foo.newline
"#;

    let line = "foo";
    let expect = ["<source.test>, <foo.newline>"];
    expect_scope_stacks(line, &expect, syntax);
}

#[test]
/// Checks that `^` only matches at the real beginning of a line.
///
/// After a word is matched, an anonymous context is pushed whose first rule
/// (`^\s*$`) pops on a blank line. That rule must not fire at the end of the
/// line that contained the word; it may only fire on a later, genuinely
/// empty line.
fn can_parse_syntax_with_beginning_of_line() {
    // Nesting matters here: the `^\s*$` and `=+` rules live in the context
    // pushed by `\w+`, while `.*` is a rule of `main`. The assertions below
    // depend on that layout ("====" is only scoped `other` once the pushed
    // context has been popped).
    let syntax = r#"
name: test
scope: source.test
contexts:
  main:
    - match: \w+
      scope: word
      push:
        # this should not match at the end of the line
        - match: ^\s*$
          pop: true
        - match: =+
          scope: heading
          pop: true
    - match: .*
      scope: other
"#;

    let syntax_newlines = SyntaxDefinition::load_from_str(syntax, true, None).unwrap();
    let syntax_set = link(syntax_newlines);

    let mut state = ParseState::new(&syntax_set.syntaxes()[0]);
    // The word is scoped and the anonymous context is pushed; the blank-line
    // rule must not pop it at the end of this line.
    assert_eq!(ops("foo\n", &mut state), vec![
        (0, Push(Scope::new("source.test").unwrap())),
        (0, Push(Scope::new("word").unwrap())),
        (3, Pop(1))
    ]);
    // Still inside the pushed context, so `=+` matches as a heading and pops.
    assert_eq!(ops("===\n", &mut state), vec![
        (0, Push(Scope::new("heading").unwrap())),
        (3, Pop(1))
    ]);

    assert_eq!(ops("bar\n", &mut state), vec![
        (0, Push(Scope::new("word").unwrap())),
        (3, Pop(1))
    ]);
    // This should result in popping out of the context
    assert_eq!(ops("\n", &mut state), vec![]);
    // So now this matches other
    assert_eq!(ops("====\n", &mut state), vec![
        (0, Push(Scope::new("other").unwrap())),
        (4, Pop(1))
    ]);
}

#[test]
/// Checks that the parser can step over a multi-byte character when a
/// non-consuming push/pop pair would otherwise loop at the same position.
///
/// `main` pushes `test` on a lookahead and `test` immediately pops on the
/// same lookahead, so neither rule consumes input. The `x` that follows the
/// leading character must still end up scoped `test.good`, whether that
/// leading character is 2, 3 or 4 bytes long in UTF-8.
fn can_parse_text_with_unicode_to_skip() {
    // `main` and `test` are sibling contexts under `contexts`.
    let syntax = r#"
name: test
scope: source.test
contexts:
  main:
    - match: (?=.)
      push: test
  test:
    - match: (?=.)
      pop: true
    - match: x
      scope: test.good
"#;

    // U+03C0 GREEK SMALL LETTER PI, 2 bytes in UTF-8
    expect_scope_stacks("\u{03C0}x", &["<source.test>, <test.good>"], syntax);
    // U+0800 SAMARITAN LETTER ALAF, 3 bytes in UTF-8
    expect_scope_stacks("\u{0800}x", &["<source.test>, <test.good>"], syntax);
    // U+1F600 GRINNING FACE, 4 bytes in UTF-8
    expect_scope_stacks("\u{1F600}x", &["<source.test>, <test.good>"], syntax);
}

fn expect_scope_stacks(line_without_newline: &str, expect: &[&str], syntax: &str) {
println!("Parsing with newlines");
let line_with_newline = format!("{}\n", line_without_newline);
Expand All @@ -1243,10 +1361,7 @@ contexts:
fn expect_scope_stacks_with_syntax(line: &str, expect: &[&str], syntax: SyntaxDefinition) {
// check that each expected scope stack appears at least once while parsing the given test line

let mut syntax_set = SyntaxSet::new();
syntax_set.add_syntax(syntax);
syntax_set.link_syntaxes();

let syntax_set = link(syntax);
let mut state = ParseState::new(&syntax_set.syntaxes()[0]);

let mut stack = ScopeStack::new();
Expand All @@ -1268,6 +1383,13 @@ contexts:
}
}

/// Builds a `SyntaxSet` containing only `syntax` and links it, so tests can
/// create a `ParseState` from the set's first syntax.
fn link(syntax: SyntaxDefinition) -> SyntaxSet {
    let mut syntax_set = SyntaxSet::new();
    syntax_set.add_syntax(syntax);
    syntax_set.link_syntaxes();
    syntax_set
}

fn ops(line: &str, state: &mut ParseState) -> Vec<(usize, ScopeStackOp)> {
let ops = state.parse_line(line);
debug_print_ops(line, &ops);
Expand Down
Loading