trishume · trishume · Mar 29, 2020 · Nov 25, 2019 · Nov 27, 2019 · Nov 28, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -26,8 +26,11 @@ cache:
 script:
   - cargo build
   - cargo test --features metadata
+  # Run these tests in release mode since they're slow as heck otherwise
+  - cargo test --features default-fancy --no-default-features --release
   - make assets
   - make syntest
+  - make syntest-fancy
   - rm -Rf examples
   # Only run doc build on stable until this is fixed: https://github.com/rust-lang/rust/issues/51661
   - |

diff --git a/Cargo.toml b/Cargo.toml
@@ -20,6 +20,7 @@ exclude = [
 [dependencies]
 yaml-rust = { version = "0.4", optional = true }
 onig = { version = "5.0", optional = true }
+fancy-regex = { version = "0.3.2", optional = true }
 walkdir = "2.0"
 regex-syntax = { version = "0.6", optional = true }
 lazy_static = "1.0"
@@ -55,15 +56,20 @@ dump-create = ["flate2/default", "bincode"]
 # Pure Rust dump creation, worse compressor so produces larger dumps than dump-create
 dump-create-rs = ["flate2/rust_backend", "bincode"]
 
-parsing = ["onig", "regex-syntax", "fnv"]
+regex-fancy = ["fancy-regex"]
+regex-onig = ["onig"]
+parsing = ["regex-syntax", "fnv"]
 # Support for .tmPreferenes metadata files (indentation, comment syntax, etc)
 metadata = ["parsing"]
 # The `assets` feature enables inclusion of the default theme and syntax packages.
 # For `assets` to do anything, it requires one of `dump-load-rs` or `dump-load` to be set.
 assets = []
 html = ["parsing", "assets"]
 yaml-load = ["yaml-rust", "parsing"]
-default = ["parsing", "assets", "html", "yaml-load", "dump-load", "dump-create"]
+default-onig = ["parsing", "assets", "html", "yaml-load", "dump-load", "dump-create", "regex-onig"]
+# In order to switch to the fancy-regex engine, disable default features then add the default-fancy feature
+default-fancy = ["parsing", "assets", "html", "yaml-load", "dump-load", "dump-create", "regex-fancy"]
+default = ["default-onig"]
 
 # [profile.release]
 # debug = true

diff --git a/Makefile b/Makefile
@@ -26,6 +26,13 @@ syntest: $(SUBMODULES)
 	cargo run --release --example syntest -- testdata/Packages testdata/Packages --summary | diff -U 1000000 testdata/known_syntest_failures.txt -
 	@echo No new failures!
 
+syntest-fancy: $(SUBMODULES)
+	@echo Tip: Run make update-known-failures to update the known failures file.
+	cargo run --features default-fancy --no-default-features --release --example syntest -- testdata/Packages testdata/Packages --summary | diff -U 1000000 testdata/known_syntest_failures_fancy.txt -
+	@echo No new failures!
+
 update-known-failures: $(SUBMODULES)
 	cargo run --release --example syntest -- testdata/Packages testdata/Packages --summary | tee testdata/known_syntest_failures.txt
 
+update-known-failures-fancy: $(SUBMODULES)
+	cargo run --features default-fancy --no-default-features --release --example syntest -- testdata/Packages testdata/Packages --summary | tee testdata/known_syntest_failures_fancy.txt
diff --git a/Readme.md b/Readme.md
@@ -88,20 +88,19 @@ for line in LinesWithEndings::from(s) {
 
 ## Performance
 
-Currently `syntect` is one of the faster syntax highlighting engines, but not the fastest. The following perf features are done and to-be-done:
+Currently `syntect` is one of the faster syntax highlighting engines, but not the fastest. The following perf features are done:
 
 - [x] Pre-link references between languages (e.g `<script>` tags) so there are no tree traversal string lookups in the hot-path
 - [x] Compact binary representation of scopes to allow quickly passing and copying them around
 - [x] Determine if a scope is a prefix of another scope using bit manipulation in only a few instructions
 - [x] Cache regex matches to reduce number of times oniguruma is asked to search a line
 - [x] Accelerate scope lookups to reduce how much selector matching has to be done to highlight a list of scope operations
-- [x] Lazily compile regexes so startup time isn't taken compiling a thousand regexs for Actionscript that nobody will use
-- [ ] Use a better regex engine, perhaps the in progress fancy-regex crate
-- [ ] Parallelize the highlighting. Is this even possible? Would it help? To be determined.
+- [x] Lazily compile regexes so startup time isn't taken compiling a thousand regexes for Actionscript that nobody will use
+- [ ] Optionally use the fancy-regex crate. Unfortunately this isn't yet faster than oniguruma on our benchmarks but it might be in the future.
 
 The current perf numbers are below.
 These numbers may get better if more of the things above are implemented, but they're better than many other text editors.
-All measurements were taken on a mid 2012 15" retina Macbook Pro.
+All measurements were taken on a mid 2012 15" retina Macbook Pro, my new 2019 Macbook takes about 70% of these times.
 
 - Highlighting 9200 lines/247kb of jQuery 2.1 takes 600ms. For comparison:
     - Textmate 2, Spacemacs and Visual Studio Code all take around 2ish seconds (measured by hand with a stopwatch, hence approximate).
@@ -125,6 +124,30 @@ In particular, it is possible to use the highlighting component of syntect witho
 
 For more information on available features, see the features section in `Cargo.toml`.
 
+## Pure Rust `fancy-regex` mode, without `onig`
+
+Since 4.0 `syntect` offers an alternative pure-rust regex engine based on the [fancy-regex](https://github.com/fancy-regex/fancy-regex) engine which extends the awesome [regex crate](https://github.com/rust-lang/regex) with support for fancier regex features that Sublime syntaxes need like lookaheads.
+
+The advantage of `fancy-regex` is that it does not require the [onig crate](https://github.com/rust-onig/rust-onig) which requires building and linking the Oniguruma C library. Many users experience difficulty building the `onig` crate, especially on Windows and Webassembly. The `onig` crate also recently added a requirement on `bindgen` which needs Clang/LLVM and thus makes it even harder to build. The `bindgen` dependency [may eventually be removed](https://github.com/rust-onig/rust-onig/pull/126) but even if it is, a pure-Rust build still makes things better for Webassembly and systems without a C compiler.
+
+As far as our tests can tell this new engine is just as correct, but it hasn't been tested as extensively in production. It also currently seems to be about **half the speed** of the default Oniguruma engine, although further testing and optimization (perhaps by you!) may eventually see it surpass Oniguruma's speed and become the default.
+
+To use the fancy-regex engine with syntect, add it to your `Cargo.toml` like so:
+
+```toml
+syntect = { version = "4.0", default-features = false, features = ["default-fancy"]}
+```
+
+If you want to run examples with the fancy-regex engine you can use a command line like the following:
+
+```bash
+cargo run --features default-fancy --no-default-features --release --example syncat testdata/highlight_test.erb
+```
+
+Due to the way Cargo features work, if any crate you depend on depends on `syntect` without enabling `fancy-regex` then you'll get the default `onig` mode.
+
+**Note:** The `fancy-regex` engine is *absurdly* slow in debug mode, because the regex engine (the main hot spot of highlighting) is now in Rust instead of C that's always built with optimizations. Consider using release mode or `onig` when testing.
+
 ## Caching
 
 Because `syntect`'s API exposes internal cacheable data structures, there is a caching strategy that text editors can use that allows the text on screen to be re-rendered instantaneously regardless of the file size when a change is made after the initial highlight.
@@ -219,7 +242,7 @@ Below is a list of projects using Syntect, in approximate order by how long they
 ## License and Acknowledgements
 
 Thanks to [Robin Stocker](https://github.com/robinst) and also [Keith Hall](https://github.com/keith-hall) for making awesome substantial contributions of the most important impressive improvements `syntect` has had post-`v1.0`!
-They deserve lots of credit for where `syntect` is today.
+They deserve lots of credit for where `syntect` is today. For example @robinst implemented [fancy-regex support](https://github.com/trishume/syntect/pull/270) and [a massive refactor](https://github.com/trishume/syntect/pull/182) to enable parallel highlighting using an arena. @keith-hall found and fixed many bugs and [implemented Sublime syntax test support](https://github.com/trishume/syntect/pull/44).
 
 Thanks to [Textmate 2](https://github.com/textmate/textmate) and @defuz's [sublimate](https://github.com/defuz/sublimate) for the existing open source code I used as inspiration and in the case of sublimate's `tmTheme` loader, copy-pasted.
 All code (including defuz's sublimate code) is released under the MIT license.
diff --git a/assets/default_metadata.packdump b/assets/default_metadata.packdump
diff --git a/assets/default_newlines.packdump b/assets/default_newlines.packdump
diff --git a/assets/default_nonewlines.packdump b/assets/default_nonewlines.packdump
diff --git a/src/parsing/metadata.rs b/src/parsing/metadata.rs
@@ -8,11 +8,10 @@ use std::fs::File;
 use std::io::BufReader;
 use std::str::FromStr;
 
-use lazycell::AtomicLazyCell;
-use onig::{Regex, SearchOptions};
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_json;
 
+use super::regex::Regex;
 use super::scope::{MatchPower, Scope};
 use super::super::LoadingError;
 use super::super::highlighting::settings::*;
@@ -23,13 +22,6 @@ type Dict = serde_json::Map<String, Settings>;
 /// A String representation of a `ScopeSelectors` instance.
 type SelectorString = String;
 
-/// A simple regex pattern, used for checking indentation state.
-#[derive(Debug)]
-pub struct Pattern {
-    pub regex_str: String,
-    pub regex: AtomicLazyCell<Regex>,
-}
-
 /// A collection of all loaded metadata.
 #[derive(Debug, Default, Clone, Serialize, Deserialize)]
 pub struct Metadata {
@@ -54,11 +46,11 @@ pub struct MetadataSet {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(rename_all = "camelCase")]
 pub struct MetadataItems {
-    pub increase_indent_pattern: Option<Pattern>,
-    pub decrease_indent_pattern: Option<Pattern>,
-    pub bracket_indent_next_line_pattern: Option<Pattern>,
-    pub disable_indent_next_line_pattern: Option<Pattern>,
-    pub unindented_line_pattern: Option<Pattern>,
+    pub increase_indent_pattern: Option<Regex>,
+    pub decrease_indent_pattern: Option<Regex>,
+    pub bracket_indent_next_line_pattern: Option<Regex>,
+    pub disable_indent_next_line_pattern: Option<Regex>,
+    pub unindented_line_pattern: Option<Regex>,
     pub indent_parens: Option<bool>,
     #[serde(default)]
     pub shell_variables: BTreeMap<String, String>,
@@ -377,56 +369,6 @@ impl RawMetadataEntry {
     }
 }
 
-impl Pattern {
-    pub fn is_match<S: AsRef<str>>(&self, string: S) -> bool {
-        self.regex()
-            .match_with_options(
-                string.as_ref(),
-                0,
-                SearchOptions::SEARCH_OPTION_NONE,
-                None)
-            .is_some()
-    }
-
-    pub fn regex(&self) -> &Regex {
-        if let Some(regex) = self.regex.borrow() {
-            regex
-        } else {
-            let regex = Regex::new(&self.regex_str)
-                .expect("regex string should be pre-tested");
-            self.regex.fill(regex).ok();
-            self.regex.borrow().unwrap()
-        }
-    }
-}
-
-impl Clone for Pattern {
-    fn clone(&self) -> Self {
-        Pattern { regex_str: self.regex_str.clone(), regex: AtomicLazyCell::new() }
-    }
-}
-
-impl PartialEq for Pattern {
-    fn eq(&self, other: &Pattern) -> bool {
-        self.regex_str == other.regex_str
-    }
-}
-
-impl Eq for Pattern {}
-
-impl Serialize for Pattern {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer {
-        serializer.serialize_str(&self.regex_str)
-    }
-}
-
-impl<'de> Deserialize<'de> for Pattern {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: Deserializer<'de> {
-        let regex_str = String::deserialize(deserializer)?;
-        Ok(Pattern { regex_str, regex: AtomicLazyCell::new() })
-    }
-}
-
 #[derive(Serialize, Deserialize)]
 struct MetaSetSerializable {
     selector_string: String,
@@ -525,14 +467,6 @@ mod tests {
         assert!(metadata.items.increase_indent_pattern.is_none());
     }
 
-    #[test]
-    fn serde_pattern() {
-        let pattern: Pattern = serde_json::from_str("\"just a string\"").unwrap();
-        assert_eq!(pattern.regex_str, "just a string");
-        let back_to_str = serde_json::to_string(&pattern).unwrap();
-        assert_eq!(back_to_str, "\"just a string\"");
-    }
-
     #[test]
     fn indent_rust() {
         let ps = SyntaxSet::load_from_folder("testdata/Packages/Rust").unwrap();

diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs
@@ -12,6 +12,8 @@ mod syntax_set;
 mod yaml_load;
 
 mod scope;
+#[cfg(any(feature = "parsing", feature = "yaml-load", feature = "metadata"))]
+mod regex;
 
 #[cfg(feature = "parsing")]
 pub use self::syntax_definition::SyntaxDefinition;
@@ -24,4 +26,7 @@ pub use self::parser::*;
 #[cfg(feature = "metadata")]
 pub use self::metadata::*;
 
+#[cfg(any(feature = "parsing", feature = "yaml-load", feature = "metadata"))]
+pub use self::regex::*;
+
 pub use self::scope::*;