cessen · pascalkuthe · Apr 8, 2024
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,6 +13,7 @@ exclude = ["/design/*", "/benches/*.txt", "/fuzz/**", "/.github/*"]
 
 [features]
 default = ["unicode_lines", "simd"]
+memchr = ["dep:memchr"]
 cr_lines = [] # Enable recognizing carriage returns as line breaks.
 unicode_lines = ["cr_lines"] # Enable recognizing all Unicode line breaks.
 simd = ["str_indices/simd"]
@@ -23,6 +24,7 @@ simd = ["str_indices/simd"]
 small_chunks = []
 
 [dependencies]
+memchr = { version = "2.7.2", optional = true}
 smallvec = "1.0.0"
 str_indices = { version = "0.4", default-features = false }
 

diff --git a/src/iter.rs b/src/iter.rs
@@ -70,7 +70,7 @@ use std::sync::Arc;
 use crate::slice::{RSEnum, RopeSlice};
 use crate::str_utils::{
     byte_to_line_idx, char_to_byte_idx, count_chars, count_utf16_surrogates, ends_with_line_break,
-    last_line_start_byte_idx, line_to_byte_idx, trim_line_break,
+    last_line_start_byte_idx, line_to_byte_idx, next_line_byte_idx, trim_line_break,
 };
 use crate::tree::{Count, Node, TextInfo};
 
@@ -1021,7 +1021,7 @@ impl<'a> Lines<'a> {
                 *line_idx += 1;
 
                 let head = &text[*leaf_byte_idx as usize..];
-                let mut line_len = line_to_byte_idx(head, 1);
+                let mut line_len = next_line_byte_idx(head);
 
                 // Check if the iterators needs to advance to the next chunk.
                 // During this check the number of newline (0 or 1) is yielded
@@ -1125,7 +1125,7 @@ impl<'a> Lines<'a> {
                         // This chunk contains a line break so it will contain the start of our line.
                         *text = node.children().nodes()[child_i].leaf_text();
                         // Find the end of the line within the chunk.
-                        let mut line_end = line_to_byte_idx(text, 1);
+                        let mut line_end = next_line_byte_idx(text);
                         // Check if the iterator was exhausted.
                         let ends_with_newline = if line_end >= available_bytes {
                             // Handle terminating lines without a line break properly.
@@ -1202,7 +1202,7 @@ impl<'a> Lines<'a> {
                 }
 
                 let start_idx = *byte_idx;
-                let end_idx = line_to_byte_idx(&text[start_idx..], 1) + start_idx;
+                let end_idx = next_line_byte_idx(&text[start_idx..]) + start_idx;
                 *byte_idx = end_idx;
                 *line_idx += 1;
 

diff --git a/src/str_utils.rs b/src/str_utils.rs
@@ -59,10 +59,51 @@ pub(crate) fn utf16_code_unit_to_char_idx(text: &str, utf16_idx: usize) -> usize
     str_indices::chars::from_byte_idx(text, str_indices::utf16::to_byte_idx(text, utf16_idx))
 }
 
+#[cfg(all(not(feature = "unicode_lines"), feature = "memchr"))]
+pub(crate) fn next_line_byte_idx(text: &str) -> usize {
+    #[cfg(not(feature = "cr_lines"))]
+    let offset = memchr::memchr(b'\n', text.as_bytes());
+    #[cfg(feature = "cr_lines")]
+    let offset = memchr::memchr2(b'\n', b'\r', text.as_bytes());
+    match offset {
+        #[cfg(feature = "cr_lines")]
+        Some(i) if text[i..].starts_with("\r\n") => i + 2,
+        Some(i) => i + 1,
+        None => text.len(),
+    }
+}
+
+#[cfg(all(not(feature = "unicode_lines"), feature = "memchr"))]
+pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
+    #[cfg(not(feature = "cr_lines"))]
+    let offset = memchr::memrchr(b'\n', text.as_bytes());
+    #[cfg(feature = "cr_lines")]
+    let offset = memchr::memrchr2(b'\n', b'\r', text.as_bytes());
+    // TODO: this is quite slow  (3x slower than memchar). Std just uses a naive
+    // chars match loop. Using the packed (teddy) algorithm from the aoh-corasic
+    // crate would be much faster (particularly since it reports end of line
+    // matches), unfortunately that is a bit of a heavy dependency, maybe that
+    // aglorithm will be exctracted one day.
+    #[cfg(feature = "unicode_lines")]
+    let offset = text.rfind([
+        '\n', '\r', '\u{000B}', '\u{000C}', '\u{0085}', '\u{2028}', '\u{2029}',
+    ]);
+    match offset {
+        Some(i) => i + 1,
+        None => 0,
+    }
+}
+
+#[cfg(any(feature = "unicode_lines", not(feature = "memchr")))]
+pub(crate) fn next_line_byte_idx(text: &str) -> usize {
+    line_to_byte_idx(text, 1)
+}
+
 /// Returns the byte index of the start of the last line of the passed text.
 ///
 /// Note: if the text ends in a line break, that means the last line is
 /// an empty line that starts at the end of the text.
+#[cfg(any(feature = "unicode_lines", not(feature = "memchr")))]
 pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
     let mut itr = text.bytes().enumerate().rev();
 
@@ -106,6 +147,20 @@ pub(crate) fn last_line_start_byte_idx(text: &str) -> usize {
 ///
 /// If the string doesn't end in a line break, returns the string unchanged.
 #[inline]
+#[cfg(not(any(feature = "unicode_lines", feature = "cr_lines")))]
+pub(crate) fn trim_line_break(text: &str) -> &str {
+    // hacks: we don't ned to care about crlf here
+    // since we just use the in the line iterator and want to pevent
+    // matching the same line terminator a second time
+    // #[cfg(feature = "cr_lines")]
+    text.strip_suffix('\n').unwrap_or(text)
+}
+
+/// Trims a single trailing line break (if any) off the end of the passed string.
+///
+/// If the string doesn't end in a line break, returns the string unchanged.
+#[inline]
+#[cfg(any(feature = "unicode_lines", feature = "cr_lines"))]
 pub(crate) fn trim_line_break(text: &str) -> &str {
     if text.is_empty() {
         return "";
@@ -147,6 +202,14 @@ pub(crate) fn trim_line_break(text: &str) -> &str {
 
 /// Returns whether the given string ends in a line break or not.
 #[inline]
+#[cfg(not(any(feature = "cr_lines", feature = "unicode_lines")))]
+pub(crate) fn ends_with_line_break(text: &str) -> bool {
+    text.ends_with('\n')
+}
+
+/// Returns whether the given string ends in a line break or not.
+#[inline]
+#[cfg(any(feature = "cr_lines", feature = "unicode_lines"))]
 pub(crate) fn ends_with_line_break(text: &str) -> bool {
     if text.is_empty() {
         return false;