diff --git a/Cargo.toml b/Cargo.toml index 1584933..2f759d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "whitespace-sifter" -version = "2.2.0" +version = "2.3.0" edition = "2021" authors = ["JumperBot_"] description = "Sift duplicate whitespaces away!" diff --git a/README.md b/README.md index fb65009..da1dbde 100644 --- a/README.md +++ b/README.md @@ -32,15 +32,13 @@ println!( ## ✨ Sift Duplicate Whitespaces In One Function Call This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`. -Other than that, it naturally removes the whitespaces at the start and end of the `string`. +It naturally removes the whitespaces at the start and end of the `string`. --- ## ⚡️Benchmarks -Performance is one of the priorities of this crate. -One of the advises is to not listen to repository authors/maintainers when it comes to benchmarks. -You are free to run `cargo bench` on your machine after cloning this repository instead. +Performance is a priority; Most updates are performance improvements. The benchmark uses a transcript of the [Bee Movie](https://movies.fandom.com/wiki/Bee_Movie/Transcript). Execute these commands to benchmark: @@ -54,13 +52,20 @@ $ cargo bench You should only look for results that look like the following: ```bash -Sift/Sift time: [176.65 µs 177.11 µs 177.73 µs] +Sift/Sift time: [159.31 µs 159.60 µs 159.95 µs] Sift Preserved/Sift Preserved - time: [242.64 µs 243.04 µs 243.79 µs] + time: [198.11 µs 198.21 µs 198.32 µs] ``` In just 0.0001 seconds; Pretty impressive, no? -Go try it on a better machine, I guess. +
+Go try it on a better machine, I guess. +Benchmark specifications: +
--- diff --git a/src/lib.rs b/src/lib.rs index 83aa9c4..f327526 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ //! Sift duplicate whitespaces away in just one function call. //! This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`. -//! Other than that, it naturally removes the whitespaces at the start and end of the `string`. +//! It naturally removes the whitespaces at the start and end of the `string`. //! //! # Examples //! @@ -20,6 +20,10 @@ //! ); //! ``` +mod unsafe_vec; + +use unsafe_vec::UnsafeVec; + /// A trait containing all `string` whitespace-sifting functions. pub trait WhitespaceSifter: AsRef { /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef`. @@ -46,16 +50,16 @@ pub trait WhitespaceSifter: AsRef { while ind < bytes.len() { crate::sift_preallocated_until_newline(bytes, &mut ind, &mut out); } - let out_mut: &mut Vec = unsafe { out.as_mut_vec() }; - if out_mut.len() > 1 { - if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(2)) } == CARRIAGE_RETURN - { - out_mut.pop(); - out_mut.pop(); + if out.len() > 1 { + let out_mut: &mut Vec = unsafe { out.as_mut_vec() }; + let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(2) }; + if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == CARRIAGE_RETURN { + unsafe { out_mut.set_len(new_out_mut_len) }; return out; } - if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(1)) } == LINE_FEED { - out_mut.pop(); + let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) }; + if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == LINE_FEED { + unsafe { out_mut.set_len(new_out_mut_len) }; } } out @@ -67,25 +71,7 @@ impl> WhitespaceSifter for T {} /// A utility for `sift`. 
fn sift_preallocated(bytes: &[u8], out: &mut String) { let mut ind: usize = 0; - // Implementation of str::trim_start() - while ind < bytes.len() { - match get_char_metadata(*unsafe { bytes.get_unchecked(ind) }) { - Character::SingleByte { data } => { - ind = unsafe { ind.unchecked_add(1) }; - if !is_ascii_whitespace(data) { - unsafe { out.as_mut_vec() }.push(data); - break; - } - } - Character::MultiByte { len } => { - let new_ind: usize = unsafe { ind.unchecked_add(len) }; - unsafe { out.as_mut_vec() } - .extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) }); - ind = new_ind; - break; - } - } - } + sift_trim_start(bytes, &mut ind, out); // Actual sifting let mut is_last_whitespace: bool = false; let mut is_last_carriage_return: bool = false; @@ -96,8 +82,9 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) { ind = unsafe { ind.unchecked_add(1) }; if is_ascii_whitespace(data) { if data == LINE_FEED && is_last_carriage_return { - #[allow(clippy::cast_possible_truncation)] - unsafe { out.as_mut_vec() }.push(LINE_FEED); + unsafe { + out.as_mut_vec().unsafe_push(LINE_FEED); + } is_last_carriage_return = false; is_last_carriage_return_line_feed = true; continue; @@ -109,17 +96,12 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) { } else { is_last_whitespace = false; } - unsafe { out.as_mut_vec() }.push(data); + unsafe { out.as_mut_vec().unsafe_push(data) }; is_last_carriage_return = data == CARRIAGE_RETURN; is_last_carriage_return_line_feed = false; continue; } - Character::MultiByte { len } => { - let new_ind: usize = unsafe { ind.unchecked_add(len) }; - unsafe { out.as_mut_vec() } - .extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) }); - ind = new_ind; - } + Character::MultiByte { len } => extend_from_bytes_with_len(bytes, &mut ind, out, len), } is_last_carriage_return = false; is_last_whitespace = false; @@ -127,38 +109,16 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) { } // Implementation of str::trim_end() if 
is_last_carriage_return_line_feed { - let out_mut: &mut Vec = unsafe { out.as_mut_vec() }; - out_mut.pop(); - out_mut.pop(); + let new_out_len: usize = unsafe { out.len().unchecked_sub(2) }; + unsafe { out.as_mut_vec().set_len(new_out_len) }; return; } - if is_last_whitespace { - let out_mut: &mut Vec = unsafe { out.as_mut_vec() }; - out_mut.pop(); - } + sift_trim_end(out, is_last_whitespace); } /// A utility for `sift_preserve_newlines`. fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) { - // Implementation of str::trim_start() - while *ind < bytes.len() { - match get_char_metadata(*unsafe { bytes.get_unchecked(*ind) }) { - Character::SingleByte { data } => { - *ind = unsafe { ind.unchecked_add(1) }; - if !is_ascii_whitespace(data) { - unsafe { out.as_mut_vec() }.push(data); - break; - } - } - Character::MultiByte { len } => { - let new_ind: usize = unsafe { ind.unchecked_add(len) }; - unsafe { out.as_mut_vec() } - .extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) }); - *ind = new_ind; - break; - } - } - } + sift_trim_start(bytes, ind, out); // Actual sifting let mut is_last_whitespace: bool = false; let mut is_last_carriage_return: bool = false; @@ -171,13 +131,14 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri // Implementation of str::trim_end() let out_mut: &mut Vec = unsafe { out.as_mut_vec() }; if is_last_whitespace { - out_mut.pop(); + let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) }; + unsafe { out_mut.set_len(new_out_mut_len) }; } // Append newline if is_last_carriage_return { - out_mut.push(CARRIAGE_RETURN); + unsafe { out_mut.unsafe_push(CARRIAGE_RETURN) }; } - out_mut.push(LINE_FEED); + unsafe { out_mut.unsafe_push(LINE_FEED) }; return; } is_last_carriage_return = data == CARRIAGE_RETURN; @@ -188,23 +149,42 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri } else { is_last_whitespace = false; } - unsafe { 
out.as_mut_vec() }.push(data); + unsafe { out.as_mut_vec().unsafe_push(data) }; is_last_carriage_return = data == CARRIAGE_RETURN; continue; } - Character::MultiByte { len } => { - let new_ind: usize = unsafe { ind.unchecked_add(len) }; - unsafe { out.as_mut_vec() } - .extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) }); - *ind = new_ind; - } + Character::MultiByte { len } => extend_from_bytes_with_len(bytes, ind, out, len), } is_last_carriage_return = false; is_last_whitespace = false; } - // Implementation of str::trim_end() + sift_trim_end(out, is_last_whitespace); +} + +/// A custom implementation of `str::trim_start`. +fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) { + while *ind < bytes.len() { + match get_char_metadata(*unsafe { bytes.get_unchecked(*ind) }) { + Character::SingleByte { data } => { + *ind = unsafe { ind.unchecked_add(1) }; + if !is_ascii_whitespace(data) { + unsafe { out.as_mut_vec().unsafe_push(data) }; + break; + } + } + Character::MultiByte { len } => { + extend_from_bytes_with_len(bytes, ind, out, len); + break; + } + } + } +} + +/// A custom implementation for `str::trim_end`. +fn sift_trim_end(out: &mut String, is_last_whitespace: bool) { if is_last_whitespace { - unsafe { out.as_mut_vec() }.pop(); + let new_out_len: usize = unsafe { out.len().unchecked_sub(1) }; + unsafe { out.as_mut_vec().set_len(new_out_len) }; } } @@ -244,5 +224,15 @@ const fn is_ascii_whitespace(codepoint: u8) -> bool { ) } +/// A function mostly used for `Character::MultiByte` copying. 
+fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
+    let new_ind: usize = unsafe { ind.unchecked_add(len) };
+    unsafe {
+        out.as_mut_vec()
+            .unsafe_extend(bytes.get_unchecked(*ind..new_ind));
+    }
+    *ind = new_ind;
+}
+
 #[cfg(test)]
 mod tests;
diff --git a/src/tests.rs b/src/tests.rs
index 7802662..c46e838 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -32,19 +32,12 @@ fn test_sift_preserved() {
         &input.sift_preserve_newlines(),
         "This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
     );
-    let input: String = format!(
-        "{}\n\n{}\n\n{}\n\n\n{}\r\n\n\r\n{}\r\n\r\n{}\r\n\r\n\r\n",
-        "This. \n\nis. \n\na. \n\nsentence... \n\n",
-        "With. \n\nsome. \n\nduplicate... \n\n",
-        "Whitespaces. \n\n",
-        "This. \r\n\r\nis. \r\n\r\na. \r\n\r\nsentence... \r\n\r\n",
-        "With. \r\n\r\nsome. \r\n\r\nduplicate... \r\n\r\n",
-        "Whitespaces."
-    );
-    assert_eq!(
-        &input.sift_preserve_newlines(),
-        "This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
-    );
+}
+
+#[test]
+fn test_blank_string_sifting() {
+    assert_eq!(&"".sift(), "");
+    assert_eq!(&"".sift_preserve_newlines(), "");
 }
 
 #[test]
diff --git a/src/unsafe_vec.rs b/src/unsafe_vec.rs
new file mode 100644
index 0000000..6edd74d
--- /dev/null
+++ b/src/unsafe_vec.rs
@@ -0,0 +1,33 @@
+/// A trait containing all unsafe `Vec` functions used by this crate.
+pub(crate) trait UnsafeVec<T> {
+    /// Push to a `Vec` without checking the capacity.
+    ///
+    /// # Safety
+    ///
+    /// The caller must guarantee `self.len() < self.capacity()`.
+    unsafe fn unsafe_push(&mut self, item: T);
+
+    /// Extend to a `Vec` without checking the capacity.
+    ///
+    /// # Safety
+    ///
+    /// The caller must guarantee `self.len() + item.len() <= self.capacity()`.
+    /// Elements are copied bitwise, so `T` must tolerate being duplicated that way.
+    unsafe fn unsafe_extend(&mut self, item: &[T]);
+}
+
+impl<T> UnsafeVec<T> for Vec<T> {
+    unsafe fn unsafe_push(&mut self, item: T) {
+        // Write into the first spare slot, then make it visible via the length.
+        std::ptr::write(self.as_mut_ptr().add(self.len()), item);
+        self.set_len(self.len().unchecked_add(1));
+    }
+
+    unsafe fn unsafe_extend(&mut self, item: &[T]) {
+        // The destination is the end of the initialized elements; writing at the
+        // start of the buffer would overwrite them instead of appending.
+        let spare: *mut T = self.as_mut_ptr().add(self.len());
+        std::ptr::copy_nonoverlapping(item.as_ptr(), spare, item.len());
+        self.set_len(self.len().unchecked_add(item.len()));
+    }
+}