diff --git a/Cargo.toml b/Cargo.toml
index 1584933..2f759d6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "whitespace-sifter"
-version = "2.2.0"
+version = "2.3.0"
edition = "2021"
authors = ["JumperBot_"]
description = "Sift duplicate whitespaces away!"
diff --git a/README.md b/README.md
index fb65009..da1dbde 100644
--- a/README.md
+++ b/README.md
@@ -32,15 +32,13 @@ println!(
## ✨ Sift Duplicate Whitespaces In One Function Call
This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`.
-Other than that, it naturally removes the whitespaces at the start and end of the `string`.
+It naturally removes the whitespaces at the start and end of the `string`.
---
## ⚡️Benchmarks
-Performance is one of the priorities of this crate.
-One of the advises is to not listen to repository authors/maintainers when it comes to benchmarks.
-You are free to run `cargo bench` on your machine after cloning this repository instead.
+Performance is a priority; most updates are performance improvements.
The benchmark uses a transcript of the [Bee Movie](https://movies.fandom.com/wiki/Bee_Movie/Transcript).
Execute these commands to benchmark:
@@ -54,13 +52,20 @@ $ cargo bench
You should only look for results that look like the following:
```bash
-Sift/Sift time: [176.65 µs 177.11 µs 177.73 µs]
+Sift/Sift time: [159.31 µs 159.60 µs 159.95 µs]
Sift Preserved/Sift Preserved
- time: [242.64 µs 243.04 µs 243.79 µs]
+ time: [198.11 µs 198.21 µs 198.32 µs]
```
In just 0.0001 seconds; Pretty impressive, no?
-Go try it on a better machine, I guess.
+
+Go try it on a better machine, I guess.
+Benchmark specifications:
+
+- Processor: Intel(R) Core(TM) i5-8350U CPU @ 1.70GHz 1.90 GHz
+- Memory: RAM 16.0 GB (15.8 GB usable)
+- System: GNU/Linux 5.15.153.1-microsoft-standard-WSL2 x86_64
+
---
diff --git a/src/lib.rs b/src/lib.rs
index 83aa9c4..f327526 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,6 @@
//! Sift duplicate whitespaces away in just one function call.
//! This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`.
-//! Other than that, it naturally removes the whitespaces at the start and end of the `string`.
+//! It naturally removes the whitespaces at the start and end of the `string`.
//!
//! # Examples
//!
@@ -20,6 +20,10 @@
//! );
//! ```
+mod unsafe_vec;
+
+use unsafe_vec::UnsafeVec;
+
/// A trait containing all `string` whitespace-sifting functions.
pub trait WhitespaceSifter: AsRef {
/// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef`.
@@ -46,16 +50,16 @@ pub trait WhitespaceSifter: AsRef {
while ind < bytes.len() {
crate::sift_preallocated_until_newline(bytes, &mut ind, &mut out);
}
- let out_mut: &mut Vec = unsafe { out.as_mut_vec() };
- if out_mut.len() > 1 {
- if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(2)) } == CARRIAGE_RETURN
- {
- out_mut.pop();
- out_mut.pop();
+ if out.len() > 1 {
+ let out_mut: &mut Vec = unsafe { out.as_mut_vec() };
+ let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(2) };
+ if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == CARRIAGE_RETURN {
+ unsafe { out_mut.set_len(new_out_mut_len) };
return out;
}
- if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(1)) } == LINE_FEED {
- out_mut.pop();
+ let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) };
+ if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == LINE_FEED {
+ unsafe { out_mut.set_len(new_out_mut_len) };
}
}
out
@@ -67,25 +71,7 @@ impl> WhitespaceSifter for T {}
/// A utility for `sift`.
fn sift_preallocated(bytes: &[u8], out: &mut String) {
let mut ind: usize = 0;
- // Implementation of str::trim_start()
- while ind < bytes.len() {
- match get_char_metadata(*unsafe { bytes.get_unchecked(ind) }) {
- Character::SingleByte { data } => {
- ind = unsafe { ind.unchecked_add(1) };
- if !is_ascii_whitespace(data) {
- unsafe { out.as_mut_vec() }.push(data);
- break;
- }
- }
- Character::MultiByte { len } => {
- let new_ind: usize = unsafe { ind.unchecked_add(len) };
- unsafe { out.as_mut_vec() }
- .extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) });
- ind = new_ind;
- break;
- }
- }
- }
+ sift_trim_start(bytes, &mut ind, out);
// Actual sifting
let mut is_last_whitespace: bool = false;
let mut is_last_carriage_return: bool = false;
@@ -96,8 +82,9 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
ind = unsafe { ind.unchecked_add(1) };
if is_ascii_whitespace(data) {
if data == LINE_FEED && is_last_carriage_return {
- #[allow(clippy::cast_possible_truncation)]
- unsafe { out.as_mut_vec() }.push(LINE_FEED);
+ unsafe {
+ out.as_mut_vec().unsafe_push(LINE_FEED);
+ }
is_last_carriage_return = false;
is_last_carriage_return_line_feed = true;
continue;
@@ -109,17 +96,12 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
} else {
is_last_whitespace = false;
}
- unsafe { out.as_mut_vec() }.push(data);
+ unsafe { out.as_mut_vec().unsafe_push(data) };
is_last_carriage_return = data == CARRIAGE_RETURN;
is_last_carriage_return_line_feed = false;
continue;
}
- Character::MultiByte { len } => {
- let new_ind: usize = unsafe { ind.unchecked_add(len) };
- unsafe { out.as_mut_vec() }
- .extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) });
- ind = new_ind;
- }
+ Character::MultiByte { len } => extend_from_bytes_with_len(bytes, &mut ind, out, len),
}
is_last_carriage_return = false;
is_last_whitespace = false;
@@ -127,38 +109,16 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
}
// Implementation of str::trim_end()
if is_last_carriage_return_line_feed {
- let out_mut: &mut Vec = unsafe { out.as_mut_vec() };
- out_mut.pop();
- out_mut.pop();
+ let new_out_len: usize = unsafe { out.len().unchecked_sub(2) };
+ unsafe { out.as_mut_vec().set_len(new_out_len) };
return;
}
- if is_last_whitespace {
- let out_mut: &mut Vec = unsafe { out.as_mut_vec() };
- out_mut.pop();
- }
+ sift_trim_end(out, is_last_whitespace);
}
/// A utility for `sift_preserve_newlines`.
fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
- // Implementation of str::trim_start()
- while *ind < bytes.len() {
- match get_char_metadata(*unsafe { bytes.get_unchecked(*ind) }) {
- Character::SingleByte { data } => {
- *ind = unsafe { ind.unchecked_add(1) };
- if !is_ascii_whitespace(data) {
- unsafe { out.as_mut_vec() }.push(data);
- break;
- }
- }
- Character::MultiByte { len } => {
- let new_ind: usize = unsafe { ind.unchecked_add(len) };
- unsafe { out.as_mut_vec() }
- .extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) });
- *ind = new_ind;
- break;
- }
- }
- }
+ sift_trim_start(bytes, ind, out);
// Actual sifting
let mut is_last_whitespace: bool = false;
let mut is_last_carriage_return: bool = false;
@@ -171,13 +131,14 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri
// Implementation of str::trim_end()
let out_mut: &mut Vec = unsafe { out.as_mut_vec() };
if is_last_whitespace {
- out_mut.pop();
+ let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) };
+ unsafe { out_mut.set_len(new_out_mut_len) };
}
// Append newline
if is_last_carriage_return {
- out_mut.push(CARRIAGE_RETURN);
+ unsafe { out_mut.unsafe_push(CARRIAGE_RETURN) };
}
- out_mut.push(LINE_FEED);
+ unsafe { out_mut.unsafe_push(LINE_FEED) };
return;
}
is_last_carriage_return = data == CARRIAGE_RETURN;
@@ -188,23 +149,42 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri
} else {
is_last_whitespace = false;
}
- unsafe { out.as_mut_vec() }.push(data);
+ unsafe { out.as_mut_vec().unsafe_push(data) };
is_last_carriage_return = data == CARRIAGE_RETURN;
continue;
}
- Character::MultiByte { len } => {
- let new_ind: usize = unsafe { ind.unchecked_add(len) };
- unsafe { out.as_mut_vec() }
- .extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) });
- *ind = new_ind;
- }
+ Character::MultiByte { len } => extend_from_bytes_with_len(bytes, ind, out, len),
}
is_last_carriage_return = false;
is_last_whitespace = false;
}
- // Implementation of str::trim_end()
+ sift_trim_end(out, is_last_whitespace);
+}
+
+/// Hand-rolled counterpart of `str::trim_start`.
+///
+/// Advances `*ind` past every leading ASCII-whitespace byte of `bytes`. The first
+/// character that is not ASCII whitespace is copied into `out` and `*ind` is left
+/// just after it; if `bytes` runs out first, `out` is left untouched.
+///
+/// NOTE(review): the copy goes through `unsafe_push` / `extend_from_bytes_with_len`,
+/// neither of which checks capacity, so `out` presumably already has spare room for
+/// the copied character -- confirm at the call sites.
+fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
+    while *ind < bytes.len() {
+        // SAFETY: the loop condition guarantees `*ind < bytes.len()`.
+        let lead: u8 = *unsafe { bytes.get_unchecked(*ind) };
+        let data: u8 = match get_char_metadata(lead) {
+            Character::MultiByte { len } => {
+                // A multi-byte character is never skipped: copy it and stop trimming.
+                extend_from_bytes_with_len(bytes, ind, out, len);
+                return;
+            }
+            Character::SingleByte { data } => data,
+        };
+        // SAFETY: `*ind < bytes.len()`, so adding one cannot overflow `usize`.
+        *ind = unsafe { ind.unchecked_add(1) };
+        if !is_ascii_whitespace(data) {
+            // SAFETY: relies on `out` having spare capacity (see the note above).
+            unsafe { out.as_mut_vec().unsafe_push(data) };
+            return;
+        }
+    }
+}
+
+/// A custom implementation for `str::trim_end`.
+///
+/// Shortens `out` by exactly one byte when `is_last_whitespace` is `true`;
+/// leaves `out` unchanged otherwise.
+///
+/// NOTE(review): this is only sound if `out` is non-empty and its final byte is a
+/// complete one-byte character whenever the flag is set -- confirm against callers.
+fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
if is_last_whitespace {
- unsafe { out.as_mut_vec() }.pop();
+ // `unchecked_sub` skips the underflow check and `set_len` only lowers the length;
+ // the dropped byte stays in the allocation.
+ let new_out_len: usize = unsafe { out.len().unchecked_sub(1) };
+ unsafe { out.as_mut_vec().set_len(new_out_len) };
}
}
@@ -244,5 +224,15 @@ const fn is_ascii_whitespace(codepoint: u8) -> bool {
)
}
+/// Copies the `len` bytes starting at `bytes[*ind]` onto the end of `out` and moves
+/// `*ind` past them. Mostly used for `Character::MultiByte` copying.
+///
+/// NOTE(review): nothing here is bounds- or capacity-checked. This presumably relies
+/// on `*ind + len <= bytes.len()` (valid UTF-8 input) and on `out` having at least
+/// `len` spare bytes of capacity -- confirm at the call sites.
+fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
+    let start: usize = *ind;
+    // SAFETY: assumes `start + len` stays within `bytes` (see the note above).
+    let end: usize = unsafe { start.unchecked_add(len) };
+    // SAFETY: same assumption -- `start..end` must lie inside `bytes`.
+    let chunk: &[u8] = unsafe { bytes.get_unchecked(start..end) };
+    // SAFETY: assumes `out` has room for `chunk` and that `chunk` is whole characters.
+    unsafe { out.as_mut_vec().unsafe_extend(chunk) };
+    *ind = end;
+}
+
#[cfg(test)]
mod tests;
diff --git a/src/tests.rs b/src/tests.rs
index 7802662..c46e838 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -32,19 +32,12 @@ fn test_sift_preserved() {
&input.sift_preserve_newlines(),
"This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
);
- let input: String = format!(
- "{}\n\n{}\n\n{}\n\n\n{}\r\n\n\r\n{}\r\n\r\n{}\r\n\r\n\r\n",
- "This. \n\nis. \n\na. \n\nsentence... \n\n",
- "With. \n\nsome. \n\nduplicate... \n\n",
- "Whitespaces. \n\n",
- "This. \r\n\r\nis. \r\n\r\na. \r\n\r\nsentence... \r\n\r\n",
- "With. \r\n\r\nsome. \r\n\r\nduplicate... \r\n\r\n",
- "Whitespaces."
- );
- assert_eq!(
- &input.sift_preserve_newlines(),
- "This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
- );
+}
+
+#[test]
+fn test_blank_string_sifting() {
+    // An empty input has nothing to trim or copy; both entry points must return "".
+    let sifted = "".sift();
+    assert_eq!(&sifted, "");
+    let sifted_preserved = "".sift_preserve_newlines();
+    assert_eq!(&sifted_preserved, "");
}
#[test]
diff --git a/src/unsafe_vec.rs b/src/unsafe_vec.rs
new file mode 100644
index 0000000..6edd74d
--- /dev/null
+++ b/src/unsafe_vec.rs
@@ -0,0 +1,42 @@
+/// A trait containing all unsafe `Vec` functions used by this crate.
+///
+/// Both methods append without growing the buffer, so the caller must have
+/// reserved enough capacity up front.
+pub(crate) trait UnsafeVec<T> {
+    /// Push to a `Vec` without checking the capacity.
+    ///
+    /// # Safety
+    ///
+    /// `self.len() < self.capacity()` must hold; otherwise the write lands outside
+    /// the allocation.
+    unsafe fn unsafe_push(&mut self, item: T);
+
+    /// Extend a `Vec` with a copy of `item` without checking the capacity.
+    ///
+    /// # Safety
+    ///
+    /// * `self.len() + item.len() <= self.capacity()` must hold.
+    /// * The elements are duplicated bitwise, so `T` must be safe to copy that way
+    ///   (e.g. `u8`); the caller is responsible for not creating double ownership.
+    unsafe fn unsafe_extend(&mut self, item: &[T]);
+}
+
+impl<T> UnsafeVec<T> for Vec<T> {
+    unsafe fn unsafe_push(&mut self, item: T) {
+        let len: usize = self.len();
+        // SAFETY: the caller guarantees `len < capacity`, so slot `len` is allocated
+        // but uninitialised; `ptr::write` fills it without dropping stale contents.
+        std::ptr::write(self.as_mut_ptr().add(len), item);
+        self.set_len(len.unchecked_add(1));
+    }
+
+    unsafe fn unsafe_extend(&mut self, item: &[T]) {
+        let len: usize = self.len();
+        // Append after the initialised prefix. Copying to `as_mut_ptr()` itself would
+        // overwrite the start of the vector while `set_len` exposes uninitialised tail.
+        // SAFETY: the caller guarantees `len + item.len() <= capacity`; `item` is a shared
+        // borrow and `self` an exclusive one, so the two regions cannot overlap.
+        std::ptr::copy_nonoverlapping(item.as_ptr(), self.as_mut_ptr().add(len), item.len());
+        self.set_len(len.unchecked_add(item.len()));
+    }
+}