From 18363ea8065d0d29bf8bea12941e19597084d4fa Mon Sep 17 00:00:00 2001 From: kmaasrud Date: Wed, 15 Mar 2023 09:44:02 +0100 Subject: [PATCH] feat: add custom CowStr type Related issue: #20 --- src/attr.rs | 20 ++--- src/lib.rs | 6 +- src/string.rs | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 219 insertions(+), 16 deletions(-) create mode 100644 src/string.rs diff --git a/src/attr.rs b/src/attr.rs index 12a0b595..1220d3ae 100644 --- a/src/attr.rs +++ b/src/attr.rs @@ -45,18 +45,11 @@ impl<'s> AttributeValue<'s> { } fn extend(&mut self, s: &'s str) { - match &mut self.raw { - CowStr::Borrowed(prev) => { - if prev.is_empty() { - *prev = s; - } else { - self.raw = format!("{} {}", prev, s).into(); - } - } - CowStr::Owned(ref mut prev) => { - prev.push(' '); - prev.push_str(s); - } + if self.raw.is_empty() { + self.raw = s.into(); + } else { + self.raw.push(' '); + self.raw.push_str(s); } } } @@ -171,7 +164,8 @@ impl<'s> Attributes<'s> { if let Some(i) = attrs.iter().position(|(k, _)| *k == key) { let prev = &mut attrs[i].1; if key == "class" { - *prev = format!("{} {}", prev, val).into(); + prev.raw.push(' '); + prev.raw.push_str(&val.raw); } else { *prev = val; } diff --git a/src/lib.rs b/src/lib.rs index aa7623ab..53757240 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,7 +36,7 @@ //! let events = //! jotdown::Parser::new("a [link](https://example.com)").map(|e| match e { //! Event::Start(Link(dst, ty), attrs) => { -//! Event::Start(Link(dst.replace(".com", ".net").into(), ty), attrs) +//! Event::Start(Link(dst.replace(".com", ".net"), ty), attrs) //! } //! e => e, //! }); @@ -60,13 +60,13 @@ mod block; mod inline; mod lex; mod span; +mod string; mod tree; use span::Span; pub use attr::{AttributeValue, AttributeValueParts, Attributes}; - -type CowStr<'s> = std::borrow::Cow<'s, str>; +pub use string::CowStr; /// A trait for rendering [`Event`]s to an output format. 
// src/string.rs: a compact copy-on-write string that can also hold short strings inline.

use std::{borrow::Borrow, fmt::Display, ops::Deref, str::from_utf8};

// Largest CowStr variant is Owned(String). A String uses 3 words of memory, but a fourth word is
// needed to hold the tag (the tag takes a byte, but a full word is used for alignment reasons).
// This means that the space available for an inline string is 4 words minus 2 bytes: one byte
// for the tag and one byte for the length.
const MAX_INLINE_STR_LEN: usize = 4 * std::mem::size_of::<usize>() - 2;

/// A string that is either heap-allocated, borrowed, or stored inline.
///
/// All variants dereference to `str`, and equality compares the string contents only, so two
/// values holding the same text in different variants are equal.
#[derive(Debug, Eq)]
pub enum CowStr<'s> {
    /// A heap-allocated string.
    Owned(String),
    /// A string slice borrowed for `'s`.
    Borrowed(&'s str),
    /// A short string stored inline: the buffer and the number of bytes in use. The used prefix
    /// must always be valid UTF-8.
    Inlined([u8; MAX_INLINE_STR_LEN], u8),
}

/// Copies `piece` into `buf` starting at offset `len`.
///
/// Returns the new length, or `None` (leaving `buf` untouched) if `piece` does not fit.
fn inline_append(buf: &mut [u8; MAX_INLINE_STR_LEN], len: usize, piece: &str) -> Option<usize> {
    let end = len.checked_add(piece.len())?;
    buf.get_mut(len..end)?.copy_from_slice(piece.as_bytes());
    Some(end)
}

impl<'s> CowStr<'s> {
    /// Replaces every non-overlapping occurrence of `from` with `to`, scanning left to right
    /// exactly once (the same matching rules as `str::replace`).
    ///
    /// The value is returned unchanged when `from` is empty or does not occur. An `Owned` input
    /// stays `Owned`. For other inputs the result is stored inline when it fits in
    /// `MAX_INLINE_STR_LEN` bytes and is heap-allocated otherwise.
    pub fn replace(self, from: &str, to: &str) -> Self {
        // `contains` is `str::contains`, reached through `Deref`.
        if from.is_empty() || !self.contains(from) {
            return self;
        }

        if let CowStr::Owned(s) = &self {
            return CowStr::Owned(s.replace(from, to));
        }

        // Go through `&str` explicitly: calling `.replace` on `self` would recurse into this
        // method instead of reaching `str::replace`.
        let src: &str = &self;
        Self::replace_inline(src, from, to).unwrap_or_else(|| CowStr::Owned(src.replace(from, to)))
    }

    /// Builds the replaced string directly in an inline buffer.
    ///
    /// Returns `None` as soon as the result would exceed `MAX_INLINE_STR_LEN` bytes.
    fn replace_inline(src: &str, from: &str, to: &str) -> Option<Self> {
        let mut buf = [0u8; MAX_INLINE_STR_LEN];
        let mut len = 0;
        let mut rest = 0;

        // Matches are located in the *source*, never in the partially built output, so text
        // introduced by `to` is not matched again.
        for (start, hit) in src.match_indices(from) {
            len = inline_append(&mut buf, len, &src[rest..start])?;
            len = inline_append(&mut buf, len, to)?;
            rest = start + hit.len();
        }
        len = inline_append(&mut buf, len, &src[rest..])?;

        // `len` is at most MAX_INLINE_STR_LEN here, so the cast is lossless.
        Some(CowStr::Inlined(buf, len as u8))
    }

    /// Appends a single character.
    ///
    /// See [`CowStr::push_str`] for how the storage variant is chosen.
    pub fn push(&mut self, c: char) {
        self.push_str(c.encode_utf8(&mut [0u8; 4]));
    }

    /// Appends `s`.
    ///
    /// Appending an empty string is a no-op. An `Owned` value grows in place. A `Borrowed` or
    /// `Inlined` value becomes `Inlined` when the combined length fits in
    /// `MAX_INLINE_STR_LEN` bytes and `Owned` otherwise.
    pub fn push_str(&mut self, s: &str) {
        if s.is_empty() {
            return;
        }

        match self {
            CowStr::Owned(this) => this.push_str(s),
            CowStr::Inlined(inner, len) => {
                let start = usize::from(*len);
                let end = start + s.len();
                if end > MAX_INLINE_STR_LEN {
                    *self = CowStr::Owned(self.to_string() + s);
                } else {
                    inner[start..end].copy_from_slice(s.as_bytes());
                    // `end` is at most MAX_INLINE_STR_LEN, so the cast is lossless.
                    *len = end as u8;
                }
            }
            CowStr::Borrowed(this) => {
                let split = this.len();
                let end = split + s.len();
                if end > MAX_INLINE_STR_LEN {
                    *self = CowStr::Owned(this.to_string() + s);
                } else {
                    let mut inner = [0u8; MAX_INLINE_STR_LEN];
                    inner[..split].copy_from_slice(this.as_bytes());
                    inner[split..end].copy_from_slice(s.as_bytes());
                    *self = CowStr::Inlined(inner, end as u8);
                }
            }
        }
    }
}

impl<'s> Deref for CowStr<'s> {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        match *self {
            Self::Owned(ref s) => s.borrow(),
            Self::Borrowed(s) => s,
            // NOTE: Inlined strings built by this module are only ever filled from `str` or
            // `char` data, so the used prefix is valid UTF-8. An unchecked conversion could be
            // considered, but a benchmark should be done before introducing unsafe code.
            Self::Inlined(ref inner, len) => from_utf8(&inner[..usize::from(len)])
                .expect("inlined CowStr must hold valid UTF-8"),
        }
    }
}

impl<'s> AsRef<str> for CowStr<'s> {
    fn as_ref(&self) -> &str {
        self.deref()
    }
}

impl<'s> From<char> for CowStr<'s> {
    /// Stores the character inline.
    fn from(value: char) -> Self {
        let mut inner = [0u8; MAX_INLINE_STR_LEN];
        let len = value.encode_utf8(&mut inner).len();
        CowStr::Inlined(inner, len as u8)
    }
}

impl<'s> From<&'s str> for CowStr<'s> {
    /// Borrows the slice without copying.
    fn from(value: &'s str) -> Self {
        CowStr::Borrowed(value)
    }
}

impl<'s> From<String> for CowStr<'s> {
    /// Takes ownership of the string without copying.
    fn from(value: String) -> Self {
        CowStr::Owned(value)
    }
}

impl<'s> Clone for CowStr<'s> {
    /// Clones the value. An `Owned` string short enough to fit inline is cloned into an
    /// `Inlined` value, which avoids a heap allocation.
    fn clone(&self) -> Self {
        match self {
            CowStr::Owned(s) => {
                let len = s.len();
                if len > MAX_INLINE_STR_LEN {
                    CowStr::Owned(s.clone())
                } else {
                    let mut inner = [0u8; MAX_INLINE_STR_LEN];
                    inner[..len].copy_from_slice(s.as_bytes());
                    CowStr::Inlined(inner, len as u8)
                }
            }
            CowStr::Borrowed(s) => CowStr::Borrowed(s),
            CowStr::Inlined(inner, len) => CowStr::Inlined(*inner, *len),
        }
    }
}

impl<'s> PartialEq for CowStr<'s> {
    /// Compares the string contents, regardless of which variant holds them.
    fn eq(&self, other: &Self) -> bool {
        self.deref() == other.deref()
    }
}

impl<'s> Display for CowStr<'s> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.deref())
    }
}

// `FromIterator` is named by its full path so this also builds on editions whose prelude does
// not export it.
impl<'s, 'a> std::iter::FromIterator<&'a str> for CowStr<'s> {
    /// Concatenates the pieces into an `Owned` string.
    fn from_iter<T: IntoIterator<Item = &'a str>>(iter: T) -> Self {
        CowStr::Owned(iter.into_iter().collect())
    }
}