From 831c2a95f996f1fc6a847f9f9b4843c69a985546 Mon Sep 17 00:00:00 2001 From: Marcus Griep Date: Sun, 19 Jun 2022 12:29:07 -0400 Subject: [PATCH 1/4] feat: add impl From for String Closes #117 --- compact_str/src/lib.rs | 6 ++ compact_str/src/repr/boxed/mod.rs | 17 ++++ compact_str/src/repr/heap.rs | 5 ++ compact_str/src/repr/mod.rs | 42 +++++++++ compact_str/src/tests.rs | 138 ++++++++++++++++++++++++++++++ 5 files changed, 208 insertions(+) diff --git a/compact_str/src/lib.rs b/compact_str/src/lib.rs index deccd4be..602caaf2 100644 --- a/compact_str/src/lib.rs +++ b/compact_str/src/lib.rs @@ -610,6 +610,12 @@ impl From> for CompactString { } } +impl From for String { + fn from(s: CompactString) -> Self { + s.repr.into_string() + } +} + impl FromStr for CompactString { type Err = core::convert::Infallible; fn from_str(s: &str) -> Result { diff --git a/compact_str/src/repr/boxed/mod.rs b/compact_str/src/repr/boxed/mod.rs index bf5844d0..06e77cd2 100644 --- a/compact_str/src/repr/boxed/mod.rs +++ b/compact_str/src/repr/boxed/mod.rs @@ -151,6 +151,23 @@ impl BoxString { } } + #[inline] + pub fn into_string(self) -> String { + // Ensure that we don't free the data we are transferring to `String` + let this = ManuallyDrop::new(self); + + // SAFETY: + // * The memory in `ptr` was previously allocated by the same allocator the standard library + // uses, with a required alignment of exactly 1. + // * `length` is less than or equal to capacity, due to internal invaraints. + // * `capacity` is correctly maintained internally. + // * `BoxString` only ever contains valid UTF-8. + // * `BoxString` capacity is always on the heap. + unsafe { + String::from_raw_parts(this.ptr.as_ptr(), this.len, this.cap.as_usize_unchecked()) + } + } + #[inline] pub fn from_box_str(b: Box) -> Self { match Capacity::new(b.len()) { diff --git a/compact_str/src/repr/heap.rs b/compact_str/src/repr/heap.rs index b4e3ee96..76e34df5 100644 --- a/compact_str/src/repr/heap.rs +++ b/compact_str/src/repr/heap.rs @@ -38,6 +38,11 @@ impl HeapString { HeapString { string } } + #[inline] + pub fn into_string(self) -> String { + self.string.into_string() + } + #[inline] pub fn from_box_str(b: Box) -> Self { let string = BoxString::from_box_str(b); diff --git a/compact_str/src/repr/mod.rs b/compact_str/src/repr/mod.rs index b630cc45..75b6c755 100644 --- a/compact_str/src/repr/mod.rs +++ b/compact_str/src/repr/mod.rs @@ -122,6 +122,22 @@ impl Repr { } } + #[inline] + pub fn into_string(self) -> String { + if self.capacity() == 0 { + String::new() + } else { + match self.cast_into() { + StrongIntoRepr::Inline(inline) => String::from(inline.as_str()), + StrongIntoRepr::Heap(heap) => { + // `HeapString::into_string()` takes ownership and + // is responsible for avoiding a double-free. + ManuallyDrop::into_inner(heap).into_string() + } + } + } + } + #[inline] pub fn from_box_str(b: Box) -> Self { if b.len() == 0 { @@ -272,6 +288,20 @@ impl Repr { } } + #[inline(always)] + fn cast_into(self) -> StrongIntoRepr { + match self.discriminant() { + Discriminant::Heap => { + // SAFETY: We checked the discriminant to make sure the union is `heap` + StrongIntoRepr::Heap(unsafe { self.into_union().heap }) + } + Discriminant::Inline => { + // SAFETY: We checked the discriminant to make sure the union is `inline` + StrongIntoRepr::Inline(unsafe { self.into_union().inline }) + } + } + } + #[inline(always)] const fn from_inline(repr: InlineString) -> Self { // SAFETY: An `InlineString` and `Repr` have the same size @@ -295,6 +325,12 @@ impl Repr { // SAFETY: An `ReprUnion` and `Repr` have the same size unsafe { &mut *(self as *mut _ as *mut _) } } + + #[inline(always)] + fn into_union(self) -> ReprUnion { + // SAFETY: An `ReprUnion` and `Repr` have the same size + unsafe { std::mem::transmute(self) } + } } impl Clone for Repr { @@ -501,6 +537,12 @@ impl<'a> MutStrongRepr<'a> { } } +#[derive(Debug)] +enum StrongIntoRepr { + Inline(InlineString), + Heap(ManuallyDrop), +} + crate::asserts::assert_size_eq!(ReprUnion, Repr, Option, String, Option); #[cfg(target_pointer_width = "64")] diff --git a/compact_str/src/tests.rs b/compact_str/src/tests.rs index 80c844ee..fa0b959a 100644 --- a/compact_str/src/tests.rs +++ b/compact_str/src/tests.rs @@ -160,6 +160,16 @@ fn proptest_reserve_and_write_bytes_allocated_properly(#[strategy(rand_unicode() prop_assert_eq!(compact.is_heap_allocated(), word.len() > MAX_SIZE); } +#[proptest] +#[cfg_attr(miri, ignore)] +fn proptest_arbitrary_compact_string_converts_to_string(#[strategy(rand_unicode())] word: String) { + let compact = CompactString::new(&word); + let result = String::from(compact); + + prop_assert_eq!(result.len(), word.len()); + prop_assert_eq!(result, word); +} + #[proptest] #[cfg_attr(miri, ignore)] fn proptest_extend_chars_allocated_properly( @@ -695,3 +705,131 @@ fn test_to_compact_string() { format_compact!("0{}67890123456789{}", "12345", 999999) ); } + +#[test] +fn test_into_string_large_string_with_excess_capacity() { + let mut string = String::with_capacity(128); + string.push_str("abcdefghijklmnopqrstuvwxyz"); + let str_addr = string.as_ptr(); + let str_len = string.len(); + let str_cap = string.capacity(); + + let compact = CompactString::from(string); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + assert_eq!(str_addr, new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_cap, new_str_cap); +} + +#[test] +fn test_into_string_small_string_with_excess_capacity() { + let mut string = String::with_capacity(128); + string.push_str("abcdef"); + let str_addr = string.as_ptr(); + let str_len = string.len(); + let str_cap = string.capacity(); + + let compact = CompactString::from(string); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + // If small boxed strings are eagerly compacted, the address and capacity assertions won't hold. + // Compaction is not eager, so these should hold. + assert_eq!(str_addr, new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_cap, new_str_cap); +} + +#[test] +fn test_into_string_small_string_with_no_excess_capacity() { + let string = String::from("abcdef"); + let str_addr = string.as_ptr(); + let str_len = string.len(); + let str_cap = string.capacity(); + + let compact = CompactString::from(string); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + // If small boxed strings are eagerly compacted, the address assertion won't hold. + // Compaction is not eager, so these should hold. + assert_eq!(str_addr, new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_cap, new_str_cap); +} + +#[test] +fn test_into_string_empty_string() { + let string = String::new(); + let str_addr = string.as_ptr(); + let str_len = string.len(); + let str_cap = string.capacity(); + + let compact = CompactString::from(string); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + assert_eq!(str_addr, new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_cap, new_str_cap); +} + +#[test] +fn test_into_string_small_str() { + let data = "abcdef"; + let str_addr = data.as_ptr(); + let str_len = data.len(); + + let compact = CompactString::from(data); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + assert_ne!(str_addr, new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_len, new_str_cap); +} + +#[test] +fn test_into_string_long_str() { + let data = "abcdefghijklmnopqrstuvwxyz"; + let str_addr = data.as_ptr(); + let str_len = data.len(); + + let compact = CompactString::from(data); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + assert_ne!(str_addr, new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_len, new_str_cap); +} + +#[test] +fn test_into_string_empty_str() { + let data = ""; + let str_len = data.len(); + + let compact = CompactString::from(data); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + assert_eq!(String::new().as_ptr(), new_str_addr); + assert_eq!(str_len, new_str_len); + assert_eq!(str_len, new_str_cap); +} From ce5fbd3f52bb44284f096b9d9f20add82055f933 Mon Sep 17 00:00:00 2001 From: Marcus Griep Date: Sun, 19 Jun 2022 12:54:53 -0400 Subject: [PATCH 2/4] fix: handling of boxed capacity on 32-bit arch --- compact_str/src/repr/boxed/mod.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/compact_str/src/repr/boxed/mod.rs b/compact_str/src/repr/boxed/mod.rs index 06e77cd2..b148f1cd 100644 --- a/compact_str/src/repr/boxed/mod.rs +++ b/compact_str/src/repr/boxed/mod.rs @@ -162,10 +162,7 @@ impl BoxString { // * `length` is less than or equal to capacity, due to internal invaraints. // * `capacity` is correctly maintained internally. // * `BoxString` only ever contains valid UTF-8. - // * `BoxString` capacity is always on the heap. - unsafe { - String::from_raw_parts(this.ptr.as_ptr(), this.len, this.cap.as_usize_unchecked()) - } + unsafe { String::from_raw_parts(this.ptr.as_ptr(), this.len, this.capacity()) } } #[inline] From 4c2e0263e241237d13664e47d09d99e5be13cf41 Mon Sep 17 00:00:00 2001 From: Marcus Griep Date: Mon, 20 Jun 2022 21:44:24 -0400 Subject: [PATCH 3/4] fix: properly handle the 32-bit case with boxed capacity --- compact_str/src/repr/boxed/mod.rs | 25 +++++++++++++++---------- compact_str/src/repr/mod.rs | 14 +++++++------- compact_str/src/tests.rs | 30 ++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/compact_str/src/repr/boxed/mod.rs b/compact_str/src/repr/boxed/mod.rs index b148f1cd..1f724e2d 100644 --- a/compact_str/src/repr/boxed/mod.rs +++ b/compact_str/src/repr/boxed/mod.rs @@ -153,16 +153,21 @@ impl BoxString { #[inline] pub fn into_string(self) -> String { - // Ensure that we don't free the data we are transferring to `String` - let this = ManuallyDrop::new(self); - - // SAFETY: - // * The memory in `ptr` was previously allocated by the same allocator the standard library - // uses, with a required alignment of exactly 1. - // * `length` is less than or equal to capacity, due to internal invaraints. - // * `capacity` is correctly maintained internally. - // * `BoxString` only ever contains valid UTF-8. - unsafe { String::from_raw_parts(this.ptr.as_ptr(), this.len, this.capacity()) } + match self.cap.as_usize() { + Ok(cap) => { + // Ensure that we don't free the data we are transferring the buffer to `String` + let this = ManuallyDrop::new(self); + + // SAFETY: + // * The memory in `ptr` was previously allocated by the same allocator the standard + // library uses, with a required alignment of exactly 1. + // * `length` is less than or equal to capacity, due to internal invaraints. + // * `capacity` is correctly maintained internally. + // * `BoxString` only ever contains valid UTF-8. + unsafe { String::from_raw_parts(this.ptr.as_ptr(), this.len, cap) } + } + Err(_) => String::from(self.as_str()), + } } #[inline] diff --git a/compact_str/src/repr/mod.rs b/compact_str/src/repr/mod.rs index 75b6c755..03400fc0 100644 --- a/compact_str/src/repr/mod.rs +++ b/compact_str/src/repr/mod.rs @@ -127,9 +127,9 @@ impl Repr { if self.capacity() == 0 { String::new() } else { - match self.cast_into() { - StrongIntoRepr::Inline(inline) => String::from(inline.as_str()), - StrongIntoRepr::Heap(heap) => { + match self.cast_into_owned() { + StrongOwnedRepr::Inline(inline) => String::from(inline.as_str()), + StrongOwnedRepr::Heap(heap) => { // `HeapString::into_string()` takes ownership and // is responsible for avoiding a double-free. ManuallyDrop::into_inner(heap).into_string() @@ -289,15 +289,15 @@ impl Repr { } #[inline(always)] - fn cast_into(self) -> StrongIntoRepr { + fn cast_into_owned(self) -> StrongOwnedRepr { match self.discriminant() { Discriminant::Heap => { // SAFETY: We checked the discriminant to make sure the union is `heap` - StrongIntoRepr::Heap(unsafe { self.into_union().heap }) + StrongOwnedRepr::Heap(unsafe { self.into_union().heap }) } Discriminant::Inline => { // SAFETY: We checked the discriminant to make sure the union is `inline` - StrongIntoRepr::Inline(unsafe { self.into_union().inline }) + StrongOwnedRepr::Inline(unsafe { self.into_union().inline }) } } } @@ -538,7 +538,7 @@ impl<'a> MutStrongRepr<'a> { } #[derive(Debug)] -enum StrongIntoRepr { +enum StrongOwnedRepr { Inline(InlineString), Heap(ManuallyDrop), } diff --git a/compact_str/src/tests.rs b/compact_str/src/tests.rs index fa0b959a..f76e6f2d 100644 --- a/compact_str/src/tests.rs +++ b/compact_str/src/tests.rs @@ -725,6 +725,36 @@ fn test_into_string_large_string_with_excess_capacity() { assert_eq!(str_cap, new_str_cap); } +#[test] +fn test_into_string_where_32_bit_capacity_is_on_heap() { + const SIXTEEN_MB: usize = 16 * 1024 * 1024; + let buf = vec![b'a'; SIXTEEN_MB - 1]; + // SAFETY: `buf` is filled with ASCII `a`s. + // This primarily speeds up miri, as we don't need to check every byte + // in the input buffer + let string = unsafe { String::from_utf8_unchecked(buf) }; + + let str_addr = string.as_ptr(); + let str_len = string.len(); + let str_cap = string.capacity(); + + let compact = CompactString::from(string); + let new_string = String::from(compact); + let new_str_addr = new_string.as_ptr(); + let new_str_len = new_string.len(); + let new_str_cap = new_string.capacity(); + + assert_eq!(str_len, new_str_len); + + if cfg!(target_pointer_width = "64") { + assert_eq!(str_addr, new_str_addr); + assert_eq!(str_cap, new_str_cap); + } else { + assert_eq!(&new_string.as_bytes()[0..10], b"aaaaaaaaaa"); + assert_eq!(str_len, new_str_cap); + } +} + #[test] fn test_into_string_small_string_with_excess_capacity() { let mut string = String::with_capacity(128); From 2eb7ce4c2f9016ce60fa583da05bfbde7403aca0 Mon Sep 17 00:00:00 2001 From: Marcus Griep Date: Tue, 21 Jun 2022 08:34:00 -0400 Subject: [PATCH 4/4] test: speed up miri test runs By using `String::from_utf8_unchecked` we significantly speed up miri test runs. This is because miri doesn't need to spend a ton of time verifying that every character in the ~16MiB vector is UTF-8, when we already know that it is, since it's always just ASCII `A`s. --- compact_str/src/repr/boxed/mod.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/compact_str/src/repr/boxed/mod.rs b/compact_str/src/repr/boxed/mod.rs index 1f724e2d..f5f1fb73 100644 --- a/compact_str/src/repr/boxed/mod.rs +++ b/compact_str/src/repr/boxed/mod.rs @@ -612,8 +612,10 @@ mod tests { fn test_32_bit_max_inline_cap() { // 65 is the ASCII value of 'A' // `SIXTEEN_MB - 2` is the max value we can store for capacity inline, when on 32-bit archs - let word_buf = vec![65; SIXTEEN_MB - 2]; - let string = String::from_utf8(word_buf).unwrap(); + let word_buf = vec![b'A'; SIXTEEN_MB - 2]; + // SAFETY: We've just created the vector above, and it contains only ASCII characters. + // This speeds up `miri` runs significantly. + let string = unsafe { String::from_utf8_unchecked(word_buf) }; let box_string = BoxString::new(&string); @@ -627,8 +629,10 @@ mod tests { fn test_32_bit_max_inline_cap_with_modification() { // 65 is the ASCII value of 'A' // `SIXTEEN_MB - 2` is the max value we can store for capacity inline, when on 32-bit archs - let word_buf = vec![65; SIXTEEN_MB - 2]; - let string = String::from_utf8(word_buf).unwrap(); + let word_buf = vec![b'A'; SIXTEEN_MB - 2]; + // SAFETY: We've just created the vector above, and it contains only ASCII characters. + // This speeds up `miri` runs significantly. + let string = unsafe { String::from_utf8_unchecked(word_buf) }; let mut box_string = BoxString::new(&string); @@ -668,8 +672,10 @@ mod tests { fn test_32_bit_min_heap_cap() { // 65 is the ASCII value of 'A' // `SIXTEEN_MB - 1` is the min value for capacity that gets stored on the heap - let word_buf = vec![65; SIXTEEN_MB - 1]; - let string = String::from_utf8(word_buf).unwrap(); + let word_buf = vec![b'A'; SIXTEEN_MB - 1]; + // SAFETY: We've just created the vector above, and it contains only ASCII characters. + // This speeds up `miri` runs significantly. + let string = unsafe { String::from_utf8_unchecked(word_buf) }; let mut box_string = BoxString::new(&string);