Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Combine Inline and Packed Variants #49

Merged
merged 11 commits into from
Feb 9, 2022
11 changes: 4 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ name: CI

env:
RUSTFLAGS: "-D warnings"
PROPTEST_CASES: 10000

jobs:
check:
Expand All @@ -29,8 +30,6 @@ jobs:
test:
name: cargo test
runs-on: ubuntu-latest
env:
PROPTEST_CASES: 500
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
Expand All @@ -41,13 +40,11 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --all-features
args: --release --all-features -- --include-ignored

test-nightly:
name: cargo test nightly
runs-on: ubuntu-latest
env:
PROPTEST_CASES: 500
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
Expand All @@ -58,7 +55,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --all-features
args: --release --all-features -- --include-ignored

miri:
name: cargo miri test
Expand All @@ -73,7 +70,7 @@ jobs:
components: miri
- name: Run Miri
run: |
cargo miri test
cargo miri test --all-features

fuzz-creation:
name: fuzz creation
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/msrv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ on:
name: MSRV

env:
# local default for proptest is 100
PROPTEST_CASES: 1000
RUSTFLAGS: "-D warnings"

jobs:
Expand Down
4 changes: 4 additions & 0 deletions compact_str/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,7 @@ harness = false
[[bench]]
name = "comparison"
harness = false

[[bench]]
name = "random"
harness = false
37 changes: 37 additions & 0 deletions compact_str/benches/random.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//! Random benchmarks to determine if one bit of code is faster than another

use criterion::{
black_box,
criterion_group,
criterion_main,
Criterion,
};

fn if_statement_min(c: &mut Criterion) {
let mask = 192;
let vals: [u8; 4] = [0, 46, 202, 255];
c.bench_function("if statement min", |b| {
b.iter(|| {
for x in vals {
let len = if x >= mask { (x & !mask) as usize } else { 24 };
black_box(len);
}
})
});
}

fn cmp_min(c: &mut Criterion) {
let mask = 192;
let vals: [u8; 4] = [0, 46, 202, 255];
c.bench_function("cmp min", |b| {
b.iter(|| {
for x in vals {
let len = core::cmp::min(x.wrapping_sub(mask), 24);
black_box(len);
}
})
});
}

criterion_group!(random, if_statement_min, cmp_min);
criterion_main!(random);
18 changes: 4 additions & 14 deletions compact_str/fuzz/fuzz_targets/creation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ fuzz_target!(|method: CreationMethod<'_>| {
assert_eq!(compact, word);

// assert the CompactStr is properly allocated
if word.len() < MAX_INLINE_LENGTH {
assert!(!compact.is_heap_allocated());
} else if word.len() == MAX_INLINE_LENGTH && word.as_bytes()[0] <= 127 {
if word.len() <= MAX_INLINE_LENGTH {
assert!(!compact.is_heap_allocated());
} else {
assert!(compact.is_heap_allocated());
Expand All @@ -43,10 +41,7 @@ fuzz_target!(|method: CreationMethod<'_>| {
assert_eq!(compact, std_str);

// assert the CompactStr is properly allocated
//
// Note: Creating a CompactStr from an iterator doesn't yet support the Packed
// representation, so we can only inline MAX_INLINE_LENGTH - 1
if std_str.len() < MAX_INLINE_LENGTH {
if std_str.len() <= MAX_INLINE_LENGTH {
assert!(!compact.is_heap_allocated());
} else {
assert!(compact.is_heap_allocated());
Expand All @@ -60,10 +55,7 @@ fuzz_target!(|method: CreationMethod<'_>| {
assert_eq!(compact, std_str);

// assert the CompactStr is properly allocated
//
// Note: Creating a CompactStr from an iterator doesn't yet support the Packed
// representation, so we can only inline MAX_INLINE_LENGTH - 1
if std_str.len() < MAX_INLINE_LENGTH {
if std_str.len() <= MAX_INLINE_LENGTH {
assert!(!compact.is_heap_allocated());
} else {
assert!(compact.is_heap_allocated());
Expand All @@ -81,9 +73,7 @@ fuzz_target!(|method: CreationMethod<'_>| {
assert_eq!(c, s);

// assert the CompactStr is properly allocated
if s.len() < MAX_INLINE_LENGTH {
assert!(!c.is_heap_allocated());
} else if s.len() == MAX_INLINE_LENGTH && s.as_bytes()[0] <= 127 {
if s.len() <= MAX_INLINE_LENGTH {
assert!(!c.is_heap_allocated());
} else {
assert!(c.is_heap_allocated());
Expand Down
6 changes: 2 additions & 4 deletions compact_str/src/features/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ mod test {

use crate::CompactStr;

const MAX_INLINED_SIZE: usize = core::mem::size_of::<String>();
const MAX_SIZE: usize = core::mem::size_of::<String>();

// generates random unicode strings, upto 80 chars long
fn rand_unicode() -> impl Strategy<Value = String> {
Expand All @@ -98,9 +98,7 @@ mod test {
let mut buf = Cursor::new(word.as_bytes());
let compact = CompactStr::from_utf8_buf(&mut buf).unwrap();

if word.len() < MAX_INLINED_SIZE {
prop_assert!(!compact.is_heap_allocated())
} else if word.len() == MAX_INLINED_SIZE && word.as_bytes()[0] <= 127 {
if word.len() <= MAX_SIZE {
prop_assert!(!compact.is_heap_allocated())
} else {
prop_assert!(compact.is_heap_allocated())
Expand Down
10 changes: 5 additions & 5 deletions compact_str/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
//! 3. A `usize` denoting the capacity of the string
//!
//! On 64-bit architectures this results in 24 bytes being stored on the stack (12 bytes for 32-bit
//! architectures). For small strings, e.g. <= 23 characters, instead of storing a pointer, length,
//! architectures). For small strings, e.g. <= 24 characters, instead of storing a pointer, length,
//! and capacity on the stack, you store the string itself! This avoids the need to heap allocate
//! which reduces the amount of memory used, and improves performance.

Expand Down Expand Up @@ -165,7 +165,7 @@ impl CompactStr {
/// ```
/// # use compact_str::CompactStr;
/// let empty = CompactStr::with_capacity(0);
/// let min_size = std::mem::size_of::<String>() - 1;
/// let min_size = std::mem::size_of::<String>();
///
/// assert_eq!(empty.capacity(), min_size);
/// assert_ne!(0, min_size);
Expand All @@ -175,10 +175,10 @@ impl CompactStr {
/// ### Heap Allocating
/// ```
/// # use compact_str::CompactStr;
/// // If you create a `CompactStr` with a capacity greater than or equal to
/// // If you create a `CompactStr` with a capacity greater than
/// // `std::mem::size_of::<String>`, it will heap allocated
///
/// let heap_size = std::mem::size_of::<String>();
/// let heap_size = std::mem::size_of::<String>() + 1;
/// let empty = CompactStr::with_capacity(heap_size);
///
/// assert_eq!(empty.capacity(), heap_size);
Expand Down Expand Up @@ -316,7 +316,7 @@ impl CompactStr {
/// ```
#[inline]
pub fn as_bytes(&self) -> &[u8] {
self.repr.as_slice()
&self.repr.as_slice()[..self.len()]
}

// TODO: Implement a `try_as_mut_slice(...)` that will fail if it results in cloning?
Expand Down
125 changes: 78 additions & 47 deletions compact_str/src/repr/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,18 @@ use bytes::Buf;

use super::{
Repr,
HEAP_MASK,
MAX_SIZE,
};

#[cfg(target_pointer_width = "32")]
const DEFAULT_TEXT: &str = "000000000000";
#[cfg(target_pointer_width = "64")]
const DEFAULT_TEXT: &str = "000000000000000000000000";

const DEFAULT_PACKED: Repr = Repr::new_const(DEFAULT_TEXT);

impl Repr {
/// Converts a buffer of bytes to a `Repr`,
pub fn from_utf8_buf<B: Buf>(buf: &mut B) -> Result<Self, Utf8Error> {
// SAFETY: We check below to make sure the provided buffer is valid UTF-8
let repr = unsafe { Self::from_utf8_buf_unchecked(buf) };
let (repr, bytes_written) = unsafe { Self::collect_buf(buf) };

// Check to make sure the provided bytes are valid UTF-8, return the Repr if they are!
match core::str::from_utf8(repr.as_slice()) {
match core::str::from_utf8(&repr.as_slice()[..bytes_written]) {
Ok(_) => Ok(repr),
Err(e) => Err(e),
}
Expand All @@ -32,46 +26,53 @@ impl Repr {
/// # Safety
/// The provided buffer must be valid UTF-8
pub unsafe fn from_utf8_buf_unchecked<B: Buf>(buf: &mut B) -> Self {
let size = buf.remaining();
let chunk = buf.chunk();

// Check to make sure we're not empty, so accessing the first byte below doesn't panic
if chunk.is_empty() {
// If the chunk is empty, then we should have 0 remaining bytes
debug_assert_eq!(size, 0);
return super::EMPTY;
let (repr, _bytes_written) = Self::collect_buf(buf);
repr
}

unsafe fn collect_buf<B: Buf>(buf: &mut B) -> (Self, usize) {
// Get an empty Repr we can write into
let mut repr = super::EMPTY;
let mut bytes_written = 0;

debug_assert_eq!(repr.len(), bytes_written);

while buf.has_remaining() {
let chunk = buf.chunk();
let chunk_len = chunk.len();

// There's an edge case where the final byte of this buffer == `HEAP_MASK`, which is
// invalid UTF-8, but would result in us creating an inline variant, that identifies as
// a heap variant. If a user ever tried to reference the data at all, we'd incorrectly
// try and read data from an invalid memory address, causing undefined behavior.
if bytes_written < MAX_SIZE && bytes_written + chunk_len == MAX_SIZE {
let last_byte = chunk[chunk_len - 1];
// If we hit the edge case, reserve additional space to make the repr becomes heap
// allocated, which prevents us from writing this last byte inline
if last_byte == HEAP_MASK {
repr.reserve(MAX_SIZE + 1);
}
}

// reserve at least enough space to fit this chunk
repr.reserve(chunk_len);

// SAFETY: The caller is responsible for making sure the provided buffer is UTF-8. This
// invariant is documented in the public API
let slice = repr.as_mut_slice();
// write the chunk into the Repr
slice[bytes_written..bytes_written + chunk_len].copy_from_slice(chunk);

// Set the length of the Repr
// SAFETY: We just wrote an additional `chunk_len` bytes into the Repr
bytes_written += chunk_len;
repr.set_len(bytes_written);

// advance the pointer of the buffer
buf.advance(chunk_len);
}
let first_byte = buf.chunk()[0];

// Get an "empty" Repr we can write into
//
// HACK: There currently isn't a way to provide an "empty" Packed repr, so we do this check
// and return a "default" Packed repr if the buffer can fit
let mut repr = if size == MAX_SIZE && first_byte <= 127 {
// Note: No need to reserve additional bytes here, because we know we can fit all
// remaining bytes of `buf` into a Packed repr
DEFAULT_PACKED
} else {
let mut default = super::EMPTY;
debug_assert_eq!(default.len(), 0);

// Reserve enough bytes, possibly allocating on the heap, to store the text
default.reserve(size);

default
};

// SAFETY: The caller is responsible for making sure the provided buffer is UTF-8. This
// invariant is documented in the public API
let slice = repr.as_mut_slice();
// Copy the bytes from the buffer into our Repr!
buf.copy_to_slice(&mut slice[..size]);

// Set the length of the Repr
// SAFETY: We just wrote `size` bytes into the Repr
repr.set_len(size);

repr
(repr, bytes_written)
}
}

Expand Down Expand Up @@ -124,6 +125,36 @@ mod test {
assert!(!repr.is_heap_allocated());
}

#[test]
fn test_fuzz_panic() {
let bytes = &[
255, 255, 255, 255, 255, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 12, 0, 0, 96,
];
let mut buf: Cursor<&[u8]> = Cursor::new(bytes);

assert!(Repr::from_utf8_buf(&mut buf).is_err());
}

#[test]
fn test_valid_repr_but_invalid_utf8() {
let bytes = &[
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 192,
];
let mut buf: Cursor<&[u8]> = Cursor::new(bytes);

assert!(Repr::from_utf8_buf(&mut buf).is_err());
}

#[test]
fn test_fake_heap_variant() {
let bytes = &[
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255,
];
let mut buf: Cursor<&[u8]> = Cursor::new(bytes);

assert!(Repr::from_utf8_buf(&mut buf).is_err());
}

#[test]
#[should_panic(expected = "Utf8Error")]
fn test_invalid_utf8() {
Expand Down
Loading