Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

features: bytes #40

Merged
merged 18 commits into from
Jan 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,21 @@ jobs:
run: |
cargo miri test

# CI job: runs the `examples/bytes` example crate via `cargo run` on the nightly toolchain.
example-bytes:
name: example - bytes
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: nightly
override: true
- uses: actions-rs/cargo@v1
with:
command: run
args: --manifest-path examples/bytes/Cargo.toml

example-serde:
name: example - serde
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
/target
Cargo.lock
**/target
**/Cargo.lock
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[workspace]
members = ["examples/serde", "compact_str", "tracing_alloc"]
members = ["examples/bytes", "examples/serde", "compact_str", "tracing_alloc"]
1 change: 1 addition & 0 deletions compact_str/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ categories = ["encoding", "parsing", "memory-management", "text-processing"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
bytes = { version = "1", optional = true }
serde = { version = "1", optional = true }
static_assertions = "1"

Expand Down
125 changes: 125 additions & 0 deletions compact_str/src/features/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
use core::str::Utf8Error;

use bytes::Buf;

use crate::{
CompactStr,
Repr,
};

impl CompactStr {
    /// Builds a `CompactStr` out of the bytes held by a `bytes::Buf`, verifying that those
    /// bytes are valid UTF-8.
    ///
    /// # Errors
    /// Returns a `Utf8Error` if the bytes read from `buf` are not valid UTF-8.
    ///
    /// # Examples
    /// ### Basic usage
    /// ```
    /// # use compact_str::CompactStr;
    /// # use std::collections::VecDeque;
    ///
    /// // `bytes::Buf` is implemented for `VecDeque<u8>`
    /// let mut sparkle_heart = VecDeque::from(vec![240, 159, 146, 150]);
    /// // We know these bytes are valid, so we can `.unwrap()` or `.expect(...)` here
    /// let compact_str = CompactStr::from_utf8_buf(&mut sparkle_heart).expect("valid utf-8");
    ///
    /// assert_eq!(compact_str, "💖");
    /// ```
    ///
    /// ### With invalid/non-UTF8 bytes
    /// ```
    /// # use compact_str::CompactStr;
    /// # use std::io;
    ///
    /// // `bytes::Buf` is implemented for `std::io::Cursor<&[u8]>`
    /// let mut invalid = io::Cursor::new(&[0, 159]);
    ///
    /// // The provided buffer is invalid, so trying to create a `CompactStr` will fail
    /// assert!(CompactStr::from_utf8_buf(&mut invalid).is_err());
    /// ```
    pub fn from_utf8_buf<B: Buf>(buf: &mut B) -> Result<Self, Utf8Error> {
        let repr = Repr::from_utf8_buf(buf)?;
        Ok(CompactStr { repr })
    }

    /// Builds a `CompactStr` out of the bytes held by a `bytes::Buf`, skipping the UTF-8
    /// validity check entirely.
    ///
    /// # Safety
    /// The caller must guarantee that the bytes in `buf` are valid UTF-8; this function does
    /// not verify it. Violating this constraint may cause memory unsafety issues with future
    /// uses of the `CompactStr`, as the rest of the library assumes that `CompactStr`s are
    /// valid UTF-8.
    ///
    /// # Examples
    /// ```
    /// # use compact_str::CompactStr;
    /// # use std::io;
    ///
    /// let word = "hello world";
    /// // `bytes::Buf` is implemented for `std::io::Cursor<&[u8]>`
    /// let mut buffer = io::Cursor::new(word.as_bytes());
    /// let compact_str = unsafe { CompactStr::from_utf8_buf_unchecked(&mut buffer) };
    ///
    /// assert_eq!(compact_str, word);
    /// ```
    pub unsafe fn from_utf8_buf_unchecked<B: Buf>(buf: &mut B) -> Self {
        CompactStr {
            repr: Repr::from_utf8_buf_unchecked(buf),
        }
    }
}

#[cfg(test)]
mod test {
    use std::io::Cursor;

    use proptest::prelude::*;
    use proptest::strategy::Strategy;

    use crate::CompactStr;

    const MAX_INLINED_SIZE: usize = core::mem::size_of::<String>();

    /// Strategy yielding arbitrary unicode strings built from fewer than 80 `char`s
    fn rand_unicode() -> impl Strategy<Value = String> {
        let chars = proptest::collection::vec(proptest::char::any(), 0..80);
        chars.prop_map(|picked| picked.into_iter().collect())
    }

    proptest! {
        // Reading a string's bytes back through a buffer must reproduce the string
        #[test]
        #[cfg_attr(miri, ignore)]
        fn test_buffers_roundtrip(word in rand_unicode()) {
            let mut cursor = Cursor::new(word.as_bytes());
            let compact = CompactStr::from_utf8_buf(&mut cursor).unwrap();

            prop_assert_eq!(&word, &compact);
        }

        // Strings shorter than the inline capacity, or exactly that long with an ASCII
        // first byte, are expected to stay off the heap; everything else is heap allocated
        #[test]
        #[cfg_attr(miri, ignore)]
        fn test_allocated_properly(word in rand_unicode()) {
            let mut cursor = Cursor::new(word.as_bytes());
            let compact = CompactStr::from_utf8_buf(&mut cursor).unwrap();

            let expect_inline = match word.len() {
                len if len < MAX_INLINED_SIZE => true,
                len if len == MAX_INLINED_SIZE => word.as_bytes()[0] <= 127,
                _ => false,
            };

            if expect_inline {
                prop_assert!(!compact.is_heap_allocated());
            } else {
                prop_assert!(compact.is_heap_allocated());
            }
        }

        // `CompactStr` must accept and reject exactly the same byte sequences as `core::str`
        #[test]
        #[cfg_attr(miri, ignore)]
        fn test_only_accept_valid_utf8(bytes in proptest::collection::vec(any::<u8>(), 0..80)) {
            let str_result = core::str::from_utf8(bytes.as_slice());

            let mut cursor = Cursor::new(bytes.as_slice());
            let compact_result = CompactStr::from_utf8_buf(&mut cursor);

            match (compact_result, str_result) {
                (Ok(compact), Ok(expected)) => prop_assert_eq!(compact, expected),
                (Err(compact_err), Err(expected_err)) => prop_assert_eq!(compact_err, expected_err),
                _ => panic!("CompactStr and core::str read UTF-8 differently?"),
            }
        }
    }
}
6 changes: 6 additions & 0 deletions compact_str/src/features/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
//! A module that contains the implementations for optional features, for example `bytes` and
//! `serde` support.

#[cfg(feature = "bytes")]
mod bytes;
#[cfg(feature = "serde")]
mod serde;
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use serde::de::{
Visitor,
};

use super::CompactStr;
use crate::CompactStr;

fn compact_str<'de: 'a, 'a, D: Deserializer<'de>>(deserializer: D) -> Result<CompactStr, D::Error> {
struct CompactStrVisitor;
Expand Down
10 changes: 7 additions & 3 deletions compact_str/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@ use core::iter::FromIterator;
use core::ops::Deref;
use core::str::FromStr;

mod features;

mod repr;
use repr::Repr;

#[cfg(feature = "serde")]
mod serde;

#[cfg(test)]
mod tests;

Expand Down Expand Up @@ -134,6 +133,11 @@ impl CompactStr {
self.repr.as_str()
}

#[inline]
pub fn as_slice(&self) -> &[u8] {
self.repr.as_slice()
}

// TODO: Implement a `try_as_mut_slice(...)` that will fail if it results in cloning?
//
/// Provides a mutable reference to the underlying buffer of bytes.
Expand Down
135 changes: 135 additions & 0 deletions compact_str/src/repr/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
use core::str::Utf8Error;

use bytes::Buf;

use super::{
Repr,
MAX_SIZE,
};

/// Placeholder text made up solely of ASCII `0` characters. Its length differs between 32-bit
/// and 64-bit targets.
// NOTE(review): the length presumably equals `MAX_SIZE` on each target — confirm.
#[cfg(target_pointer_width = "32")]
const DEFAULT_TEXT: &str = "000000000000";
#[cfg(target_pointer_width = "64")]
const DEFAULT_TEXT: &str = "000000000000000000000000";

/// A `Repr` built at compile time from `DEFAULT_TEXT`. `Repr::from_utf8_buf_unchecked` uses it
/// as the value to overwrite when the incoming buffer holds exactly `MAX_SIZE` bytes and starts
/// with a byte `<= 127`.
const DEFAULT_PACKED: Repr = Repr::new_const(DEFAULT_TEXT);

impl Repr {
    /// Copies the remaining bytes of `buf` into a new `Repr`, then verifies that the copied
    /// bytes are valid UTF-8.
    ///
    /// # Errors
    /// Returns the `Utf8Error` reported by `core::str::from_utf8` when the bytes are invalid.
    pub fn from_utf8_buf<B: Buf>(buf: &mut B) -> Result<Self, Utf8Error> {
        // SAFETY: The bytes are validated right below, and the `Repr` is only handed back to
        // the caller once that validation has succeeded
        let candidate = unsafe { Self::from_utf8_buf_unchecked(buf) };

        // Bail out with the error if the copied bytes are not valid UTF-8
        core::str::from_utf8(candidate.as_slice())?;
        Ok(candidate)
    }

    /// Copies the remaining bytes of `buf` into a new `Repr` without any UTF-8 validation.
    ///
    /// # Safety
    /// The provided buffer must be valid UTF-8
    pub unsafe fn from_utf8_buf_unchecked<B: Buf>(buf: &mut B) -> Self {
        let size = buf.remaining();

        // Peek at the first byte. An empty chunk means there is nothing to read, in which case
        // we hand back the empty Repr without touching the buffer
        let first_byte = match buf.chunk().first() {
            Some(&byte) => byte,
            None => {
                // If the chunk is empty, then we should have 0 remaining bytes
                debug_assert_eq!(size, 0);
                return super::EMPTY;
            }
        };

        // Pick a Repr to write into.
        //
        // HACK: There currently isn't a way to provide an "empty" Packed repr, so when the
        // text qualifies for one we start from a "default" Packed repr and overwrite it
        let fits_packed = size == MAX_SIZE && first_byte <= 127;
        let mut repr = if fits_packed {
            // Note: Nothing to reserve here, a Packed repr already has room for every
            // remaining byte of `buf`
            DEFAULT_PACKED
        } else {
            let mut growable = super::EMPTY;
            debug_assert_eq!(growable.len(), 0);

            // Make room for the text, which may allocate on the heap
            growable.reserve(size);

            growable
        };

        // SAFETY: The caller is responsible for making sure the provided buffer is UTF-8. This
        // invariant is documented in the public API
        let dest = repr.as_mut_slice();
        // Move the bytes out of the buffer and into the Repr
        buf.copy_to_slice(&mut dest[..size]);

        // SAFETY: Exactly `size` bytes were just written into the Repr
        repr.set_len(size);

        repr
    }
}

#[cfg(test)]
mod test {
    use std::io::Cursor;

    use super::Repr;

    /// Wraps `bytes` in a `Cursor` and runs it through `Repr::from_utf8_buf`
    fn repr_from(bytes: &[u8]) -> Result<Repr, core::str::Utf8Error> {
        let mut cursor = Cursor::new(bytes);
        Repr::from_utf8_buf(&mut cursor)
    }

    #[test]
    fn test_smoke() {
        let word = "hello world";

        let repr = repr_from(word.as_bytes()).unwrap();
        assert_eq!(repr.as_str(), word);
    }

    #[test]
    fn test_heap_allocated() {
        let word = "hello, this is a long string which should be heap allocated";

        let repr = repr_from(word.as_bytes()).unwrap();
        assert_eq!(repr.as_str(), word);
    }

    #[test]
    fn test_empty() {
        let repr = repr_from(&[]).unwrap();

        assert_eq!(repr.len(), 0);
        assert_eq!(repr.as_str(), "");
    }

    #[test]
    fn test_packed() {
        // Text whose length differs per pointer width
        #[cfg(target_pointer_width = "64")]
        let packed = "this string is 24 chars!";
        #[cfg(target_pointer_width = "32")]
        let packed = "i am 12 char";

        let repr = repr_from(packed.as_bytes()).unwrap();
        assert_eq!(repr.as_str(), packed);

        // This repr should __not__ be heap allocated
        assert!(!repr.is_heap_allocated());
    }

    #[test]
    #[should_panic(expected = "Utf8Error")]
    fn test_invalid_utf8() {
        // Unwrapping the error is what triggers the expected panic
        repr_from(&[0, 159]).unwrap();
    }
}
11 changes: 8 additions & 3 deletions compact_str/src/repr/heap/arc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ pub struct ArcString {
len: usize,
ptr: ptr::NonNull<ArcStringInner>,
}
// Manually mark `ArcString` as shareable and sendable across threads.
// NOTE(review): presumably sound because the `ArcStringInner` behind `ptr` is reference counted
// atomically and never mutated while shared — confirm against the `ArcStringInner` definition.
unsafe impl Sync for ArcString {}
unsafe impl Send for ArcString {}

impl ArcString {
#[inline]
Expand Down Expand Up @@ -54,11 +56,14 @@ impl ArcString {

#[inline]
pub fn as_str(&self) -> &str {
let buffer = self.inner().as_bytes();

// SAFETY: The only way you can construct an `ArcString` is via a `&str` so it must be valid
// UTF-8, or the caller has manually made those guarantees
unsafe { str::from_utf8_unchecked(&buffer[..self.len]) }
unsafe { str::from_utf8_unchecked(self.as_slice()) }
}

/// Returns the first `self.len` bytes of the inner buffer as a byte slice.
#[inline(always)]
pub fn as_slice(&self) -> &[u8] {
    let bytes = self.inner().as_bytes();
    &bytes[..self.len]
}

#[inline]
Expand Down
10 changes: 6 additions & 4 deletions compact_str/src/repr/inline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,13 @@ impl InlineString {

#[inline]
pub fn as_str(&self) -> &str {
let len = self.len();
let slice = &self.buffer[..len];

// SAFETY: You can only construct an InlineString via a &str
unsafe { ::std::str::from_utf8_unchecked(slice) }
unsafe { ::std::str::from_utf8_unchecked(self.as_slice()) }
}

/// Returns the first `self.len()` bytes of the inline buffer as a byte slice.
#[inline(always)]
pub fn as_slice(&self) -> &[u8] {
    let len = self.len();
    &self.buffer[..len]
}

/// Provides a mutable reference to the underlying buffer
Expand Down
Loading