From cead90b6c4bf704541de8796dc7c0fd178f8b631 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 17:41:03 +0200 Subject: [PATCH 1/7] lib: move code to encoding module In a next step, we can add more modules. --- src/{ => encoding}/macros.rs | 2 +- src/encoding/mod.rs | 223 +++++++++++++++++++++++++++++++++++ src/lib.rs | 222 +--------------------------------- 3 files changed, 225 insertions(+), 222 deletions(-) rename src/{ => encoding}/macros.rs (98%) create mode 100644 src/encoding/mod.rs diff --git a/src/macros.rs b/src/encoding/macros.rs similarity index 98% rename from src/macros.rs rename to src/encoding/macros.rs index b30f8bc..c87f75d 100644 --- a/src/macros.rs +++ b/src/encoding/macros.rs @@ -1,4 +1,4 @@ -use crate::{ucs2_from_utf8_at_offset, Error}; +use super::{ucs2_from_utf8_at_offset, Error}; /// Count the number of UCS-2 characters in a string. Return an error if /// the string cannot be encoded in UCS-2. diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs new file mode 100644 index 0000000..69b7022 --- /dev/null +++ b/src/encoding/mod.rs @@ -0,0 +1,223 @@ +//! Low-level encoding and decoding facilities for UCS-2 strings. + +mod macros; + +/// These need to be public for the `ucs2_cstr!` macro, but are not +/// intended to be called directly. +#[doc(hidden)] +pub use macros::{str_num_ucs2_chars, str_to_ucs2}; + +use bit_field::BitField; +use core::fmt::{self, Display, Formatter}; + +/// Possible errors when encoding UCS-2 strings. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub enum Error { + /// Not enough space left in the output buffer. + BufferOverflow, + /// Input contained a character which cannot be represented in UCS-2. 
+ MultiByte, +} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::BufferOverflow => f.write_str("output buffer is too small"), + Self::MultiByte => { + f.write_str("input contains a character which cannot be represented in UCS-2") + } + } + } +} + +type Result = core::result::Result; + +/// Value returned by `ucs2_from_utf8_at_offset`. +struct Ucs2CharFromUtf8 { + /// UCS-2 character. + val: u16, + /// Number of bytes needed to encode the character in UTF-8. + num_bytes: u8, +} + +/// Get a UCS-2 character from a UTF-8 byte slice at the given offset. +/// +/// # Safety +/// +/// The input `bytes` must be valid UTF-8. +const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result { + let len = bytes.len(); + let ch; + let ch_len; + + if bytes[offset] & 0b1000_0000 == 0b0000_0000 { + ch = bytes[offset] as u16; + ch_len = 1; + } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 { + // 2 byte codepoint + if offset + 1 >= len { + // safe: len is the length of bytes, + // and bytes is a direct view into the + // buffer of input, which in order to be a valid + // utf-8 string _must_ contain `i + 1`. + unsafe { core::hint::unreachable_unchecked() } + } + + let a = (bytes[offset] & 0b0001_1111) as u16; + let b = (bytes[offset + 1] & 0b0011_1111) as u16; + ch = a << 6 | b; + ch_len = 2; + } else if bytes[offset] & 0b1111_0000 == 0b1110_0000 { + // 3 byte codepoint + if offset + 2 >= len || offset + 1 >= len { + // safe: impossible utf-8 string. + unsafe { core::hint::unreachable_unchecked() } + } + + let a = (bytes[offset] & 0b0000_1111) as u16; + let b = (bytes[offset + 1] & 0b0011_1111) as u16; + let c = (bytes[offset + 2] & 0b0011_1111) as u16; + ch = a << 12 | b << 6 | c; + ch_len = 3; + } else if bytes[offset] & 0b1111_0000 == 0b1111_0000 { + return Err(Error::MultiByte); // UTF-16 + } else { + // safe: impossible utf-8 string. 
+ unsafe { core::hint::unreachable_unchecked() } + } + + Ok(Ucs2CharFromUtf8 { + val: ch, + num_bytes: ch_len, + }) +} + +/// Encodes an input UTF-8 string into a UCS-2 string. +/// +/// The returned `usize` represents the length of the returned buffer, +/// measured in 2-byte characters. +pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> { + let buffer_size = buffer.len(); + let mut i = 0; + + encode_with(input, |ch| { + if i >= buffer_size { + Err(Error::BufferOverflow) + } else { + buffer[i] = ch; + i += 1; + Ok(()) + } + })?; + + Ok(i) +} + +/// Encode UTF-8 string to UCS-2 with a custom callback function. +/// +/// `output` is a function which receives every encoded character. +pub fn encode_with<F>(input: &str, mut output: F) -> Result<()> +where + F: FnMut(u16) -> Result<()>, +{ + let bytes = input.as_bytes(); + let len = bytes.len(); + let mut i = 0; + + while i < len { + // SAFETY: `bytes` is valid UTF-8. + let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?; + i += usize::from(ch.num_bytes); + output(ch.val)?; + } + Ok(()) +} + +/// Decode UCS-2 string to UTF-8 with a custom callback function. +/// +/// `output` is a function which receives every decoded character. +/// Due to the nature of UCS-2, the function can receive a UTF-8 character +/// of up to three bytes, for every input character. +pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize> +where + F: FnMut(&[u8]) -> Result<()>, +{ + let mut written = 0; + + for ch in input.iter() { + /* + * We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode + * the Basic Multilingual Plane, a maximum of three bytes are needed. 
+ */ + if (0x000..0x0080).contains(ch) { + output(&[*ch as u8])?; + + written += 1; + } else if (0x0080..0x0800).contains(ch) { + let first = 0b1100_0000 + ch.get_bits(6..11) as u8; + let last = 0b1000_0000 + ch.get_bits(0..6) as u8; + + output(&[first, last])?; + + written += 2; + } else { + let first = 0b1110_0000 + ch.get_bits(12..16) as u8; + let mid = 0b1000_0000 + ch.get_bits(6..12) as u8; + let last = 0b1000_0000 + ch.get_bits(0..6) as u8; + + output(&[first, mid, last])?; + + written += 3; + } + } + + Ok(written) +} + +/// Decode an input UCS-2 string into a UTF-8 string. +/// +/// The returned `usize` represents the length of the returned buffer, +/// in bytes. Due to the nature of UCS-2, the output buffer could end up with +/// three bytes for every character in the input buffer. +pub fn decode(input: &[u16], output: &mut [u8]) -> Result { + let buffer_size = output.len(); + let mut i = 0; + + decode_with(input, |bytes| { + if bytes.len() == 1 { + // Can be encoded in a single byte + if i >= buffer_size { + return Err(Error::BufferOverflow); + } + + output[i] = bytes[0]; + + i += 1; + } else if bytes.len() == 2 { + // Can be encoded two bytes + if i + 1 >= buffer_size { + return Err(Error::BufferOverflow); + } + + output[i] = bytes[0]; + output[i + 1] = bytes[1]; + + i += 2; + } else if bytes.len() == 3 { + // Can be encoded three bytes + if i + 2 >= buffer_size { + return Err(Error::BufferOverflow); + } + + output[i] = bytes[0]; + output[i + 1] = bytes[1]; + output[i + 2] = bytes[2]; + + i += 3; + } else { + unreachable!("More than three bytes per UCS-2 character."); + } + + Ok(()) + }) +} diff --git a/src/lib.rs b/src/lib.rs index 6d3aef1..5dcb4d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,224 +4,4 @@ #![deny(missing_docs)] #![deny(clippy::all)] -mod macros; - -/// These need to be public for the `ucs2_cstr!` macro, but are not -/// intended to be called directly. 
-#[doc(hidden)] -pub use macros::{str_num_ucs2_chars, str_to_ucs2}; - -use bit_field::BitField; -use core::fmt::{self, Display, Formatter}; - -/// Possible errors returned by the API. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub enum Error { - /// Not enough space left in the output buffer. - BufferOverflow, - /// Input contained a character which cannot be represented in UCS-2. - MultiByte, -} - -impl Display for Error { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match self { - Self::BufferOverflow => f.write_str("output buffer is too small"), - Self::MultiByte => { - f.write_str("input contains a character which cannot be represented in UCS-2") - } - } - } -} - -type Result = core::result::Result; - -/// Value returned by `ucs2_from_utf8_at_offset`. -struct Ucs2CharFromUtf8 { - /// UCS-2 character. - val: u16, - /// Number of bytes needed to encode the character in UTF-8. - num_bytes: u8, -} - -/// Get a UCS-2 character from a UTF-8 byte slice at the given offset. -/// -/// # Safety -/// -/// The input `bytes` must be valid UTF-8. -const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result { - let len = bytes.len(); - let ch; - let ch_len; - - if bytes[offset] & 0b1000_0000 == 0b0000_0000 { - ch = bytes[offset] as u16; - ch_len = 1; - } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 { - // 2 byte codepoint - if offset + 1 >= len { - // safe: len is the length of bytes, - // and bytes is a direct view into the - // buffer of input, which in order to be a valid - // utf-8 string _must_ contain `i + 1`. - unsafe { core::hint::unreachable_unchecked() } - } - - let a = (bytes[offset] & 0b0001_1111) as u16; - let b = (bytes[offset + 1] & 0b0011_1111) as u16; - ch = a << 6 | b; - ch_len = 2; - } else if bytes[offset] & 0b1111_0000 == 0b1110_0000 { - // 3 byte codepoint - if offset + 2 >= len || offset + 1 >= len { - // safe: impossible utf-8 string. 
- unsafe { core::hint::unreachable_unchecked() } - } - - let a = (bytes[offset] & 0b0000_1111) as u16; - let b = (bytes[offset + 1] & 0b0011_1111) as u16; - let c = (bytes[offset + 2] & 0b0011_1111) as u16; - ch = a << 12 | b << 6 | c; - ch_len = 3; - } else if bytes[offset] & 0b1111_0000 == 0b1111_0000 { - return Err(Error::MultiByte); // UTF-16 - } else { - // safe: impossible utf-8 string. - unsafe { core::hint::unreachable_unchecked() } - } - - Ok(Ucs2CharFromUtf8 { - val: ch, - num_bytes: ch_len, - }) -} - -/// Encodes an input UTF-8 string into a UCS-2 string. -/// -/// The returned `usize` represents the length of the returned buffer, -/// measured in 2-byte characters. -pub fn encode(input: &str, buffer: &mut [u16]) -> Result { - let buffer_size = buffer.len(); - let mut i = 0; - - encode_with(input, |ch| { - if i >= buffer_size { - Err(Error::BufferOverflow) - } else { - buffer[i] = ch; - i += 1; - Ok(()) - } - })?; - - Ok(i) -} - -/// Encode UTF-8 string to UCS-2 with a custom callback function. -/// -/// `output` is a function which receives every encoded character. -pub fn encode_with(input: &str, mut output: F) -> Result<()> -where - F: FnMut(u16) -> Result<()>, -{ - let bytes = input.as_bytes(); - let len = bytes.len(); - let mut i = 0; - - while i < len { - // SAFETY: `bytes` is valid UTF-8. - let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?; - i += usize::from(ch.num_bytes); - output(ch.val)?; - } - Ok(()) -} - -/// Decode UCS-2 string to UTF-8 with a custom callback function. -/// -/// `output` is a function which receives every decoded character. -/// Due to the nature of UCS-2, the function can receive an UTF-8 character -/// of up to three bytes, for every input character. -pub fn decode_with(input: &[u16], mut output: F) -> Result -where - F: FnMut(&[u8]) -> Result<()>, -{ - let mut written = 0; - - for ch in input.iter() { - /* - * We need to find how many bytes of UTF-8 this UCS-2 code-point needs. 
Because UCS-2 can only encode - * the Basic Multilingual Plane, a maximum of three bytes are needed. - */ - if (0x000..0x0080).contains(ch) { - output(&[*ch as u8])?; - - written += 1; - } else if (0x0080..0x0800).contains(ch) { - let first = 0b1100_0000 + ch.get_bits(6..11) as u8; - let last = 0b1000_0000 + ch.get_bits(0..6) as u8; - - output(&[first, last])?; - - written += 2; - } else { - let first = 0b1110_0000 + ch.get_bits(12..16) as u8; - let mid = 0b1000_0000 + ch.get_bits(6..12) as u8; - let last = 0b1000_0000 + ch.get_bits(0..6) as u8; - - output(&[first, mid, last])?; - - written += 3; - } - } - - Ok(written) -} - -/// Decode an input UCS-2 string into a UTF-8 string. -/// -/// The returned `usize` represents the length of the returned buffer, -/// in bytes. Due to the nature of UCS-2, the output buffer could end up with -/// three bytes for every character in the input buffer. -pub fn decode(input: &[u16], output: &mut [u8]) -> Result { - let buffer_size = output.len(); - let mut i = 0; - - decode_with(input, |bytes| { - if bytes.len() == 1 { - // Can be encoded in a single byte - if i >= buffer_size { - return Err(Error::BufferOverflow); - } - - output[i] = bytes[0]; - - i += 1; - } else if bytes.len() == 2 { - // Can be encoded two bytes - if i + 1 >= buffer_size { - return Err(Error::BufferOverflow); - } - - output[i] = bytes[0]; - output[i + 1] = bytes[1]; - - i += 2; - } else if bytes.len() == 3 { - // Can be encoded three bytes - if i + 2 >= buffer_size { - return Err(Error::BufferOverflow); - } - - output[i] = bytes[0]; - output[i + 1] = bytes[1]; - output[i + 2] = bytes[2]; - - i += 3; - } else { - unreachable!("More than three bytes per UCS-2 character."); - } - - Ok(()) - }) -} +pub mod encoding; From 1d91bb93f785ac719ce5ad54e9fcbfed8ecfb30f Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 17:43:38 +0200 Subject: [PATCH 2/7] cargo: badges should no longer be part of Cargo.toml 
https://doc.rust-lang.org/cargo/reference/manifest.html#the-badges-section --- Cargo.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ac37922..3e29539 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,5 +13,3 @@ rust-version = "1.56" [dependencies] bit_field = "0.10" -[badges] -maintenance = { status = "passively-maintained" } From 95b574667c4a7eb3309bd62419bb8b1d4baebf81 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 17:43:53 +0200 Subject: [PATCH 3/7] style: init .editorconfig --- .editorconfig | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..07cf6a4 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +max_line_length = 80 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.{rs,py}] +indent_style = space +indent_size = 4 + +[*.{json,md,nix,toml,yml}] +indent_style = space +indent_size = 2 From 80cf03ef05f1c17cb2a9b76cb7685da1380c519c Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 17:47:50 +0200 Subject: [PATCH 4/7] types: init basic skeleton and copy chars.rs from uefi-crate --- Cargo.toml | 9 ++- src/lib.rs | 4 + src/types/chars.rs | 186 ++++++++++++++++++++++++++++++++++++++++++++ src/types/cstr16.rs | 0 src/types/cstr8.rs | 3 + src/types/mod.rs | 9 +++ 6 files changed, 208 insertions(+), 3 deletions(-) create mode 100644 src/types/chars.rs create mode 100644 src/types/cstr16.rs create mode 100644 src/types/cstr8.rs create mode 100644 src/types/mod.rs diff --git a/Cargo.toml b/Cargo.toml index 3e29539..7f7e650 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "ucs2" -version = "0.3.3" -authors = ["Gabriel Majeri ", "Fredrik Aleksander", "Isaac Woods"] -description = "UCS-2 decoding and encoding functions" +version = "0.4.0" +authors = ["The Rust OSDev team"] +description 
= "UCS-2 decoding and encoding functions as well as convenient types." repository = "https://github.com/rust-osdev/ucs2-rs" keywords = ["ucs2", "no-std", "encoding"] categories = ["encoding", "no-std"] @@ -13,3 +13,6 @@ rust-version = "1.56" [dependencies] bit_field = "0.10" +[features] +default = [] +alloc = [] diff --git a/src/lib.rs b/src/lib.rs index 5dcb4d0..7583b4e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,4 +4,8 @@ #![deny(missing_docs)] #![deny(clippy::all)] +#[cfg(feature = "alloc")] +extern crate alloc; + pub mod encoding; +pub mod types; diff --git a/src/types/chars.rs b/src/types/chars.rs new file mode 100644 index 0000000..c2f5bd6 --- /dev/null +++ b/src/types/chars.rs @@ -0,0 +1,186 @@ +//! UEFI character handling +//! +//! UEFI uses both Latin-1 and UCS-2 character encoding, this module implements +//! support for the associated character types. + +use core::fmt::{self, Display, Formatter}; + +/// Character conversion error +#[derive(Clone, Copy, Debug)] +pub struct CharConversionError; + +impl Display for CharConversionError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{self:?}") + } +} + +#[cfg(feature = "unstable")] +impl core::error::Error for CharConversionError {} + +/// A Latin-1 character +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct Char8(u8); + +impl TryFrom for Char8 { + type Error = CharConversionError; + + fn try_from(value: char) -> Result { + let code_point = u32::from(value); + u8::try_from(code_point) + .map(Char8) + .map_err(|_| CharConversionError) + } +} + +impl From for char { + fn from(char: Char8) -> Self { + Self::from(char.0) + } +} + +impl From for Char8 { + fn from(value: u8) -> Self { + Self(value) + } +} + +impl From for u8 { + fn from(char: Char8) -> Self { + char.0 + } +} + +impl fmt::Debug for Char8 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::fmt(&From::from(self.0), f) + } +} + +impl fmt::Display for Char8 { + 
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::fmt(&From::from(self.0), f) + } +} + +impl PartialEq for Char8 { + fn eq(&self, other: &char) -> bool { + u32::from(self.0) == u32::from(*other) + } +} + +/// Latin-1 version of the NUL character +pub const NUL_8: Char8 = Char8(0); + +/// An UCS-2 code point +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct Char16(u16); + +impl Char16 { + /// Creates a UCS-2 character from a Rust character without checks. + /// + /// # Safety + /// The caller must be sure that the character is valid. + #[must_use] + pub const unsafe fn from_u16_unchecked(val: u16) -> Self { + Self(val) + } + + /// Checks if the value is within the ASCII range. + #[must_use] + pub const fn is_ascii(&self) -> bool { + self.0 <= 127 + } +} + +impl TryFrom for Char16 { + type Error = CharConversionError; + + fn try_from(value: char) -> Result { + let code_point = u32::from(value); + u16::try_from(code_point) + .map(Char16) + .map_err(|_| CharConversionError) + } +} + +impl From for char { + fn from(char: Char16) -> Self { + u32::from(char.0).try_into().unwrap() + } +} + +impl TryFrom for Char16 { + type Error = CharConversionError; + + fn try_from(value: u16) -> Result { + // We leverage char's TryFrom impl for Unicode validity checking + let res: Result = u32::from(value).try_into(); + if let Ok(ch) = res { + ch.try_into() + } else { + Err(CharConversionError) + } + } +} + +impl From for u16 { + fn from(char: Char16) -> Self { + char.0 + } +} + +impl fmt::Debug for Char16 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Ok(c) = u32::from(self.0).try_into() { + ::fmt(&c, f) + } else { + write!(f, "Char16({:?})", self.0) + } + } +} + +impl fmt::Display for Char16 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if let Ok(c) = u32::from(self.0).try_into() { + ::fmt(&c, f) + } else { + write!(f, "{}", core::char::REPLACEMENT_CHARACTER) + } + } +} + +impl 
PartialEq for Char16 { + fn eq(&self, other: &char) -> bool { + u32::from(self.0) == u32::from(*other) + } +} + +/// UCS-2 version of the NUL character +pub const NUL_16: Char16 = unsafe { Char16::from_u16_unchecked(0) }; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_char8_from_char() { + assert_eq!(Char8::try_from('A').unwrap(), Char8(0x41)); + } + + #[test] + fn test_char16_from_char() { + assert_eq!(Char16::try_from('A').unwrap(), Char16(0x41)); + assert_eq!(Char16::try_from('ꋃ').unwrap(), Char16(0xa2c3)); + } + + /// Test that `Char8` and `Char16` can be directly compared with `char`. + #[test] + fn test_char_eq() { + let primitive_char: char = 'A'; + assert_eq!(Char8(0x41), primitive_char); + assert_eq!(Char16(0x41), primitive_char); + } +} diff --git a/src/types/cstr16.rs b/src/types/cstr16.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/types/cstr8.rs b/src/types/cstr8.rs new file mode 100644 index 0000000..5ccabb3 --- /dev/null +++ b/src/types/cstr8.rs @@ -0,0 +1,3 @@ +//! Rusty-types to work with UCS-2 strings and for convenient interoperability +//! with Rust string literals (`&str`) and Rust strings (`String`). 
+ diff --git a/src/types/mod.rs b/src/types/mod.rs new file mode 100644 index 0000000..26b7bc2 --- /dev/null +++ b/src/types/mod.rs @@ -0,0 +1,9 @@ + +pub(self) mod chars; +mod cstr8; +#[cfg(feature = "alloc")] +mod cstr16; + +pub use cstr8::*; +#[cfg(feature = "alloc")] +pub use cstr16::*; From 0018d87220e0e1c325c816d8de7239d5dd575817 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 18:21:01 +0200 Subject: [PATCH 5/7] init modules cstr8 and cstr16 (plus dependencies) --- Cargo.toml | 1 + src/encoding/macros.rs | 4 +- src/encoding/mod.rs | 1 + src/lib.rs | 1 + src/polyfill.rs | 29 ++ src/types/cstr16.rs | 406 ++++++++++++++++++++++++++ src/types/cstr8.rs | 297 +++++++++++++++++++ src/types/macros.rs | 82 ++++++ src/types/mod.rs | 95 +++++- src/types/unaligned_slice.rs | 254 ++++++++++++++++ tests/{tests.rs => encoding_tests.rs} | 2 +- 11 files changed, 1165 insertions(+), 7 deletions(-) create mode 100644 src/polyfill.rs create mode 100644 src/types/macros.rs create mode 100644 src/types/unaligned_slice.rs rename tests/{tests.rs => encoding_tests.rs} (96%) diff --git a/Cargo.toml b/Cargo.toml index 7f7e650..e572f63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,3 +16,4 @@ bit_field = "0.10" [features] default = [] alloc = [] +unstable = [] diff --git a/src/encoding/macros.rs b/src/encoding/macros.rs index c87f75d..77af4fa 100644 --- a/src/encoding/macros.rs +++ b/src/encoding/macros.rs @@ -82,13 +82,13 @@ macro_rules! ucs2_cstr { // Use `const` values here to force errors to happen at compile // time. - const NUM_CHARS: usize = match $crate::str_num_ucs2_chars($s) { + const NUM_CHARS: usize = match $crate::encoding::str_num_ucs2_chars($s) { // Add one for the null char. 
Ok(num) => num + 1, Err(_) => panic!("input contains a character which cannot be represented in UCS-2"), }; - const VAL: [u16; NUM_CHARS] = match $crate::str_to_ucs2($s) { + const VAL: [u16; NUM_CHARS] = match $crate::encoding::str_to_ucs2($s) { Ok(val) => val, // The string was already checked by `str_num_ucs2_chars`, // so this error is unreachable. diff --git a/src/encoding/mod.rs b/src/encoding/mod.rs index 69b7022..9c95cad 100644 --- a/src/encoding/mod.rs +++ b/src/encoding/mod.rs @@ -2,6 +2,7 @@ mod macros; +pub use crate::ucs2_cstr; /// These need to be public for the `ucs2_cstr!` macro, but are not /// intended to be called directly. #[doc(hidden)] diff --git a/src/lib.rs b/src/lib.rs index 7583b4e..78cf49c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,4 +8,5 @@ extern crate alloc; pub mod encoding; +pub mod polyfill; pub mod types; diff --git a/src/polyfill.rs b/src/polyfill.rs new file mode 100644 index 0000000..435481b --- /dev/null +++ b/src/polyfill.rs @@ -0,0 +1,29 @@ +//! Polyfills for functions in the standard library that are currently gated +//! behind unstable features. + +use core::mem::MaybeUninit; +#[cfg(feature = "alloc")] +use {alloc::vec::Vec, core::mem::ManuallyDrop}; + +/// Polyfill for the unstable `MaybeUninit::slice_assume_init_ref` function. +/// +/// See . +pub const unsafe fn maybe_uninit_slice_assume_init_ref(s: &[MaybeUninit]) -> &[T] { + unsafe { &*(s as *const [MaybeUninit] as *const [T]) } +} + +/// Polyfill for the unstable `MaybeUninit::slice_as_mut_ptr` function. +/// +/// See . +pub fn maybe_uninit_slice_as_mut_ptr(s: &mut [MaybeUninit]) -> *mut T { + s.as_mut_ptr().cast::() +} + +/// Polyfill for the unstable `Vec::into_raw_parts` function. +/// +/// See . 
+#[cfg(feature = "alloc")] +pub fn vec_into_raw_parts(v: Vec) -> (*mut T, usize, usize) { + let mut v = ManuallyDrop::new(v); + (v.as_mut_ptr(), v.len(), v.capacity()) +} diff --git a/src/types/cstr16.rs b/src/types/cstr16.rs index e69de29..2abe512 100644 --- a/src/types/cstr16.rs +++ b/src/types/cstr16.rs @@ -0,0 +1,406 @@ +use crate::polyfill::maybe_uninit_slice_assume_init_ref; +use crate::types::chars::{Char16, NUL_16}; +use crate::types::unaligned_slice::UnalignedSlice; +use crate::types::{EqStrUntilNul, FromSliceWithNulError, FromStrWithBufError}; +use core::borrow::Borrow; +use core::fmt::{Display, Formatter}; +use core::mem::MaybeUninit; +use core::{fmt, slice}; + +/// Error returned by [`CStr16::from_unaligned_slice`]. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum UnalignedCStr16Error { + /// An invalid character was encountered. + InvalidChar(usize), + + /// A null character was encountered before the end of the data. + InteriorNul(usize), + + /// The data was not null-terminated. + NotNulTerminated, + + /// The buffer is not big enough to hold the entire string and + /// trailing null character. + BufferTooSmall, +} + +impl Display for UnalignedCStr16Error { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidChar(usize) => write!(f, "invalid character at index {}", usize), + Self::InteriorNul(usize) => write!(f, "interior null character at index {}", usize), + Self::NotNulTerminated => write!(f, "not null-terminated"), + Self::BufferTooSmall => write!(f, "buffer too small"), + } + } +} + +#[cfg(feature = "unstable")] +impl core::error::Error for UnalignedCStr16Error {} + +/// An UCS-2 null-terminated string slice. +/// +/// This type is largely inspired by [`core::ffi::CStr`] with the exception that all characters are +/// guaranteed to be 16 bit long. 
+/// +/// For convenience, a [`CStr16`] is comparable with [`core::str`] and +/// `alloc::string::String` from the standard library through the trait [`EqStrUntilNul`]. +#[derive(Eq, PartialEq, Ord, PartialOrd, Hash)] +#[repr(transparent)] +pub struct CStr16([Char16]); + +impl CStr16 { + /// Wraps a raw UEFI string with a safe C string wrapper + /// + /// # Safety + /// + /// The function will start accessing memory from `ptr` until the first + /// null character. It's the callers responsibility to ensure `ptr` points to + /// a valid string, in accessible memory. + #[must_use] + pub unsafe fn from_ptr<'ptr>(ptr: *const Char16) -> &'ptr Self { + let mut len = 0; + while *ptr.add(len) != NUL_16 { + len += 1 + } + let ptr = ptr.cast::(); + Self::from_u16_with_nul_unchecked(slice::from_raw_parts(ptr, len + 1)) + } + + /// Creates a `&CStr16` from a u16 slice, if the slice contains exactly + /// one terminating null-byte and all chars are valid UCS-2 chars. + pub fn from_u16_with_nul(codes: &[u16]) -> Result<&Self, FromSliceWithNulError> { + for (pos, &code) in codes.iter().enumerate() { + match code.try_into() { + Ok(NUL_16) => { + if pos != codes.len() - 1 { + return Err(FromSliceWithNulError::InteriorNul(pos)); + } else { + return Ok(unsafe { Self::from_u16_with_nul_unchecked(codes) }); + } + } + Err(_) => { + return Err(FromSliceWithNulError::InvalidChar(pos)); + } + _ => {} + } + } + Err(FromSliceWithNulError::NotNulTerminated) + } + + /// Unsafely creates a `&CStr16` from a u16 slice. + /// + /// # Safety + /// + /// It's the callers responsibility to ensure chars is a valid UCS-2 + /// null-terminated string, with no interior null characters. + #[must_use] + pub const unsafe fn from_u16_with_nul_unchecked(codes: &[u16]) -> &Self { + &*(codes as *const [u16] as *const Self) + } + + /// Creates a `&CStr16` from a [`Char16`] slice, if the slice is + /// null-terminated and has no interior null characters. 
+ pub fn from_char16_with_nul(chars: &[Char16]) -> Result<&Self, FromSliceWithNulError> { + // Fail early if the input is empty. + if chars.is_empty() { + return Err(FromSliceWithNulError::NotNulTerminated); + } + + // Find the index of the first null char. + if let Some(null_index) = chars.iter().position(|c| *c == NUL_16) { + // Verify the null character is at the end. + if null_index == chars.len() - 1 { + // Safety: the input is null-terminated and has no interior nulls. + Ok(unsafe { Self::from_char16_with_nul_unchecked(chars) }) + } else { + Err(FromSliceWithNulError::InteriorNul(null_index)) + } + } else { + Err(FromSliceWithNulError::NotNulTerminated) + } + } + + /// Unsafely creates a `&CStr16` from a `Char16` slice. + /// + /// # Safety + /// + /// It's the callers responsibility to ensure chars is null-terminated and + /// has no interior null characters. + #[must_use] + pub const unsafe fn from_char16_with_nul_unchecked(chars: &[Char16]) -> &Self { + let ptr: *const [Char16] = chars; + &*(ptr as *const Self) + } + + /// Convert a [`&str`] to a `&CStr16`, backed by a buffer. + /// + /// The input string must contain only characters representable with + /// UCS-2, and must not contain any null characters (even at the end of + /// the input). + /// + /// The backing buffer must be big enough to hold the converted string as + /// well as a trailing null character. + /// + /// # Examples + /// + /// Convert the UTF-8 string "ABC" to a `&CStr16`: + /// + /// ``` + /// use ucs2::types::CStr16; + /// + /// let mut buf = [0; 4]; + /// CStr16::from_str_with_buf("ABC", &mut buf).unwrap(); + /// ``` + pub fn from_str_with_buf<'a>( + input: &str, + buf: &'a mut [u16], + ) -> Result<&'a Self, FromStrWithBufError> { + let mut index = 0; + + // Convert to UTF-16. + for c in input.encode_utf16() { + *buf.get_mut(index) + .ok_or(FromStrWithBufError::BufferTooSmall)? = c; + index += 1; + } + + // Add trailing null character. 
+ *buf.get_mut(index) + .ok_or(FromStrWithBufError::BufferTooSmall)? = 0; + + // Convert from u16 to Char16. This checks for invalid UCS-2 chars and + // interior nulls. The NotNulTerminated case is unreachable because we + // just added a trailing null character. + Self::from_u16_with_nul(&buf[..index + 1]).map_err(|err| match err { + FromSliceWithNulError::InvalidChar(p) => FromStrWithBufError::InvalidChar(p), + FromSliceWithNulError::InteriorNul(p) => FromStrWithBufError::InteriorNul(p), + FromSliceWithNulError::NotNulTerminated => { + unreachable!() + } + }) + } + + /// Create a `&CStr16` from an [`UnalignedSlice`] using an aligned + /// buffer for storage. The lifetime of the output is tied to `buf`, + /// not `src`. + pub fn from_unaligned_slice<'buf>( + src: &UnalignedSlice<'_, u16>, + buf: &'buf mut [MaybeUninit], + ) -> Result<&'buf Self, UnalignedCStr16Error> { + // The input `buf` might be longer than needed, so get a + // subslice of the required length. + let buf = buf + .get_mut(..src.len()) + .ok_or(UnalignedCStr16Error::BufferTooSmall)?; + + src.copy_to_maybe_uninit(buf); + let buf = unsafe { + // Safety: `copy_buf` fully initializes the slice. + maybe_uninit_slice_assume_init_ref(buf) + }; + Self::from_u16_with_nul(buf).map_err(|e| match e { + FromSliceWithNulError::InvalidChar(v) => UnalignedCStr16Error::InvalidChar(v), + FromSliceWithNulError::InteriorNul(v) => UnalignedCStr16Error::InteriorNul(v), + FromSliceWithNulError::NotNulTerminated => UnalignedCStr16Error::NotNulTerminated, + }) + } + + /// Returns the inner pointer to this C16 string. + #[must_use] + pub const fn as_ptr(&self) -> *const Char16 { + self.0.as_ptr() + } + + /// Get the underlying [`Char16`]s as slice without the trailing null. + #[must_use] + pub fn as_slice(&self) -> &[Char16] { + &self.0[..self.num_chars()] + } + + /// Get the underlying [`Char16`]s as slice including the trailing null. 
+ #[must_use] + pub const fn as_slice_with_nul(&self) -> &[Char16] { + &self.0 + } + + /// Converts this C string to a u16 slice without the trailing null. + #[must_use] + pub fn to_u16_slice(&self) -> &[u16] { + let chars = self.to_u16_slice_with_nul(); + &chars[..chars.len() - 1] + } + + /// Converts this C string to a u16 slice containing the trailing null. + #[must_use] + pub const fn to_u16_slice_with_nul(&self) -> &[u16] { + unsafe { &*(&self.0 as *const [Char16] as *const [u16]) } + } + + /// Returns an iterator over this C string + #[must_use] + pub const fn iter(&self) -> CStr16Iter { + CStr16Iter { + inner: self, + pos: 0, + } + } + + /// Returns the number of characters without the trailing null character. + #[must_use] + pub const fn num_chars(&self) -> usize { + self.0.len() - 1 + } + + /// Returns if the string is empty. This ignores the null character. + #[must_use] + pub const fn is_empty(&self) -> bool { + self.num_chars() == 0 + } + + /// Get the number of bytes in the string (including the trailing null). + #[must_use] + pub const fn num_bytes(&self) -> usize { + self.0.len() * 2 + } + + /// Checks if all characters in this string are within the ASCII range. + #[must_use] + pub fn is_ascii(&self) -> bool { + self.0.iter().all(|c| c.is_ascii()) + } + + /// Writes each [`Char16`] as a [`char`] (4 bytes long in Rust language) into the buffer. + /// It is up to the implementer of [`core::fmt::Write`] to convert the char to a string + /// with proper encoding/charset. For example, in the case of [`alloc::string::String`] + /// all Rust chars (UTF-32) get converted to UTF-8. 
+ /// + /// ## Example + /// + /// ```ignore + /// let firmware_vendor_c16_str: CStr16 = ...; + /// // crate "arrayvec" uses stack-allocated arrays for Strings => no heap allocations + /// let mut buf = arrayvec::ArrayString::<128>::new(); + /// firmware_vendor_c16_str.as_str_in_buf(&mut buf); + /// log::info!("as rust str: {}", buf.as_str()); + /// ``` + /// + /// [`alloc::string::String`]: https://doc.rust-lang.org/nightly/alloc/string/struct.String.html + pub fn as_str_in_buf(&self, buf: &mut dyn core::fmt::Write) -> core::fmt::Result { + for c16 in self.iter() { + buf.write_char(char::from(*c16))?; + } + Ok(()) + } + + /// Returns the underlying bytes as slice including the terminating null + /// character. + #[must_use] + pub const fn as_bytes(&self) -> &[u8] { + unsafe { slice::from_raw_parts(self.0.as_ptr().cast(), self.num_bytes()) } + } +} + +impl AsRef<[u8]> for CStr16 { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl Borrow<[u8]> for CStr16 { + fn borrow(&self) -> &[u8] { + self.as_bytes() + } +} + +#[cfg(feature = "alloc")] +impl From<&CStr16> for alloc::string::String { + fn from(value: &CStr16) -> Self { + value + .as_slice() + .iter() + .copied() + .map(u16::from) + .map(u32::from) + .map(|int| char::from_u32(int).expect("Should be encodable as UTF-8")) + .collect::() + } +} + +impl + ?Sized> EqStrUntilNul for CStr16 { + fn eq_str_until_nul(&self, other: &StrType) -> bool { + let other = other.as_ref(); + + let any_not_equal = self + .iter() + .copied() + .map(char::from) + .zip(other.chars()) + // This only works as CStr16 is guaranteed to have a fixed character length + // (unlike UTF-8 or UTF-16). + .take_while(|(l, r)| *l != '\0' && *r != '\0') + .any(|(l, r)| l != r); + + !any_not_equal + } +} + +impl AsRef for CStr16 { + fn as_ref(&self) -> &Self { + self + } +} + +/// An iterator over the [`Char16`]s in a [`CStr16`]. 
+#[derive(Debug)] +pub struct CStr16Iter<'a> { + inner: &'a CStr16, + pos: usize, +} + +impl<'a> Iterator for CStr16Iter<'a> { + type Item = &'a Char16; + + fn next(&mut self) -> Option { + if self.pos >= self.inner.0.len() - 1 { + None + } else { + self.pos += 1; + self.inner.0.get(self.pos - 1) + } + } +} + +impl fmt::Debug for CStr16 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CStr16({:?})", &self.0) + } +} + +impl fmt::Display for CStr16 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for c in self.iter() { + ::fmt(c, f)?; + } + Ok(()) + } +} + +#[cfg(feature = "alloc")] +impl PartialEq for &CStr16 { + fn eq(&self, other: &CString16) -> bool { + PartialEq::eq(*self, other.as_ref()) + } +} + +impl<'a> UnalignedSlice<'a, u16> { + /// Create a [`CStr16`] from an [`UnalignedSlice`] using an aligned + /// buffer for storage. The lifetime of the output is tied to `buf`, + /// not `self`. + pub fn to_cstr16<'buf>( + &self, + buf: &'buf mut [MaybeUninit], + ) -> Result<&'buf CStr16, UnalignedCStr16Error> { + CStr16::from_unaligned_slice(self, buf) + } +} diff --git a/src/types/cstr8.rs b/src/types/cstr8.rs index 5ccabb3..d46517b 100644 --- a/src/types/cstr8.rs +++ b/src/types/cstr8.rs @@ -1,3 +1,300 @@ //! Rusty-types to work with UCS-2 strings and for convenient interoperability //! with Rust string literals (`&str`) and Rust strings (`String`). +use crate::types::chars::{Char8, NUL_8}; +use crate::types::{EqStrUntilNul, FromSliceWithNulError}; +use core::borrow::Borrow; +use core::ffi::CStr; +use core::{fmt, slice}; + +/// A null-terminated Latin-1 string. +/// +/// This type is largely inspired by [`core::ffi::CStr`] with the exception that all characters are +/// guaranteed to be 8 bit long. 
+/// +/// A [`CStr8`] can be constructed from a [`core::ffi::CStr`] via a `try_from` call: +/// ```ignore +/// let cstr8: &CStr8 = TryFrom::try_from(cstr).unwrap(); +/// ``` +/// +/// For convenience, a [`CStr8`] is comparable with [`core::str`] and +/// `alloc::string::String` from the standard library through the trait [`EqStrUntilNul`]. +#[repr(transparent)] +#[derive(Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct CStr8([Char8]); + +impl CStr8 { + /// Takes a raw pointer to a null-terminated Latin-1 string and wraps it in a CStr8 reference. + /// + /// # Safety + /// + /// The function will start accessing memory from `ptr` until the first + /// null byte. It's the caller's responsibility to ensure `ptr` points to + /// a valid null-terminated string in accessible memory. + #[must_use] + pub unsafe fn from_ptr<'ptr>(ptr: *const Char8) -> &'ptr Self { + let mut len = 0; + while *ptr.add(len) != NUL_8 { + len += 1 + } + let ptr = ptr.cast::(); + Self::from_bytes_with_nul_unchecked(slice::from_raw_parts(ptr, len + 1)) + } + + /// Creates a CStr8 reference from bytes. + pub fn from_bytes_with_nul(chars: &[u8]) -> Result<&Self, FromSliceWithNulError> { + let nul_pos = chars.iter().position(|&c| c == 0); + if let Some(nul_pos) = nul_pos { + if nul_pos + 1 != chars.len() { + return Err(FromSliceWithNulError::InteriorNul(nul_pos)); + } + Ok(unsafe { Self::from_bytes_with_nul_unchecked(chars) }) + } else { + Err(FromSliceWithNulError::NotNulTerminated) + } + } + + /// Unsafely creates a CStr8 reference from bytes. + /// + /// # Safety + /// + /// It's the caller's responsibility to ensure `chars` is a valid Latin-1 + /// null-terminated string, with no interior null bytes. + #[must_use] + pub const unsafe fn from_bytes_with_nul_unchecked(chars: &[u8]) -> &Self { + &*(chars as *const [u8] as *const Self) + } + + /// Returns the inner pointer to this CStr8. 
+ #[must_use] + pub const fn as_ptr(&self) -> *const Char8 { + self.0.as_ptr() + } + + /// Returns the underlying bytes as slice including the terminating null + /// character. + #[must_use] + pub const fn as_bytes(&self) -> &[u8] { + unsafe { &*(&self.0 as *const [Char8] as *const [u8]) } + } +} + +impl fmt::Debug for CStr8 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "CStr8({:?})", &self.0) + } +} + +impl fmt::Display for CStr8 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for c in self.0.iter() { + ::fmt(c, f)?; + } + Ok(()) + } +} + +impl AsRef<[u8]> for CStr8 { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl Borrow<[u8]> for CStr8 { + fn borrow(&self) -> &[u8] { + self.as_bytes() + } +} + +impl + ?Sized> EqStrUntilNul for CStr8 { + fn eq_str_until_nul(&self, other: &StrType) -> bool { + let other = other.as_ref(); + + // TODO: CStr16 has .iter() implemented, CStr8 not yet + let any_not_equal = self + .0 + .iter() + .copied() + .map(char::from) + .zip(other.chars()) + // This only works as CStr8 is guaranteed to have a fixed character length + // (unlike UTF-8). + .take_while(|(l, r)| *l != '\0' && *r != '\0') + .any(|(l, r)| l != r); + + !any_not_equal + } +} + +impl<'a> TryFrom<&'a CStr> for &'a CStr8 { + type Error = FromSliceWithNulError; + + fn try_from(cstr: &'a CStr) -> Result { + CStr8::from_bytes_with_nul(cstr.to_bytes_with_nul()) + } +} + +/// Get a Latin-1 character from a UTF-8 byte slice at the given offset. +/// +/// Returns a pair containing the Latin-1 character and the number of bytes in +/// the UTF-8 encoding of that character. +/// +/// Panics if the string cannot be encoded in Latin-1. +/// +/// # Safety +/// +/// The input `bytes` must be valid UTF-8. 
+const unsafe fn latin1_from_utf8_at_offset(bytes: &[u8], offset: usize) -> (u8, usize) { + if bytes[offset] & 0b1000_0000 == 0b0000_0000 { + (bytes[offset], 1) + } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 { + let a = (bytes[offset] & 0b0001_1111) as u16; + let b = (bytes[offset + 1] & 0b0011_1111) as u16; + let ch = a << 6 | b; + if ch > 0xff { + panic!("input string cannot be encoded as Latin-1"); + } + (ch as u8, 2) + } else { + // Latin-1 code points only go up to 0xff, so if the input contains any + // UTF-8 characters larger than two bytes it cannot be converted to + // Latin-1. + panic!("input string cannot be encoded as Latin-1"); + } +} + +/// Count the number of Latin-1 characters in a string. +/// +/// Panics if the string cannot be encoded in Latin-1. +/// +/// This is public but hidden; it is used in the `cstr8` macro. +#[must_use] +pub const fn str_num_latin1_chars(s: &str) -> usize { + let bytes = s.as_bytes(); + let len = bytes.len(); + + let mut offset = 0; + let mut num_latin1_chars = 0; + + while offset < len { + // SAFETY: `bytes` is valid UTF-8. + let (_, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset(bytes, offset) }; + offset += num_utf8_bytes; + num_latin1_chars += 1; + } + + num_latin1_chars +} + +/// Convert a `str` into a null-terminated Latin-1 character array. +/// +/// Panics if the string cannot be encoded in Latin-1. +/// +/// This is public but hidden; it is used in the `cstr8` macro. +#[must_use] +pub const fn str_to_latin1(s: &str) -> [u8; N] { + let bytes = s.as_bytes(); + let len = bytes.len(); + + let mut output = [0; N]; + + let mut output_offset = 0; + let mut input_offset = 0; + while input_offset < len { + // SAFETY: `bytes` is valid UTF-8. 
+ let (ch, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset(bytes, input_offset) }; + if ch == 0 { + panic!("interior null character"); + } else { + output[output_offset] = ch; + output_offset += 1; + input_offset += num_utf8_bytes; + } + } + + // The output array must be one bigger than the converted string, + // to leave room for the trailing null character. + if output_offset + 1 != N { + panic!("incorrect array length"); + } + + output +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cstr8; + + // Tests if our CStr8 type can be constructed from a valid core::ffi::CStr + #[test] + fn test_cstr8_from_cstr() { + let msg = "hello world\0"; + let cstr = unsafe { CStr::from_ptr(msg.as_ptr().cast()) }; + let cstr8: &CStr8 = TryFrom::try_from(cstr).unwrap(); + assert!(cstr8.eq_str_until_nul(msg)); + assert!(msg.eq_str_until_nul(cstr8)); + } + + #[test] + fn test_cstr8_as_bytes() { + let string: &CStr8 = cstr8!("a"); + assert_eq!(string.as_bytes(), &[b'a', 0]); + assert_eq!(>::as_ref(string), &[b'a', 0]); + assert_eq!(>::borrow(string), &[b'a', 0]); + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests_with_alloc { + use super::*; + use crate::cstr8; + use alloc::string::String; + + // Code generation helper for the compare tests of our CStrX types against "str" and "String" + // from the standard library. + #[allow(non_snake_case)] + macro_rules! test_compare_cstrX { + ($input:ident) => { + assert!($input.eq_str_until_nul(&"test")); + assert!($input.eq_str_until_nul(&String::from("test"))); + + // now other direction + assert!(String::from("test").eq_str_until_nul($input)); + assert!("test".eq_str_until_nul($input)); + + // some more tests + // this is fine: compare until the first null + assert!($input.eq_str_until_nul(&"te\0st")); + // this is fine + assert!($input.eq_str_until_nul(&"test\0")); + assert!(!$input.eq_str_until_nul(&"hello")); + }; + } + + /// Tests the trait implementation of trait [`EqStrUntilNul`] for [`CStr8`]. 
+ /// + /// This tests that `String` and `str` from the standard library can be + /// checked for equality against a [`CStr8`]. It checks both directions, + /// i.e., the equality is symmetric. + #[test] + fn test_cstr8_eq_std_str() { + let input: &CStr8 = cstr8!("test"); + + // test various comparisons with different order (left, right) + assert!(input.eq_str_until_nul("test")); // requires ?Sized constraint + assert!(input.eq_str_until_nul(&"test")); + assert!(input.eq_str_until_nul(&String::from("test"))); + + // now other direction + assert!(String::from("test").eq_str_until_nul(input)); + assert!("test".eq_str_until_nul(input)); + } + + #[test] + fn test_compare_cstr8() { + // test various comparisons with different order (left, right) + let input: &CStr8 = cstr8!("test"); + test_compare_cstrX!(input); + } +} diff --git a/src/types/macros.rs b/src/types/macros.rs new file mode 100644 index 0000000..c07e083 --- /dev/null +++ b/src/types/macros.rs @@ -0,0 +1,82 @@ +//! todo. + +/// Encode a string literal as a [`&CStr8`]. +/// +/// The encoding is done at compile time, so the result can be used in a +/// `const` item. +/// +/// An empty string containing just a null character can be created with either +/// `cstr8!()` or `cstr8!("")`. +/// +/// # Example +/// +/// ``` +/// use ucs2::types::{CStr8, cstr8}; +/// +/// const S: &CStr8 = cstr8!("abÿ"); +/// assert_eq!(S.as_bytes(), [97, 98, 255, 0]); +/// +/// const EMPTY: &CStr8 = cstr8!(); +/// assert_eq!(EMPTY.as_bytes(), [0]); +/// assert_eq!(cstr8!(""), EMPTY); +/// ``` +/// +/// [`&CStr8`]: crate::CStr8 +#[macro_export] +macro_rules! cstr8 { + () => {{ + const S: &[u8] = &[0]; + // SAFETY: `S` is a trivially correct Latin-1 C string. + unsafe { $crate::types::CStr8::from_bytes_with_nul_unchecked(S) } + }}; + ($s:literal) => {{ + // Use `const` values here to force errors to happen at compile + // time. + + // Add one for the null char. 
+ const NUM_CHARS: usize = $crate::types::str_num_latin1_chars($s) + 1; + + const VAL: [u8; NUM_CHARS] = $crate::types::str_to_latin1($s); + + // SAFETY: the `str_to_latin1` function always produces a valid Latin-1 + // string with a trailing null character. + unsafe { $crate::types::CStr8::from_bytes_with_nul_unchecked(&VAL) } + }}; +} + +/// Encode a string literal as a [`&CStr16`]. +/// +/// The encoding is done at compile time, so the result can be used in a +/// `const` item. +/// +/// An empty string containing just a null character can be created with either +/// `cstr16!()` or `cstr16!("")`. +/// +/// # Example +/// +/// ``` +/// use ucs2::types::{CStr16, cstr16}; +/// +/// const S: &CStr16 = cstr16!("abc"); +/// assert_eq!(S.to_u16_slice_with_nul(), [97, 98, 99, 0]); +/// +/// const EMPTY: &CStr16 = cstr16!(); +/// assert_eq!(EMPTY.to_u16_slice_with_nul(), [0]); +/// assert_eq!(cstr16!(""), EMPTY); +/// ``` +/// +/// [`&CStr16`]: crate::CStr16 +#[macro_export] +macro_rules! cstr16 { + () => {{ + const S: &[u16] = &[0]; + // SAFETY: `S` is a trivially correct UCS-2 C string. + unsafe { $crate::types::CStr16::from_u16_with_nul_unchecked(S) } + }}; + ($s:literal) => {{ + const S: &[u16] = &$crate::ucs2_cstr!($s); + // SAFETY: the ucs2_cstr macro always produces a valid UCS-2 string with + // a trailing null character. + unsafe { $crate::types::CStr16::from_u16_with_nul_unchecked(S) } + }}; +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 26b7bc2..7623c1d 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,9 +1,96 @@ +//! TODO. 
pub(self) mod chars; -mod cstr8; -#[cfg(feature = "alloc")] mod cstr16; +mod cstr8; +mod macros; +mod unaligned_slice; -pub use cstr8::*; -#[cfg(feature = "alloc")] +pub use crate::cstr16; +pub use crate::cstr8; +use core::fmt; +use core::fmt::{Display, Formatter}; pub use cstr16::*; +pub use cstr8::*; + +/// Errors which can occur during checked `[uN]` -> `CStrN` conversions +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum FromSliceWithNulError { + /// An invalid character was encountered before the end of the slice + InvalidChar(usize), + + /// A null character was encountered before the end of the slice + InteriorNul(usize), + + /// The slice was not null-terminated + NotNulTerminated, +} + +impl Display for FromSliceWithNulError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidChar(usize) => write!(f, "invalid character at index {}", usize), + Self::InteriorNul(usize) => write!(f, "interior null character at index {}", usize), + Self::NotNulTerminated => write!(f, "not null-terminated"), + } + } +} + +#[cfg(feature = "unstable")] +impl core::error::Error for FromSliceWithNulError {} + +/// Error returned by [`CStr16::from_str_with_buf`]. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum FromStrWithBufError { + /// An invalid character was encountered before the end of the string + InvalidChar(usize), + + /// A null character was encountered in the string + InteriorNul(usize), + + /// The buffer is not big enough to hold the entire string and + /// trailing null character + BufferTooSmall, +} + +impl Display for FromStrWithBufError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidChar(usize) => write!(f, "invalid character at index {}", usize), + Self::InteriorNul(usize) => write!(f, "interior null character at index {}", usize), + Self::BufferTooSmall => write!(f, "buffer too small"), + } + } +} + +#[cfg(feature = "unstable")] +impl core::error::Error for FromStrWithBufError {} + +/// The EqStrUntilNul trait helps to compare Rust strings against UEFI string types (UCS-2 strings). +/// The given generic implementation of this trait means that we only have to +/// implement one direction (`left.eq_str_until_nul(&right)`) for each UEFI string type and we +/// get the other direction (`right.eq_str_until_nul(&left)`) for free. Hence, the relation is +/// symmetric. +pub trait EqStrUntilNul { + /// Checks if the provided Rust string `StrType` is equal to [Self] until the first null character + /// is found. An exception is the terminating null character of [Self] which is ignored. + /// + /// As soon as the first null character in either `&self` or `other` is found, this method returns. + /// Note that Rust strings are allowed to contain null bytes that do not terminate the string. + /// Although this is rather unusual, you can compare `"foo\0bar"` with an instance of [Self]. + /// In that case, only `"foo"` is compared against [Self] (if [Self] is long enough). 
+ fn eq_str_until_nul(&self, other: &StrType) -> bool; +} + +// magic implementation which transforms an existing `left.eq_str_until_nul(&right)` implementation +// into an additional working `right.eq_str_until_nul(&left)` implementation. +impl EqStrUntilNul for StrType +where + StrType: AsRef, + C16StrType: EqStrUntilNul + ?Sized, +{ + fn eq_str_until_nul(&self, other: &C16StrType) -> bool { + // reuse the existing implementation + other.eq_str_until_nul(self) + } +} diff --git a/src/types/unaligned_slice.rs b/src/types/unaligned_slice.rs new file mode 100644 index 0000000..f79d932 --- /dev/null +++ b/src/types/unaligned_slice.rs @@ -0,0 +1,254 @@ +use core::fmt::{self, Debug, Formatter}; +use core::marker::PhantomData; +use core::mem::MaybeUninit; + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +/// Slice backed by a potentially-unaligned pointer. +/// +/// This wrapper can be used to safely expose slices that are inside a +/// [`repr(packed)`] struct. The element type must be [`Copy`]. +/// +/// [`repr(packed)`]: https://doc.rust-lang.org/nomicon/other-reprs.html#reprpacked +#[derive(Clone)] +pub struct UnalignedSlice<'a, T: Copy> { + data: *const T, + len: usize, + _phantom_lifetime: PhantomData<&'a T>, +} + +impl<'a, T: Copy> UnalignedSlice<'a, T> { + /// Create an `UnalignedSlice` from a raw pointer. The pointer must + /// not be dangling but can be unaligned. The `len` parameter is the + /// number of elements in the slice (not the number of bytes). + /// + /// # Safety + /// + /// The `data` pointer must point to a packed array of at least + /// `len` elements of type `T`. The pointer must remain valid for as + /// long as the `'a` lifetime. + pub const unsafe fn new(data: *const T, len: usize) -> Self { + Self { + data, + len, + _phantom_lifetime: PhantomData, + } + } + + /// Returns true if the slice has a length of 0. 
+ #[must_use] + pub const fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Get the underlying pointer, which may be unaligned. + #[must_use] + pub const fn as_ptr(&self) -> *const T { + self.data + } + + /// Returns the number of elements in the slice. + #[must_use] + pub const fn len(&self) -> usize { + self.len + } + + /// Returns the element at `index`, or `None` if the `index` is out + /// of bounds. + #[must_use] + pub fn get(&self, index: usize) -> Option { + if index < self.len { + Some(unsafe { self.data.add(index).read_unaligned() }) + } else { + None + } + } + + /// Returns an iterator over the slice. + /// + /// The iterator yields all items from start to end. + #[must_use] + pub const fn iter(&'a self) -> UnalignedSliceIter<'a, T> { + UnalignedSliceIter { + slice: self, + index: 0, + } + } + + /// Copy the data to an aligned buffer. + /// + /// The length of `dest` must be the same as `self`. + /// + /// # Panics + /// + /// This function will panic if the two slices have different lengths. + pub fn copy_to(&self, dest: &mut [T]) { + if dest.len() != self.len { + panic!( + "source slice length ({}) does not match destination slice length ({})", + self.len(), + dest.len(), + ); + } + + for (i, elem) in dest.iter_mut().enumerate() { + *elem = unsafe { self.data.add(i).read_unaligned() }; + } + } + + /// Copy the data to an aligned [`MaybeUninit`] buffer. + /// + /// The length of `dest` must be the same as `self`. + /// + /// This function fully initializes the `dest` slice. + /// + /// # Panics + /// + /// This function will panic if the two slices have different lengths. + pub fn copy_to_maybe_uninit(&self, dest: &mut [MaybeUninit]) { + if dest.len() != self.len { + panic!( + "source slice length ({}) does not match destination slice length ({})", + self.len(), + dest.len(), + ); + } + + for (i, elem) in dest.iter_mut().enumerate() { + unsafe { elem.write(self.data.add(i).read_unaligned()) }; + } + } + + /// Copies `self` into a new `Vec`. 
+ #[cfg(feature = "alloc")] + #[must_use] + pub fn to_vec(&self) -> Vec { + let len = self.len(); + let mut v = Vec::with_capacity(len); + unsafe { + self.copy_to_maybe_uninit(v.spare_capacity_mut()); + v.set_len(len); + } + v + } +} + +impl<'a, T: Copy + Debug> Debug for UnalignedSlice<'a, T> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.iter()).finish() + } +} + +#[cfg(feature = "alloc")] +impl<'a, T: Copy> From> for Vec { + fn from(input: UnalignedSlice<'a, T>) -> Self { + input.to_vec() + } +} + +impl<'a, T: Copy> IntoIterator for UnalignedSlice<'a, T> { + type Item = T; + type IntoIter = UnalignedSliceIntoIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + UnalignedSliceIntoIter { + slice: self, + index: 0, + } + } +} + +impl<'a, T: Copy> IntoIterator for &'a UnalignedSlice<'a, T> { + type Item = T; + type IntoIter = UnalignedSliceIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Iterator for a [`UnalignedSlice`]. +#[derive(Debug)] +pub struct UnalignedSliceIntoIter<'a, T: Copy> { + slice: UnalignedSlice<'a, T>, + index: usize, +} + +impl<'a, T: Copy> Iterator for UnalignedSliceIntoIter<'a, T> { + type Item = T; + + fn next(&mut self) -> Option { + let output = self.slice.get(self.index)?; + self.index += 1; + Some(output) + } +} + +/// Iterator for a [`UnalignedSlice`] reference. +#[derive(Debug)] +pub struct UnalignedSliceIter<'a, T: Copy> { + slice: &'a UnalignedSlice<'a, T>, + index: usize, +} + +impl<'a, T: Copy> Iterator for UnalignedSliceIter<'a, T> { + type Item = T; + + fn next(&mut self) -> Option { + let output = self.slice.get(self.index)?; + self.index += 1; + Some(output) + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + use alloc::vec::Vec; + + #[test] + fn test_unaligned_slice() { + #[rustfmt::skip] + let bytes: [u8; 13] = [ + // Extra byte to make the rest of the data unaligned. + 0, + // First element. 
+ 0x10, 0x11, 0x12, 0x13, + // Second element. + 0x20, 0x21, 0x22, 0x23, + // Third element. + 0x30, 0x31, 0x32, 0x33, + ]; + + // Skip past the first byte and create an unaligned `*const u32` pointer. + let bytes = &bytes[1..]; + let slice_ptr: *const u32 = bytes.as_ptr().cast(); + + let slice: UnalignedSlice = unsafe { UnalignedSlice::new(slice_ptr, 0) }; + assert!(slice.is_empty()); + + let slice: UnalignedSlice = unsafe { UnalignedSlice::new(slice_ptr, 3) }; + assert!(!slice.is_empty()); + assert_eq!(slice.len(), 3); + + assert_eq!(slice.get(0), Some(0x13121110)); + assert_eq!(slice.get(1), Some(0x23222120)); + assert_eq!(slice.get(2), Some(0x33323130)); + assert_eq!(slice.get(3), None); + + let mut copy = [0; 3]; + slice.copy_to(&mut copy); + assert_eq!(copy, [0x13121110, 0x23222120, 0x33323130]); + + assert_eq!( + slice.iter().collect::>(), + [0x13121110, 0x23222120, 0x33323130] + ); + + assert_eq!( + slice.into_iter().collect::>(), + [0x13121110, 0x23222120, 0x33323130] + ); + } +} diff --git a/tests/tests.rs b/tests/encoding_tests.rs similarity index 96% rename from tests/tests.rs rename to tests/encoding_tests.rs index f2ab0f4..84be261 100644 --- a/tests/tests.rs +++ b/tests/encoding_tests.rs @@ -1,4 +1,4 @@ -use ucs2::{decode, decode_with, encode, ucs2_cstr, Error}; +use ucs2::encoding::{decode, decode_with, encode, ucs2_cstr, Error}; #[test] fn encoding() { From 95ca4a8174db5005ea60e4dea32c060c1299576d Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 18:29:43 +0200 Subject: [PATCH 6/7] init modules cstring16 --- src/polyfill.rs | 1 + src/types/cstr16.rs | 2 + src/types/cstr8.rs | 3 - src/types/cstring16.rs | 380 +++++++++++++++++++++++++++++++++++++++++ src/types/macros.rs | 4 +- src/types/mod.rs | 11 +- 6 files changed, 394 insertions(+), 7 deletions(-) create mode 100644 src/types/cstring16.rs diff --git a/src/polyfill.rs b/src/polyfill.rs index 435481b..336a9be 100644 --- a/src/polyfill.rs +++ b/src/polyfill.rs @@ -8,6 +8,7 @@ 
use {alloc::vec::Vec, core::mem::ManuallyDrop}; /// Polyfill for the unstable `MaybeUninit::slice_assume_init_ref` function. /// /// See . +#[allow(clippy::missing_safety_doc)] pub const unsafe fn maybe_uninit_slice_assume_init_ref(s: &[MaybeUninit]) -> &[T] { unsafe { &*(s as *const [MaybeUninit] as *const [T]) } } diff --git a/src/types/cstr16.rs b/src/types/cstr16.rs index 2abe512..52ed004 100644 --- a/src/types/cstr16.rs +++ b/src/types/cstr16.rs @@ -1,6 +1,8 @@ use crate::polyfill::maybe_uninit_slice_assume_init_ref; use crate::types::chars::{Char16, NUL_16}; use crate::types::unaligned_slice::UnalignedSlice; +#[cfg(feature = "alloc")] +use crate::types::CString16; use crate::types::{EqStrUntilNul, FromSliceWithNulError, FromStrWithBufError}; use core::borrow::Borrow; use core::fmt::{Display, Formatter}; diff --git a/src/types/cstr8.rs b/src/types/cstr8.rs index d46517b..f53d0cf 100644 --- a/src/types/cstr8.rs +++ b/src/types/cstr8.rs @@ -1,6 +1,3 @@ -//! Rusty-types to work with UCS-2 strings and for convenient interoperability -//! with Rust string literals (`&str`) and Rust strings (`String`). - use crate::types::chars::{Char8, NUL_8}; use crate::types::{EqStrUntilNul, FromSliceWithNulError}; use core::borrow::Borrow; diff --git a/src/types/cstring16.rs b/src/types/cstring16.rs new file mode 100644 index 0000000..1d095fb --- /dev/null +++ b/src/types/cstring16.rs @@ -0,0 +1,380 @@ +use super::chars::{Char16, NUL_16}; +use crate::polyfill::vec_into_raw_parts; +use alloc::borrow::{Borrow, ToOwned}; +use alloc::string::String; +use alloc::vec; +use alloc::vec::Vec; +use core::fmt::{self, Display, Formatter}; +use core::ops; +use crate::types::{CStr16, EqStrUntilNul, FromSliceWithNulError}; +use crate::types::unaligned_slice::UnalignedSlice; + +/// Error returned by [`CString16::try_from::<&str>`]. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum FromStrError { + /// Character conversion error. + InvalidChar, + /// Nul character found in the input. 
+ InteriorNul, +} + +impl Display for FromStrError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "UCS-2 Conversion Error: {}", + match self { + Self::InvalidChar => "Invalid character", + Self::InteriorNul => "Interior null terminator", + } + ) + } +} + +#[cfg(feature = "unstable")] +impl core::error::Error for FromStrError {} + +/// An owned UCS-2 null-terminated string. +/// +/// For convenience, a [`CString16`] is comparable with `&str` and `String` from +/// the standard library through the trait [`EqStrUntilNul`]. +/// +/// # Examples +/// +/// Round-trip conversion from a [`&str`] to a `CString16` and back: +/// +/// ``` +/// use ucs2::types::CString16; +/// +/// let s = CString16::try_from("abc").unwrap(); +/// assert_eq!(s.to_string(), "abc"); +/// ``` +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct CString16(Vec); + +impl CString16 { + /// Creates a new empty string with a terminating null character. + #[must_use] + pub fn new() -> Self { + Self(vec![NUL_16]) + } + + /// Inserts a character at the end of the string, right before the null + /// character. + /// + /// # Panics + /// Panics if the char is a null character. + pub fn push(&mut self, char: Char16) { + assert_ne!(char, NUL_16, "Pushing a null-character is illegal"); + let last_elem = self + .0 + .last_mut() + .expect("There should be at least a null character"); + *last_elem = char; + self.0.push(NUL_16); + } + + /// Extends the string with the given [`CStr16`]. The null character is + /// automatically kept at the end. + pub fn push_str(&mut self, str: &CStr16) { + str.as_slice() + .iter() + .copied() + .for_each(|char| self.push(char)); + } + + /// Replaces all chars in the string with the replace value in-place. 
+ pub fn replace_char(&mut self, search: Char16, replace: Char16) { + assert_ne!(search, NUL_16, "Replacing a null character is illegal"); + assert_ne!( + replace, NUL_16, + "Replacing with a null character is illegal" + ); + self.0 + .as_mut_slice() + .iter_mut() + .filter(|char| **char == search) + .for_each(|char| *char = replace); + } + + /// Returns the number of characters without the trailing null character. + #[must_use] + pub fn num_chars(&self) -> usize { + self.0.len() - 1 + } + + /// Returns if the string is empty. This ignores the null character. + #[must_use] + pub fn is_empty(&self) -> bool { + self.num_chars() == 0 + } +} + +impl Default for CString16 { + fn default() -> Self { + Self::new() + } +} + +impl TryFrom<&str> for CString16 { + type Error = FromStrError; + + fn try_from(input: &str) -> Result { + // Initially allocate one Char16 for each byte of the input, plus + // one for the null character. This should be a good guess for ASCII-ish + // input. + let mut output = Vec::with_capacity(input.len() + 1); + + // Convert to UTF-16, then convert to UCS-2. + for c in input.encode_utf16() { + let c = Char16::try_from(c).map_err(|_| FromStrError::InvalidChar)?; + + // Check for interior nul chars. + if c == NUL_16 { + return Err(FromStrError::InteriorNul); + } + + output.push(c); + } + + // Add trailing nul. + output.push(NUL_16); + + Ok(Self(output)) + } +} + +impl TryFrom> for CString16 { + type Error = FromSliceWithNulError; + + fn try_from(input: Vec) -> Result { + // Try creating a CStr16 from the input. We throw away the + // result if successful, but it takes care of all the necessary + // validity checks (valid UCS-2, ends in null, contains no + // interior nulls). + CStr16::from_u16_with_nul(&input)?; + + // Convert the input vector from `u16` to `Char16`. + // + // Safety: `Char16` is a transparent struct wrapping `u16`, so + // the types are compatible. The pattern used here matches the + // example in the docs for `into_raw_parts`. 
+ let (ptr, len, cap) = vec_into_raw_parts(input); + let rebuilt = unsafe { + let ptr = ptr.cast::(); + Vec::from_raw_parts(ptr, len, cap) + }; + + Ok(Self(rebuilt)) + } +} + +impl<'a> TryFrom<&UnalignedSlice<'a, u16>> for CString16 { + type Error = FromSliceWithNulError; + + fn try_from(input: &UnalignedSlice) -> Result { + let v = input.to_vec(); + Self::try_from(v) + } +} + +impl From<&CStr16> for CString16 { + fn from(value: &CStr16) -> Self { + let vec = value.as_slice_with_nul().to_vec(); + Self(vec) + } +} + +impl From<&CString16> for String { + fn from(value: &CString16) -> Self { + let slice: &CStr16 = value.as_ref(); + Self::from(slice) + } +} + +impl<'a> UnalignedSlice<'a, u16> { + /// Copies `self` to a new [`CString16`]. + pub fn to_cstring16(&self) -> Result { + CString16::try_from(self) + } +} + +impl ops::Deref for CString16 { + type Target = CStr16; + + fn deref(&self) -> &CStr16 { + unsafe { &*(self.0.as_slice() as *const [Char16] as *const CStr16) } + } +} + +impl AsRef for CString16 { + fn as_ref(&self) -> &CStr16 { + self + } +} + +impl Borrow for CString16 { + fn borrow(&self) -> &CStr16 { + self + } +} + +impl ToOwned for CStr16 { + type Owned = CString16; + + fn to_owned(&self) -> CString16 { + CString16(self.as_slice_with_nul().to_vec()) + } +} + +impl fmt::Display for CString16 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.as_ref().fmt(f) + } +} + +impl PartialEq<&CStr16> for CString16 { + fn eq(&self, other: &&CStr16) -> bool { + PartialEq::eq(self.as_ref(), other) + } +} + +impl + ?Sized> EqStrUntilNul for CString16 { + fn eq_str_until_nul(&self, other: &StrType) -> bool { + let this = self.as_ref(); + this.eq_str_until_nul(other) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cstr16; + use alloc::string::String; + use alloc::vec; + + #[test] + fn test_cstring16_from_str() { + assert_eq!( + CString16::try_from("x").unwrap(), + CString16(vec![Char16::try_from('x').unwrap(), NUL_16]) + ); + + 
assert_eq!(CString16::try_from("😀"), Err(FromStrError::InvalidChar)); + + assert_eq!(CString16::try_from("x\0"), Err(FromStrError::InteriorNul)); + } + + #[test] + fn test_cstring16_from_u16_vec() { + // Test that invalid inputs are caught. + assert_eq!( + CString16::try_from(vec![]), + Err(FromSliceWithNulError::NotNulTerminated) + ); + assert_eq!( + CString16::try_from(vec![b'a'.into(), 0, b'b'.into(), 0]), + Err(FromSliceWithNulError::InteriorNul(1)) + ); + assert_eq!( + CString16::try_from(vec![0xd800, 0]), + Err(FromSliceWithNulError::InvalidChar(0)) + ); + + // Test valid input. + assert_eq!( + CString16::try_from(vec![b'x'.into(), 0]).unwrap(), + CString16::try_from("x").unwrap() + ); + } + + /// Test `CString16 == &CStr16` and `&CStr16 == CString16`. + #[test] + fn test_cstring16_cstr16_eq() { + assert_eq!( + cstr16!("abc"), + CString16::try_from("abc").unwrap() + ); + + assert_eq!( + CString16::try_from("abc").unwrap(), + cstr16!("abc") + ); + } + + /// Tests the trait implementation of trait [`EqStrUntilNul]` for [`CString16`]. + /// + /// This tests that `String` and `str` from the standard library can be + /// checked for equality against a [`CString16`]. It checks both directions, + /// i.e., the equality is reflexive. + #[test] + fn test_cstring16_eq_std_str() { + let input = CString16::try_from("test").unwrap(); + + assert!(input.eq_str_until_nul("test")); // requires ?Sized constraint + assert!(input.eq_str_until_nul(&"test")); + assert!(input.eq_str_until_nul(&String::from("test"))); + + // now other direction + assert!(String::from("test").eq_str_until_nul(&input)); + assert!("test".eq_str_until_nul(&input)); + } + + /// Test the `Borrow` and `ToOwned` impls. 
+ #[test] + fn test_borrow_and_to_owned() { + let s1: &CStr16 = cstr16!("ab"); + let owned: CString16 = s1.to_owned(); + let s2: &CStr16 = owned.borrow(); + assert_eq!(s1, s2); + assert_eq!( + owned.0, + [ + Char16::try_from('a').unwrap(), + Char16::try_from('b').unwrap(), + NUL_16 + ] + ); + } + + /// This tests the following UCS-2 string functions: + /// - runtime constructor + /// - len() + /// - push() / push_str() + /// - to rust string + #[test] + fn test_push_str() { + let mut str1 = CString16::new(); + assert_eq!(str1.num_bytes(), 2, "Should have null character"); + assert_eq!(str1.num_chars(), 0); + str1.push(Char16::try_from('h').unwrap()); + str1.push(Char16::try_from('i').unwrap()); + assert_eq!(str1.num_chars(), 2); + + let mut str2 = CString16::new(); + str2.push(Char16::try_from('!').unwrap()); + + str2.push_str(str1.as_ref()); + assert_eq!(str2.num_chars(), 3); + + let rust_str = String::from(&str2); + assert_eq!(rust_str, "!hi"); + } + + #[test] + #[should_panic] + fn test_push_str_panic() { + CString16::new().push(NUL_16); + } + + #[test] + fn test_char_replace_all_in_place() { + let mut input = CString16::try_from("foo/bar/foobar//").unwrap(); + let search = Char16::try_from('/').unwrap(); + let replace = Char16::try_from('\\').unwrap(); + input.replace_char(search, replace); + + let input = String::from(&input); + assert_eq!(input, "foo\\bar\\foobar\\\\") + } +} diff --git a/src/types/macros.rs b/src/types/macros.rs index c07e083..77a45b8 100644 --- a/src/types/macros.rs +++ b/src/types/macros.rs @@ -21,7 +21,7 @@ /// assert_eq!(cstr8!(""), EMPTY); /// ``` /// -/// [`&CStr8`]: crate::CStr8 +/// [`&CStr8`]: crate::types::CStr8 #[macro_export] macro_rules! cstr8 { () => {{ @@ -65,7 +65,7 @@ macro_rules! cstr8 { /// assert_eq!(cstr16!(""), EMPTY); /// ``` /// -/// [`&CStr16`]: crate::CStr16 +/// [`&CStr16`]: crate::types::CStr16 #[macro_export] macro_rules! 
cstr16 { () => {{ diff --git a/src/types/mod.rs b/src/types/mod.rs index 7623c1d..14ba9f5 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,17 +1,24 @@ -//! TODO. +//! Rusty-types to work with UCS-2 strings and for convenient interoperability +//! with Rust string literals (`&str`) and Rust strings (`String`). -pub(self) mod chars; +mod chars; mod cstr16; mod cstr8; +#[cfg(feature = "alloc")] +mod cstring16; mod macros; mod unaligned_slice; pub use crate::cstr16; pub use crate::cstr8; +pub use chars::*; use core::fmt; use core::fmt::{Display, Formatter}; pub use cstr16::*; pub use cstr8::*; +#[cfg(feature = "alloc")] +pub use cstring16::*; +pub use unaligned_slice::*; /// Errors which can occur during checked `[uN]` -> `CStrN` conversions #[derive(Clone, Copy, Debug, Eq, PartialEq)] From 7cafab4125f5aaa40a60e9ecfa45ba18eaa5b5e9 Mon Sep 17 00:00:00 2001 From: Philipp Schuster Date: Tue, 30 Jul 2024 18:33:38 +0200 Subject: [PATCH 7/7] cargo fmt --- src/types/cstring16.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/types/cstring16.rs b/src/types/cstring16.rs index 1d095fb..e6b03bc 100644 --- a/src/types/cstring16.rs +++ b/src/types/cstring16.rs @@ -1,13 +1,13 @@ use super::chars::{Char16, NUL_16}; use crate::polyfill::vec_into_raw_parts; +use crate::types::unaligned_slice::UnalignedSlice; +use crate::types::{CStr16, EqStrUntilNul, FromSliceWithNulError}; use alloc::borrow::{Borrow, ToOwned}; use alloc::string::String; use alloc::vec; use alloc::vec::Vec; use core::fmt::{self, Display, Formatter}; use core::ops; -use crate::types::{CStr16, EqStrUntilNul, FromSliceWithNulError}; -use crate::types::unaligned_slice::UnalignedSlice; /// Error returned by [`CString16::try_from::<&str>`]. #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -291,15 +291,9 @@ mod tests { /// Test `CString16 == &CStr16` and `&CStr16 == CString16`. 
#[test] fn test_cstring16_cstr16_eq() { - assert_eq!( - cstr16!("abc"), - CString16::try_from("abc").unwrap() - ); + assert_eq!(cstr16!("abc"), CString16::try_from("abc").unwrap()); - assert_eq!( - CString16::try_from("abc").unwrap(), - cstr16!("abc") - ); + assert_eq!(CString16::try_from("abc").unwrap(), cstr16!("abc")); } /// Tests the trait implementation of trait [`EqStrUntilNul]` for [`CString16`].