Skip to content

move CStr8, CStr16, and CString16 from uefi to ucs2 #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
root = true

[*]
charset = utf-8
end_of_line = lf
max_line_length = 80
insert_final_newline = true
trim_trailing_whitespace = true

[*.{rs,py}]
indent_style = space
indent_size = 4

[*.{json,md,nix,toml,yml}]
indent_style = space
indent_size = 2
12 changes: 7 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[package]
name = "ucs2"
version = "0.3.3"
authors = ["Gabriel Majeri <gabriel.majeri6@gmail.com>", "Fredrik Aleksander", "Isaac Woods"]
description = "UCS-2 decoding and encoding functions"
version = "0.4.0"
authors = ["The Rust OSDev team"]
description = "UCS-2 decoding and encoding functions as well as convenient types."
repository = "https://github.com/rust-osdev/ucs2-rs"
keywords = ["ucs2", "no-std", "encoding"]
categories = ["encoding", "no-std"]
Expand All @@ -13,5 +13,7 @@ rust-version = "1.56"
[dependencies]
bit_field = "0.10"

[badges]
maintenance = { status = "passively-maintained" }
[features]
default = []
alloc = []
unstable = []
6 changes: 3 additions & 3 deletions src/macros.rs → src/encoding/macros.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{ucs2_from_utf8_at_offset, Error};
use super::{ucs2_from_utf8_at_offset, Error};

/// Count the number of UCS-2 characters in a string. Return an error if
/// the string cannot be encoded in UCS-2.
Expand Down Expand Up @@ -82,13 +82,13 @@ macro_rules! ucs2_cstr {
// Use `const` values here to force errors to happen at compile
// time.

const NUM_CHARS: usize = match $crate::str_num_ucs2_chars($s) {
const NUM_CHARS: usize = match $crate::encoding::str_num_ucs2_chars($s) {
// Add one for the null char.
Ok(num) => num + 1,
Err(_) => panic!("input contains a character which cannot be represented in UCS-2"),
};

const VAL: [u16; NUM_CHARS] = match $crate::str_to_ucs2($s) {
const VAL: [u16; NUM_CHARS] = match $crate::encoding::str_to_ucs2($s) {
Ok(val) => val,
// The string was already checked by `str_num_ucs2_chars`,
// so this error is unreachable.
Expand Down
224 changes: 224 additions & 0 deletions src/encoding/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
//! Low-level encoding and decoding facilities for UCS-2 strings.

mod macros;

pub use crate::ucs2_cstr;
/// These need to be public for the `ucs2_cstr!` macro, but are not
/// intended to be called directly.
#[doc(hidden)]
pub use macros::{str_num_ucs2_chars, str_to_ucs2};

use bit_field::BitField;
use core::fmt::{self, Display, Formatter};

/// Possible errors when encoding UCS-2 strings..
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Error {
/// Not enough space left in the output buffer.
BufferOverflow,
/// Input contained a character which cannot be represented in UCS-2.
MultiByte,
}

impl Display for Error {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::BufferOverflow => f.write_str("output buffer is too small"),
Self::MultiByte => {
f.write_str("input contains a character which cannot be represented in UCS-2")
}
}
}
}

type Result<T> = core::result::Result<T, Error>;

/// Value returned by `ucs2_from_utf8_at_offset`.
struct Ucs2CharFromUtf8 {
/// UCS-2 character.
val: u16,
/// Number of bytes needed to encode the character in UTF-8.
num_bytes: u8,
}

/// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
///
/// # Safety
///
/// The input `bytes` must be valid UTF-8.
const unsafe fn ucs2_from_utf8_at_offset(bytes: &[u8], offset: usize) -> Result<Ucs2CharFromUtf8> {
let len = bytes.len();
let ch;
let ch_len;

if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
ch = bytes[offset] as u16;
ch_len = 1;
} else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
// 2 byte codepoint
if offset + 1 >= len {
// safe: len is the length of bytes,
// and bytes is a direct view into the
// buffer of input, which in order to be a valid
// utf-8 string _must_ contain `i + 1`.
unsafe { core::hint::unreachable_unchecked() }
}

let a = (bytes[offset] & 0b0001_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
ch = a << 6 | b;
ch_len = 2;
} else if bytes[offset] & 0b1111_0000 == 0b1110_0000 {
// 3 byte codepoint
if offset + 2 >= len || offset + 1 >= len {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

let a = (bytes[offset] & 0b0000_1111) as u16;
let b = (bytes[offset + 1] & 0b0011_1111) as u16;
let c = (bytes[offset + 2] & 0b0011_1111) as u16;
ch = a << 12 | b << 6 | c;
ch_len = 3;
} else if bytes[offset] & 0b1111_0000 == 0b1111_0000 {
return Err(Error::MultiByte); // UTF-16
} else {
// safe: impossible utf-8 string.
unsafe { core::hint::unreachable_unchecked() }
}

Ok(Ucs2CharFromUtf8 {
val: ch,
num_bytes: ch_len,
})
}

/// Encodes an input UTF-8 string into a UCS-2 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// measured in 2-byte characters.
pub fn encode(input: &str, buffer: &mut [u16]) -> Result<usize> {
let buffer_size = buffer.len();
let mut i = 0;

encode_with(input, |ch| {
if i >= buffer_size {
Err(Error::BufferOverflow)
} else {
buffer[i] = ch;
i += 1;
Ok(())
}
})?;

Ok(i)
}

/// Encode UTF-8 string to UCS-2 with a custom callback function.
///
/// `output` is a function which receives every encoded character.
pub fn encode_with<F>(input: &str, mut output: F) -> Result<()>
where
F: FnMut(u16) -> Result<()>,
{
let bytes = input.as_bytes();
let len = bytes.len();
let mut i = 0;

while i < len {
// SAFETY: `bytes` is valid UTF-8.
let ch = unsafe { ucs2_from_utf8_at_offset(bytes, i) }?;
i += usize::from(ch.num_bytes);
output(ch.val)?;
}
Ok(())
}

/// Decode UCS-2 string to UTF-8 with a custom callback function.
///
/// `output` is a function which receives every decoded character.
/// Due to the nature of UCS-2, the function can receive an UTF-8 character
/// of up to three bytes, for every input character.
pub fn decode_with<F>(input: &[u16], mut output: F) -> Result<usize>
where
F: FnMut(&[u8]) -> Result<()>,
{
let mut written = 0;

for ch in input.iter() {
/*
* We need to find how many bytes of UTF-8 this UCS-2 code-point needs. Because UCS-2 can only encode
* the Basic Multilingual Plane, a maximum of three bytes are needed.
*/
if (0x000..0x0080).contains(ch) {
output(&[*ch as u8])?;

written += 1;
} else if (0x0080..0x0800).contains(ch) {
let first = 0b1100_0000 + ch.get_bits(6..11) as u8;
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;

output(&[first, last])?;

written += 2;
} else {
let first = 0b1110_0000 + ch.get_bits(12..16) as u8;
let mid = 0b1000_0000 + ch.get_bits(6..12) as u8;
let last = 0b1000_0000 + ch.get_bits(0..6) as u8;

output(&[first, mid, last])?;

written += 3;
}
}

Ok(written)
}

/// Decode an input UCS-2 string into a UTF-8 string.
///
/// The returned `usize` represents the length of the returned buffer,
/// in bytes. Due to the nature of UCS-2, the output buffer could end up with
/// three bytes for every character in the input buffer.
pub fn decode(input: &[u16], output: &mut [u8]) -> Result<usize> {
let buffer_size = output.len();
let mut i = 0;

decode_with(input, |bytes| {
if bytes.len() == 1 {
// Can be encoded in a single byte
if i >= buffer_size {
return Err(Error::BufferOverflow);
}

output[i] = bytes[0];

i += 1;
} else if bytes.len() == 2 {
// Can be encoded two bytes
if i + 1 >= buffer_size {
return Err(Error::BufferOverflow);
}

output[i] = bytes[0];
output[i + 1] = bytes[1];

i += 2;
} else if bytes.len() == 3 {
// Can be encoded three bytes
if i + 2 >= buffer_size {
return Err(Error::BufferOverflow);
}

output[i] = bytes[0];
output[i + 1] = bytes[1];
output[i + 2] = bytes[2];

i += 3;
} else {
unreachable!("More than three bytes per UCS-2 character.");
}

Ok(())
})
}
Loading
Loading