Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ffi::OsString and OsStr #21488

Merged
merged 1 commit into from
Jan 24, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 58 additions & 38 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,49 +258,69 @@ impl CharExt for char {
#[inline]
#[unstable = "pending decision about Iterator/Writer/Reader"]
fn encode_utf8(self, dst: &mut [u8]) -> Option<uint> {
// Marked #[inline] to allow llvm optimizing it away
let code = self as u32;
if code < MAX_ONE_B && dst.len() >= 1 {
dst[0] = code as u8;
Some(1)
} else if code < MAX_TWO_B && dst.len() >= 2 {
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(2)
} else if code < MAX_THREE_B && dst.len() >= 3 {
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(3)
} else if dst.len() >= 4 {
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
Some(4)
} else {
None
}
encode_utf8_raw(self as u32, dst)
}

#[inline]
#[unstable = "pending decision about Iterator/Writer/Reader"]
fn encode_utf16(self, dst: &mut [u16]) -> Option<uint> {
// Marked #[inline] to allow llvm optimizing it away
let mut ch = self as u32;
if (ch & 0xFFFF_u32) == ch && dst.len() >= 1 {
// The BMP falls through (assuming non-surrogate, as it should)
dst[0] = ch as u16;
Some(1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
ch -= 0x1_0000_u32;
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
Some(2)
} else {
None
}
encode_utf16_raw(self as u32, dst)
}
}

/// Writes the UTF-8 encoding of the raw `u32` value `code` to the front of
/// `dst` and returns the number of bytes that were written.
///
/// No check is made that `code` is a valid Unicode scalar value: anything
/// that is not below `MAX_THREE_B` is emitted in the four-byte form.
///
/// When `dst` is too short to hold the encoding, `dst` is left untouched
/// and `None` is returned.
#[inline]
#[unstable]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<uint> {
    // Kept #[inline] so that LLVM can optimize the call away entirely.
    let room = dst.len();

    // One byte: the value is stored as-is.
    if code < MAX_ONE_B && room >= 1 {
        dst[0] = code as u8;
        return Some(1);
    }

    // Two bytes: 5 payload bits in the lead byte, 6 in the trailing byte.
    if code < MAX_TWO_B && room >= 2 {
        let lead = (code >> 6u & 0x1F_u32) as u8;
        let last = (code & 0x3F_u32) as u8;
        dst[0] = lead | TAG_TWO_B;
        dst[1] = last | TAG_CONT;
        return Some(2);
    }

    // Three bytes: 4 payload bits in the lead byte, then 6 + 6.
    if code < MAX_THREE_B && room >= 3 {
        let lead = (code >> 12u & 0x0F_u32) as u8;
        let mid = (code >> 6u & 0x3F_u32) as u8;
        let last = (code & 0x3F_u32) as u8;
        dst[0] = lead | TAG_THREE_B;
        dst[1] = mid | TAG_CONT;
        dst[2] = last | TAG_CONT;
        return Some(3);
    }

    // Four bytes: 3 payload bits in the lead byte, then 6 + 6 + 6.
    if room >= 4 {
        let lead = (code >> 18u & 0x07_u32) as u8;
        let upper = (code >> 12u & 0x3F_u32) as u8;
        let lower = (code >> 6u & 0x3F_u32) as u8;
        let last = (code & 0x3F_u32) as u8;
        dst[0] = lead | TAG_FOUR_B;
        dst[1] = upper | TAG_CONT;
        dst[2] = lower | TAG_CONT;
        dst[3] = last | TAG_CONT;
        return Some(4);
    }

    // Not enough room for the width this value needs.
    None
}

/// Writes the UTF-16 encoding of the raw `u32` value `ch` to the front of
/// `dst` and returns the number of `u16` units that were written.
///
/// Values up to `0xFFFF` are stored as a single unit without any surrogate
/// check; larger values are split into a surrogate pair.
///
/// When `dst` is too short to hold the encoding, `dst` is left untouched
/// and `None` is returned.
#[inline]
#[unstable]
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<uint> {
    // Kept #[inline] so that LLVM can optimize the call away entirely.
    let room = dst.len();

    if ch <= 0xFFFF_u32 && room >= 1 {
        // The BMP is copied through (assumed to be non-surrogate).
        dst[0] = ch as u16;
        return Some(1);
    }

    if room < 2 {
        return None;
    }

    // Reaching this point with room for two units implies `ch > 0xFFFF`:
    // rebase onto the supplementary planes and split into two 10-bit halves.
    ch -= 0x1_0000_u32;
    let high = 0xD800_u16 | ((ch >> 10) as u16);
    let low = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
    dst[0] = high;
    dst[1] = low;
    Some(2)
}

Expand Down
116 changes: 67 additions & 49 deletions src/libcore/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
}
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// Returns `None` only when the iterator is already exhausted on entry.
/// A first byte below 128 is returned unchanged as the code point; any
/// other first byte consumes one, two or three further bytes depending on
/// whether it is below `0xE0`, below `0xF0`, or at least `0xF0`.
///
/// NOTE(review): trailing bytes are fetched through `unwrap_or_0`, which by
/// its name presumably substitutes 0 once the iterator runs dry -- confirm
/// against its definition. No validation of the trailing bytes is visible
/// in this function.
#[unstable]
pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
// Decode UTF-8
// `x` is the first byte of a multibyte sequence; the exhausted and
// single-byte cases return early from inside the match.
let x = match bytes.next() {
None => return None,
Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
Some(&next_byte) => next_byte,
};

// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte!(x, 2);
let y = unwrap_or_0(bytes.next());
// Two-byte result; overwritten below when the sequence is longer.
let mut ch = utf8_acc_cont_byte!(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
}
}

Some(ch)
}

#[stable]
impl<'a> Iterator for Chars<'a> {
type Item = char;

#[inline]
fn next(&mut self) -> Option<char> {
// Decode UTF-8, using the valid UTF-8 invariant
let x = match self.iter.next() {
None => return None,
Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
Some(&next_byte) => next_byte,
};

// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte!(x, 2);
let y = unwrap_or_0(self.iter.next());
let mut ch = utf8_acc_cont_byte!(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(self.iter.next());
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(self.iter.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
next_code_point(&mut self.iter).map(|ch| {
// str invariant says `ch` is a valid Unicode Scalar Value
unsafe {
mem::transmute(ch)
}
}

// str invariant says `ch` is a valid Unicode Scalar Value
unsafe {
Some(mem::transmute(ch))
}
})
}

#[inline]
Expand Down Expand Up @@ -1517,25 +1526,8 @@ impl StrExt for str {

#[inline]
fn char_range_at(&self, i: uint) -> CharRange {
if self.as_bytes()[i] < 128u8 {
return CharRange {ch: self.as_bytes()[i] as char, next: i + 1 };
}

// Multibyte case is a fn to allow char_range_at to inline cleanly
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
let mut val = s.as_bytes()[i] as u32;
let w = UTF8_CHAR_WIDTH[val as uint] as uint;
assert!((w != 0));

val = utf8_first_byte!(val, w);
val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 1]);
if w > 2 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, s.as_bytes()[i + 3]); }

return CharRange {ch: unsafe { mem::transmute(val) }, next: i + w};
}

return multibyte_char_range_at(self, i);
let (c, n) = char_range_at_raw(self.as_bytes(), i);
CharRange { ch: unsafe { mem::transmute(c) }, next: n }
}

#[inline]
Expand Down Expand Up @@ -1653,6 +1645,32 @@ impl StrExt for str {
fn parse<T: FromStr>(&self) -> Option<T> { FromStr::from_str(self) }
}

/// Decodes the code point that starts at byte offset `i` of the
/// UTF-8-like slice `bytes`, returning it together with the offset at
/// which the following code point begins.
///
/// # Panics
///
/// Panics if `i` (or a trailing byte the lead byte calls for) is out of
/// bounds, or if `UTF8_CHAR_WIDTH` reports a width of 0 for the lead byte.
#[inline]
#[unstable]
pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) {
    // The multibyte decoder lives in its own fn so that the single-byte
    // fast path stays small and `char_range_at` can inline cleanly.
    fn decode_multibyte(bytes: &[u8], i: uint) -> (u32, usize) {
        let first = bytes[i] as u32;
        let w = UTF8_CHAR_WIDTH[first as uint] as uint;
        assert!((w != 0));

        // Payload bits of the lead byte, then one trailing byte at a time.
        let mut val = utf8_first_byte!(first, w);
        val = utf8_acc_cont_byte!(val, bytes[i + 1]);
        if w > 2 {
            val = utf8_acc_cont_byte!(val, bytes[i + 2]);
        }
        if w > 3 {
            val = utf8_acc_cont_byte!(val, bytes[i + 3]);
        }

        (val, i + w)
    }

    let first = bytes[i];
    if first < 128u8 {
        // Single byte: the byte value is the code point.
        (first as u32, i + 1)
    } else {
        decode_multibyte(bytes, i)
    }
}

#[stable]
impl<'a> Default for &'a str {
#[stable]
Expand Down
5 changes: 5 additions & 0 deletions src/libstd/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,9 @@ pub use self::c_str::CString;
pub use self::c_str::c_str_to_bytes;
pub use self::c_str::c_str_to_bytes_with_nul;

pub use self::os_str::OsString;
pub use self::os_str::OsStr;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should also export AsOsStr

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(not sure if we also want to add IntoOsString yet)

This also raises the question of why other conversion traits for CString don't exist yet, but they're backwards compatible to add!

pub use self::os_str::AsOsStr;

mod c_str;
mod os_str;
Loading