Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some work on std::ascii #9059

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 77 additions & 20 deletions src/libstd/str/ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ use str::StrSlice;
use str::OwnedStr;
use container::Container;
use cast;
use ptr;
use iter::Iterator;
use vec::{CopyableVector, ImmutableVector};
use vec::{CopyableVector, ImmutableVector, MutableVector};
use to_bytes::IterBytes;
use option::{Some, None};

Expand Down Expand Up @@ -61,7 +60,7 @@ impl Ascii {
impl ToStr for Ascii {
#[inline]
fn to_str(&self) -> ~str {
// self.chr is allways a valid utf8 byte, no need for the check
// self.chr is always a valid utf8 byte, no need for the check
unsafe { str::raw::from_byte(self.chr) }
}
}
Expand Down Expand Up @@ -253,16 +252,29 @@ impl ToBytesConsume for ~[Ascii] {
}
}

/// Extension methods for ASCII-subset only operations on strings
pub trait StrAsciiExt {
/// Extension methods for ASCII-subset only operations on owned strings
pub trait OwnedStrAsciiExt {
/// Convert the string to ASCII upper case:
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
fn to_ascii_upper(&self) -> ~str;
fn into_ascii_upper(self) -> ~str;

/// Convert the string to ASCII lower case:
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
fn into_ascii_lower(self) -> ~str;
}

/// Extension methods for ASCII-subset only operations on string slices
pub trait StrAsciiExt {
/// Makes a copy of the string in ASCII upper case:
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
fn to_ascii_upper(&self) -> ~str;

/// Makes a copy of the string in ASCII lower case:
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
fn to_ascii_lower(&self) -> ~str;

/// Check that two strings are an ASCII case-insensitive match.
Expand All @@ -274,12 +286,12 @@ pub trait StrAsciiExt {
impl<'self> StrAsciiExt for &'self str {
#[inline]
fn to_ascii_upper(&self) -> ~str {
map_bytes(*self, ASCII_UPPER_MAP)
unsafe { str_copy_map_bytes(*self, ASCII_UPPER_MAP) }
}

#[inline]
fn to_ascii_lower(&self) -> ~str {
map_bytes(*self, ASCII_LOWER_MAP)
unsafe { str_copy_map_bytes(*self, ASCII_LOWER_MAP) }
}

#[inline]
Expand All @@ -289,20 +301,34 @@ impl<'self> StrAsciiExt for &'self str {
}
}

impl OwnedStrAsciiExt for ~str {
#[inline]
fn into_ascii_upper(self) -> ~str {
unsafe { str_map_bytes(self, ASCII_UPPER_MAP) }
}

#[inline]
fn into_ascii_lower(self) -> ~str {
unsafe { str_map_bytes(self, ASCII_LOWER_MAP) }
}
}

#[inline]
fn map_bytes(string: &str, map: &'static [u8]) -> ~str {
let len = string.len();
let mut result = str::with_capacity(len);
unsafe {
do result.as_mut_buf |mut buf, _| {
for c in string.as_bytes().iter() {
*buf = map[*c];
buf = ptr::mut_offset(buf, 1)
}
}
str::raw::set_len(&mut result, len);
unsafe fn str_map_bytes(string: ~str, map: &'static [u8]) -> ~str {
let mut bytes = string.into_bytes();

for b in bytes.mut_iter() {
*b = map[*b];
}
result

str::raw::from_utf8_owned(bytes)
}

#[inline]
unsafe fn str_copy_map_bytes(string: &str, map: &'static [u8]) -> ~str {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this just be str_map_bytes(string.to_owned(), map)? (Or, is having this as a separate function even necessary, can that snippet just be inlined?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm... Keeping them separated is vaguely motivated by not having unnecessary memory accesses.

This is purely baseless speculation, but:

  • If I make a copy first, and then map over that again, that's iterating twice over the same memory segment.
  • If instead I iterate over the source, and directly collect the mapped values in an already preallocated target, that's just one iteration that might even get vectorized in the future because no aliasing is happening.

let bytes = string.byte_iter().map(|b| map[b]).to_owned_vec();

str::raw::from_utf8_owned(bytes)
}

static ASCII_LOWER_MAP: &'static [u8] = &[
Expand Down Expand Up @@ -494,6 +520,37 @@ mod tests {
}
}

#[test]
fn test_into_ascii_upper() {
assert_eq!((~"url()URL()uRl()ürl").into_ascii_upper(), ~"URL()URL()URL()üRL");
assert_eq!((~"hıKß").into_ascii_upper(), ~"HıKß");

let mut i = 0;
while i <= 500 {
let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
else { i };
assert_eq!(from_char(from_u32(i).unwrap()).into_ascii_upper(),
from_char(from_u32(upper).unwrap()))
i += 1;
}
}

#[test]
fn test_into_ascii_lower() {
assert_eq!((~"url()URL()uRl()Ürl").into_ascii_lower(), ~"url()url()url()Ürl");
// Dotted capital I, Kelvin sign, Sharp S.
assert_eq!((~"HİKß").into_ascii_lower(), ~"hİKß");

let mut i = 0;
while i <= 500 {
let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
else { i };
assert_eq!(from_char(from_u32(i).unwrap()).into_ascii_lower(),
from_char(from_u32(lower).unwrap()))
i += 1;
}
}

#[test]
fn test_eq_ignore_ascii_case() {
assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
Expand Down