Skip to content

Added initial stubs for unicode case folding support #5822

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 85 additions & 2 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand All @@ -15,6 +15,7 @@ use str;
use u32;
use uint;
use unicode;
use libc;

#[cfg(notest)] use cmp::Eq;

Expand Down Expand Up @@ -106,6 +107,42 @@ pub fn is_ascii(c: char) -> bool {
c - ('\x7F' & c) == '\x00'
}

/// Selects the rule set that `to_lower` and `to_upper` apply when converting
/// the case of a character.
pub enum CaseFoldingRule {
// FIXME: #5820 Add support for locale-specific unicode case folding
// See: http://www.w3.org/International/wiki/Case_folding

// The only rule available so far. `to_lower`/`to_upper` convert ASCII
// characters through libc under this rule and return every other
// character unchanged.
Default,
// Disabled placeholder for the locale-specific rule tracked by the FIXME above:
//Locale(&str)
}

/// Maps `c` to its lower case form according to `folding_rule`.
///
/// Under the `Default` rule only ASCII characters are converted (via libc's
/// `tolower`); any other character is handed back unchanged.
#[inline(always)]
pub fn to_lower(c: char, folding_rule: CaseFoldingRule) -> char {
    match folding_rule {
        Default => {
            if is_ascii(c) {
                // Only reached for ASCII input, which is what libc is asked to convert.
                unsafe { libc::tolower(c as libc::c_char) as char }
            } else {
                c
            }
        }
    }
}

/// Maps `c` to its upper case form according to `folding_rule`.
///
/// Under the `Default` rule only ASCII characters are converted (via libc's
/// `toupper`); any other character is handed back unchanged.
#[inline(always)]
pub fn to_upper(c: char, folding_rule: CaseFoldingRule) -> char {
    match folding_rule {
        Default => {
            if is_ascii(c) {
                // Only reached for ASCII input, which is what libc is asked to convert.
                unsafe { libc::toupper(c as libc::c_char) as char }
            } else {
                c
            }
        }
    }
}

/// Lower-cases `c` with the default (English) case folding rule.
///
/// Shorthand for `to_lower(c, Default)`.
#[inline(always)]
pub fn to_lower_default(c: char) -> char {
    let rule = Default;
    to_lower(c, rule)
}

/// Upper-cases `c` with the default (English) case folding rule.
///
/// Shorthand for `to_upper(c, Default)`.
#[inline(always)]
pub fn to_upper_default(c: char) -> char {
    let rule = Default;
    to_upper(c, rule)
}

/// Indicates whether the character is numeric (Nd, Nl, or No)
#[inline(always)]
pub fn is_digit(c: char) -> bool {
Expand Down Expand Up @@ -234,6 +271,34 @@ pub fn escape_default(c: char) -> ~str {
}
}

/// Returns the number of bytes this character would need if encoded in utf8
///
/// The result is between 1 and 4. Fails with "invalid character!" when the
/// code point is 2097152 (2^21) or greater.
pub fn len_utf8_bytes(c: char) -> uint {
// Exclusive upper bounds on the code points that fit in one, two, three and
// four bytes respectively: 2^7, 2^11, 2^16 and 2^21.
static max_one_b: uint = 128u;
static max_two_b: uint = 2048u;
static max_three_b: uint = 65536u;
static max_four_b: uint = 2097152u;

let code = c as uint;
// Bounds are tested in ascending order, so the first one that exceeds
// `code` determines the length.
if code < max_one_b { 1u }
else if code < max_two_b { 2u }
else if code < max_three_b { 3u }
else if code < max_four_b { 4u }
else { fail!(~"invalid character!") }
}

/// Tests whether two characters are equal when case is disregarded.
///
/// Only ASCII is handled for now: when both characters are ASCII, their
/// libc lower case forms are compared; otherwise the characters must be
/// exactly equal.
pub fn eq_ignore_case(c1: char, c2: char) -> bool {
    // FIXME: #5820 Add support for non-ascii comparisons
    if !is_ascii(c1) || !is_ascii(c2) {
        // At least one side is outside ASCII: fall back to exact comparison.
        return c1 == c2;
    }
    unsafe {
        let lowered1 = libc::tolower(c1 as libc::c_char);
        let lowered2 = libc::tolower(c2 as libc::c_char);
        lowered1 == lowered2
    }
}

/**
* Compare two chars
*
Expand Down Expand Up @@ -334,7 +399,6 @@ fn test_escape_default() {
assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6");
}


#[test]
fn test_escape_unicode() {
assert_eq!(escape_unicode('\x00'), ~"\\x00");
Expand All @@ -344,3 +408,22 @@ fn test_escape_unicode() {
assert_eq!(escape_unicode('\u011b'), ~"\\u011b");
assert_eq!(escape_unicode('\U0001d4b6'), ~"\\U0001d4b6");
}

// Exercises `eq_ignore_case` on ASCII letters and punctuation, plus a few
// non-ASCII characters that must currently compare as different.
#[test]
fn test_eq_ignore_case() {
// Pairs that differ at most in case must compare equal.
assert!(eq_ignore_case('a', 'A'));
assert!(eq_ignore_case('P', 'p'));
assert!(eq_ignore_case('.', '.'));
assert!(eq_ignore_case('x', 'x'));
assert!(eq_ignore_case('X', 'X'));
// Different characters must not compare equal, including an ASCII
// character paired with a non-ASCII one.
assert!(!eq_ignore_case('a', 'α'));
assert!(!eq_ignore_case('P', 'h'));
assert!(!eq_ignore_case('.', ':'));
assert!(!eq_ignore_case('x', '<'));
assert!(!eq_ignore_case('X', 'ä'));

// Non-ASCII case pairs, disabled until non-ASCII comparison is supported.
// FIXME: #5820 Uncomment if unicode support
//assert!(eq_ignore_case('ö', 'Ö'));
//assert!(eq_ignore_case('Ü', 'ü'));
//assert!(eq_ignore_case('ω', 'Ω'));
}
41 changes: 19 additions & 22 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand Down Expand Up @@ -753,16 +753,18 @@ pub fn each_split_within<'a>(ss: &'a str,

/// Convert a string to lowercase. ASCII only
pub fn to_lower(s: &str) -> ~str {
map(s,
|c| unsafe{(libc::tolower(c as libc::c_char)) as char}
)
do map(s) |c| {
assert!(char::is_ascii(c));
(unsafe{libc::tolower(c as libc::c_char)}) as char
}
}

/// Convert a string to uppercase. ASCII only
pub fn to_upper(s: &str) -> ~str {
map(s,
|c| unsafe{(libc::toupper(c as libc::c_char)) as char}
)
do map(s) |c| {
assert!(char::is_ascii(c));
(unsafe{libc::toupper(c as libc::c_char)}) as char
}
}

/**
Expand Down Expand Up @@ -2984,12 +2986,11 @@ mod tests {

#[test]
fn test_to_lower() {
unsafe {
assert!(~"" == map(~"",
|c| libc::tolower(c as c_char) as char));
assert!(~"ymca" == map(~"YMCA",
|c| libc::tolower(c as c_char) as char));
}
// libc::tolower, and hence str::to_lower
// are culturally insensitive: they only work for ASCII
// (see Issue #1347)
assert!(~"" == to_lower(""));
assert!(~"ymca" == to_lower("YMCA"));
}

#[test]
Expand Down Expand Up @@ -3470,12 +3471,8 @@ mod tests {

#[test]
fn test_map() {
unsafe {
assert!(~"" == map(~"", |c|
libc::toupper(c as c_char) as char));
assert!(~"YMCA" == map(~"ymca",
|c| libc::toupper(c as c_char) as char));
}
assert!(~"" == map(~"", |c| unsafe {libc::toupper(c as c_char)} as char));
assert!(~"YMCA" == map(~"ymca", |c| unsafe {libc::toupper(c as c_char)} as char));
}

#[test]
Expand All @@ -3489,11 +3486,11 @@ mod tests {

#[test]
fn test_any() {
assert!(false == any(~"", char::is_uppercase));
assert!(false == any(~"", char::is_uppercase));
assert!(false == any(~"ymca", char::is_uppercase));
assert!(true == any(~"YMCA", char::is_uppercase));
assert!(true == any(~"yMCA", char::is_uppercase));
assert!(true == any(~"Ymcy", char::is_uppercase));
assert!(true == any(~"yMCA", char::is_uppercase));
assert!(true == any(~"Ymcy", char::is_uppercase));
}

#[test]
Expand Down