From e5e6345dc454b696c8204917f417d19b545f477c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Wed, 10 Apr 2013 17:36:28 +0200 Subject: [PATCH] Added initial stubs for unicode case folding support --- src/libcore/char.rs | 87 +++++++++++++++++++++++++++++++++++++++++++-- src/libcore/str.rs | 41 ++++++++++----------- 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 6ca33540ceef6..8a3e32b6e3458 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -1,4 +1,4 @@ -// Copyright 2012 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -15,6 +15,7 @@ use str; use u32; use uint; use unicode; +use libc; #[cfg(notest)] use cmp::Eq; @@ -106,6 +107,42 @@ pub fn is_ascii(c: char) -> bool { c - ('\x7F' & c) == '\x00' } +pub enum CaseFoldingRule { + // FIXME: #5820 Add support for locale-specific Unicode case folding + // See: http://www.w3.org/International/wiki/Case_folding + + Default, + //Locale(&str) +} + +/// Converts a character to its lower case representation, using the given case folding rules +#[inline(always)] +pub fn to_lower(c: char, folding_rule: CaseFoldingRule) -> char { + match folding_rule { + Default if is_ascii(c) => unsafe{(libc::tolower(c as libc::c_char)) as char}, + Default => c, + } +} + +/// Converts a character to its upper case representation, using the given case folding rules +#[inline(always)] +pub fn to_upper(c: char, folding_rule: CaseFoldingRule) -> char { + match folding_rule { + Default if is_ascii(c) => unsafe{(libc::toupper(c as libc::c_char)) as char}, + Default => c, + } +} + +/// Converts a character to its lower case representation, using the default (English) +/// case folding rules +#[inline(always)] +pub fn to_lower_default(c: char) -> char { to_lower(c, Default) } + +/// 
Converts a character to its upper case representation, using the default (English) +/// case folding rules +#[inline(always)] +pub fn to_upper_default(c: char) -> char { to_upper(c, Default) } + /// Indicates whether the character is numeric (Nd, Nl, or No) #[inline(always)] pub fn is_digit(c: char) -> bool { @@ -234,6 +271,34 @@ pub fn escape_default(c: char) -> ~str { } } +/// Returns the number of bytes this character would need if encoded in UTF-8 +pub fn len_utf8_bytes(c: char) -> uint { + static max_one_b: uint = 128u; + static max_two_b: uint = 2048u; + static max_three_b: uint = 65536u; + static max_four_b: uint = 2097152u; + + let code = c as uint; + if code < max_one_b { 1u } + else if code < max_two_b { 2u } + else if code < max_three_b { 3u } + else if code < max_four_b { 4u } + else { fail!(~"invalid character!") } +} + +/// Compares two characters, ignoring case differences. +/// Currently only works for ASCII. +pub fn eq_ignore_case(c1: char, c2: char) -> bool { + // FIXME: #5820 Add support for non-ASCII comparisons + if is_ascii(c1) && is_ascii(c2) { + unsafe{ + libc::tolower(c1 as libc::c_char) == libc::tolower(c2 as libc::c_char) + } + } else { + c1 == c2 + } +} + /** * Compare two chars * @@ -334,7 +399,6 @@ fn test_escape_default() { assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6"); } - #[test] fn test_escape_unicode() { assert_eq!(escape_unicode('\x00'), ~"\\x00"); @@ -344,3 +408,22 @@ fn test_escape_unicode() { assert_eq!(escape_unicode('\u011b'), ~"\\u011b"); assert_eq!(escape_unicode('\U0001d4b6'), ~"\\U0001d4b6"); } + +#[test] +fn test_eq_ignore_case() { + assert!(eq_ignore_case('a', 'A')); + assert!(eq_ignore_case('P', 'p')); + assert!(eq_ignore_case('.', '.')); + assert!(eq_ignore_case('x', 'x')); + assert!(eq_ignore_case('X', 'X')); + assert!(!eq_ignore_case('a', 'α')); + assert!(!eq_ignore_case('P', 'h')); + assert!(!eq_ignore_case('.', ':')); + assert!(!eq_ignore_case('x', '<')); + assert!(!eq_ignore_case('X', 'ä')); + + // 
FIXME: #5820 Uncomment once Unicode support is implemented + //assert!(eq_ignore_case('ö', 'Ö')); + //assert!(eq_ignore_case('Ü', 'ü')); + //assert!(eq_ignore_case('ω', 'Ω')); +} diff --git a/src/libcore/str.rs b/src/libcore/str.rs index f1605309fb488..fff6147e04603 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -1,4 +1,4 @@ -// Copyright 2012 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -753,16 +753,18 @@ pub fn each_split_within<'a>(ss: &'a str, /// Convert a string to lowercase. ASCII only pub fn to_lower(s: &str) -> ~str { - map(s, - |c| unsafe{(libc::tolower(c as libc::c_char)) as char} - ) + do map(s) |c| { + assert!(char::is_ascii(c)); + (unsafe{libc::tolower(c as libc::c_char)}) as char + } } /// Convert a string to uppercase. ASCII only pub fn to_upper(s: &str) -> ~str { - map(s, - |c| unsafe{(libc::toupper(c as libc::c_char)) as char} - ) + do map(s) |c| { + assert!(char::is_ascii(c)); + (unsafe{libc::toupper(c as libc::c_char)}) as char + } } /** @@ -2984,12 +2986,11 @@ mod tests { #[test] fn test_to_lower() { - unsafe { - assert!(~"" == map(~"", - |c| libc::tolower(c as c_char) as char)); - assert!(~"ymca" == map(~"YMCA", - |c| libc::tolower(c as c_char) as char)); - } + // libc::tolower, and hence str::to_lower + // are culturally insensitive: they only work for ASCII + // (see Issue #1347) + assert!(~"" == to_lower("")); + assert!(~"ymca" == to_lower("YMCA")); } #[test] @@ -3470,12 +3471,8 @@ mod tests { #[test] fn test_map() { - unsafe { - assert!(~"" == map(~"", |c| - libc::toupper(c as c_char) as char)); - assert!(~"YMCA" == map(~"ymca", - |c| libc::toupper(c as c_char) as char)); - } + assert!(~"" == map(~"", |c| unsafe {libc::toupper(c as c_char)} as char)); + assert!(~"YMCA" == map(~"ymca", |c| unsafe {libc::toupper(c as c_char)} as char)); } #[test] @@ -3489,11 +3486,11 @@ 
mod tests { #[test] fn test_any() { - assert!(false == any(~"", char::is_uppercase)); + assert!(false == any(~"", char::is_uppercase)); assert!(false == any(~"ymca", char::is_uppercase)); assert!(true == any(~"YMCA", char::is_uppercase)); - assert!(true == any(~"yMCA", char::is_uppercase)); - assert!(true == any(~"Ymcy", char::is_uppercase)); + assert!(true == any(~"yMCA", char::is_uppercase)); + assert!(true == any(~"Ymcy", char::is_uppercase)); } #[test]