Skip to content

to_lower_default & to_upper_default for unicode chars and strings. #9414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 138 additions & 4 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,20 @@ def load_unicode_data(f):
c_hi = 0
com_lo = 0
com_hi = 0
cases={"upcase":[],"lowcase":[]}
cas_offs=0
cas_lo=0
cas_hi=0
curr_cas_offs=0
curr_cas=""
cas=""
for line in fileinput.input(f):
fields = line.split(";")
if len(fields) != 15:
continue
[code, name, gencat, combine, bidi,
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcsae, titlecase ] = fields
old, iso, upcase, lowcase, titlecase ] = fields

code = int(code, 16)

Expand Down Expand Up @@ -87,9 +94,57 @@ def load_unicode_data(f):
com_lo = code
com_hi = code

return (canon_decomp, compat_decomp, gencats, combines)
if upcase != "":
curr_cas_offs = int(upcase,16)-code
curr_cas="upcase"
elif lowcase != "":
curr_cas_offs = int(lowcase, 16)-code
curr_cas="lowcase"
else:
curr_cas_offs=0
curr_cas=""

if (upcase=="" and lowcase=="") or curr_cas_offs != cas_offs or curr_cas != cas:
if cas != "":
cases[cas].append((cas_lo, cas_hi, cas_offs))


if curr_cas_offs !=0:
if curr_cas != cas or curr_cas_offs != cas_offs:
cas_lo=code
cas_hi = code
cas=curr_cas
cas_offs=curr_cas_offs


return (canon_decomp, compat_decomp, gencats, combines, cases)

def load_special_casing(f):
    """Parse SpecialCasing.txt into a sorted list of Rust literal triples.

    Each returned tuple is (code, lower, upper) where `code` is a Rust char
    literal such as '\\u00df' and `lower` / `upper` are Rust string literals
    holding the full (possibly multi-character) case mappings.

    Only unconditional mappings are kept: any line that carries a condition
    list in its fifth field (for example the Final_Sigma rule, or the
    language-specific tr/az/lt rules) is skipped, because the generated
    lookup table has no way to evaluate the condition.
    """
    fetch(f)

    def rust_str(code_points):
        # "0053 0053" -> "\u0053\u0053" (wrapped in double quotes)
        escaped = ["\\u%4.4x" % int(cp, 16) for cp in code_points.split()]
        return '"' + "".join(escaped) + '"'

    cases = []
    language_sensitive = False
    # NOTE: do not `break` out of this loop; fileinput keeps global state and
    # must be read to the end before the next fileinput.input() call.
    for line in fileinput.input(f):
        if line.startswith("# Language-Sensitive Mappings"):
            language_sensitive = True
        if language_sensitive or line.startswith("#"):
            continue

        # Drop the trailing "# NAME" comment, then split the data fields.
        fields = [x.strip() for x in line.split("#")[0].split(";")]
        if len(fields) < 5:
            continue
        # A non-empty fifth field is a condition list: the mapping only
        # applies in some contexts, so it must not go in the table.
        if fields[4] != "":
            continue

        code, lower, _title, upper = fields[:4]
        # The zero-padded lowercase hex keeps string order equal to code point
        # order for four-digit code points, which the generated binary search
        # relies on.
        code = "'\\u%4.4x'" % int(code, 16)
        cases.append((code, rust_str(lower), rust_str(upper)))

    cases.sort()
    return cases

def load_derived_core_properties(f):
fetch(f)
derivedprops = {}
Expand Down Expand Up @@ -172,6 +227,83 @@ def emit_property_module(f, mod, tbl):
f.write(" }\n\n")
f.write("}\n")

def emit_case_module(f, mod, tbl, spec):
    """Write the Rust module `mod` holding the case-conversion tables to `f`.

    f    -- writable file object receiving the generated Rust source.
    mod  -- name of the generated `pub mod`.
    tbl  -- dict mapping a case name to a list of (lo, hi, offset) tuples;
            one static range table and one `<name>_offset` lookup function
            are emitted per key, in sorted key order.
    spec -- sequence of (char_literal, lower_literal, upper_literal) tuples
            that are already formatted as Rust literals; they are written
            verbatim into `special_table`.
    """
    f.write("pub mod %s {\n" % mod)
    f.write(" use option::{Some, None};\n")
    f.write(" use vec::ImmutableVector;\n")
    f.write("""
fn bsearch_range_value_table(c: char, r: &'static [(char, char, i32)]) -> i32 {
use cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
None => 0
}
}\n\n
""")

    # One range table plus its offset accessor per case direction.
    for cat in sorted(tbl.keys()):
        f.write(" static %s_table : &'static [(char,char,i32)] = &[\n" % cat)
        ix = 0
        for tup in tbl[cat]:
            f.write(ch_prefix(ix))
            f.write("(%s, %s, %s)" % (escape_char(tup[0]), escape_char(tup[1]), str(tup[2])))
            ix += 1
        f.write("\n ];\n\n")

        f.write(" pub fn %s(c: char) -> i32 {\n" % (cat + "_offset"))
        f.write(" bsearch_range_value_table(c, %s_table)\n" % cat)
        f.write(" }\n\n")

    # Full (multi-character) mappings. This reads the `spec` argument rather
    # than a global, so the function does not depend on the caller's variable
    # names.
    f.write(" static %s_table : &'static [(char, &'static str,&'static str)] = &[\n" % "special")
    ix = 0
    for tup in spec:
        f.write(ch_prefix(ix))
        f.write("(%s, %s, %s)" % (tup[0], tup[1], tup[2]))
        # NOTE(review): the step of 2 presumably makes ch_prefix break the
        # line after every entry; confirm against ch_prefix.
        ix += 2
    f.write("\n ];\n\n")

    f.write("""
pub fn case_special(c:char, case:u8) -> &'static str {
use cmp::{Equal, Less, Greater};
match special_table.bsearch(|&(code, _, _)| {
if c==code { Equal }
else if code < c { Less }
else { Greater }
}) {
Some(idx) => {
if case==0 {
let (_, result, _) = special_table[idx];
result
}
else {
let (_, _, result) = special_table[idx];
result
}
}
None => ""
}
}\n\n
""")

    # Thin wrappers selecting the column of special_table: 1 = upper, 0 = lower.
    f.write(" pub fn upcase_special(c:char) -> &'static str {\n")
    f.write(" case_special(c, 1)\n")
    f.write(" }\n\n")

    f.write(" pub fn lowcase_special(c:char) -> &'static str {\n")
    f.write(" case_special(c, 0)\n")
    f.write(" }\n\n")

    f.write("}\n")

def emit_property_module_old(f, mod, tbl):
f.write("mod %s {\n" % mod)
Expand Down Expand Up @@ -352,8 +484,8 @@ def emit_decomp_module(f, canon, compat, combine):
os.remove(i);
rf = open(r, "w")

(canon_decomp, compat_decomp, gencats, combines) = load_unicode_data("UnicodeData.txt")

(canon_decomp, compat_decomp, gencats, combines, cases) = load_unicode_data("UnicodeData.txt")
special = load_special_casing("SpecialCasing.txt")
# Preamble
rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
Expand All @@ -374,6 +506,8 @@ def emit_decomp_module(f, canon, compat, combine):

emit_property_module(rf, "general_category", gencats)

emit_case_module(rf, "case_changes", cases, special)

emit_decomp_module(rf, canon_decomp, compat_decomp, combines)

derived = load_derived_core_properties("DerivedCoreProperties.txt")
Expand Down
55 changes: 54 additions & 1 deletion src/libstd/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use cast::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use str::StrSlice;
use unicode::{derived_property, general_category, decompose};
use unicode::{derived_property, general_category, decompose, case_changes};
use to_str::ToStr;
use str;

Expand Down Expand Up @@ -221,6 +221,36 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
}
}

/// Maps `u` to its lowercase counterpart using the simple one-to-one
/// default mapping.
///
/// The lookup is a best-effort, locale-independent conversion: the signed
/// offset stored for `u` in the generated case table is added to its code
/// point.
pub fn to_lower_default(u: char) -> char {
    let delta = case_changes::lowcase_offset(u);
    let code_point = (u as u32) as i32;
    from_u32((code_point + delta) as u32).unwrap()
}

/// Returns the lowercase form of a given unicode character
/// using full mapping, may map to multiple char.
/// Makes a best-effort attempt without checking locale.
///
/// Characters that have no entry in the special-casing table fall back to
/// the simple one-to-one mapping, so the result is never empty.
pub fn to_lower_full_default(u: char) -> ~str {
    let special = case_changes::lowcase_special(u);
    if special.is_empty() {
        // No multi-character mapping recorded: use the simple mapping
        // instead of handing the caller an empty-string sentinel.
        str::from_char(to_lower_default(u))
    } else {
        special.to_owned()
    }
}

/// Maps `u` to its uppercase counterpart using the simple one-to-one
/// default mapping.
///
/// The lookup is a best-effort, locale-independent conversion: the signed
/// offset stored for `u` in the generated case table is added to its code
/// point.
pub fn to_upper_default(u: char) -> char {
    let delta = case_changes::upcase_offset(u);
    let code_point = (u as u32) as i32;
    from_u32((code_point + delta) as u32).unwrap()
}

/// Returns the uppercase form of a given unicode character
/// using full mapping, may map to multiple char.
/// Makes a best-effort attempt without checking locale.
///
/// Characters that have no entry in the special-casing table fall back to
/// the simple one-to-one mapping, so the result is never empty.
pub fn to_upper_full_default(u: char) -> ~str {
    let special = case_changes::upcase_special(u);
    if special.is_empty() {
        // No multi-character mapping recorded: use the simple mapping
        // instead of handing the caller an empty-string sentinel.
        str::from_char(to_upper_default(u))
    } else {
        special.to_owned()
    }
}

//FIXME #9363: implement to_upper and to_lower which take into account locale

// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: uint = 0xAC00;
static L_BASE: uint = 0x1100;
Expand Down Expand Up @@ -367,6 +397,11 @@ pub trait Char {
///
/// This will then return the number of characters written to the slice.
fn encode_utf8(&self, dst: &mut [u8]) -> uint;

/// Converts this character to lowercase with the simple one-to-one
/// default mapping; the locale is not consulted.
fn to_lower_default(&self) -> char;
/// Converts this character to uppercase with the simple one-to-one
/// default mapping; the locale is not consulted.
fn to_upper_default(&self) -> char;
/// Converts this character to lowercase with the full default mapping,
/// which may produce more than one character; the locale is not consulted.
fn to_lower_full_default(&self) -> ~str;
/// Converts this character to uppercase with the full default mapping,
/// which may produce more than one character; the locale is not consulted.
fn to_upper_full_default(&self) -> ~str;
}

impl Char for char {
Expand Down Expand Up @@ -420,6 +455,12 @@ impl Char for char {
return 4;
}
}

/// Method form of the free function `to_lower_default`.
fn to_lower_default(&self) -> char {
    let c = *self;
    to_lower_default(c)
}

/// Method form of the free function `to_upper_default`.
fn to_upper_default(&self) -> char {
    let c = *self;
    to_upper_default(c)
}

/// Method form of the free function `to_lower_full_default`.
fn to_lower_full_default(&self) -> ~str {
    let c = *self;
    to_lower_full_default(c)
}

/// Method form of the free function `to_upper_full_default`.
fn to_upper_full_default(&self) -> ~str {
    let c = *self;
    to_upper_full_default(c)
}

}

#[cfg(not(test))]
Expand Down Expand Up @@ -546,3 +587,15 @@ fn test_to_str() {
let s = 't'.to_str();
assert_eq!(s, ~"t");
}

#[test]
fn test_to_lower_default() {
    // An already-lowercase letter must come back unchanged.
    let already_lower = 'ŗ';
    assert_eq!(already_lower.to_lower_default(), 'ŗ');

    // An uppercase letter must map to its lowercase partner.
    let upper = 'Ʋ';
    assert_eq!(upper.to_lower_default(), 'ʋ');
}

#[test]
fn test_to_upper_default() {
    // A lowercase letter must map to its uppercase partner.
    let lower = 'ŗ';
    assert_eq!(lower.to_upper_default(), 'Ŗ');

    // An already-uppercase letter must come back unchanged.
    let already_upper = 'Ʋ';
    assert_eq!(already_upper.to_upper_default(), 'Ʋ');
}
70 changes: 70 additions & 0 deletions src/libstd/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,11 @@ pub trait StrSlice<'self> {
fn subslice_offset(&self, inner: &str) -> uint;

fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T;

/// Convert all characters in the string to lowercase using the simple
/// per-character mapping.
fn to_lower_default(&self) -> ~str;
/// Convert all characters in the string to uppercase using the simple
/// per-character mapping.
fn to_upper_default(&self) -> ~str;
/// Convert all characters in the string to lowercase using the full
/// mapping; the result may be longer than the input.
fn to_lower_full_default(&self) -> ~str;
/// Convert all characters in the string to uppercase using the full
/// mapping; the result may be longer than the input.
fn to_upper_full_default(&self) -> ~str;
}

/// Extension methods for strings
Expand Down Expand Up @@ -2190,6 +2195,50 @@ impl<'self> StrSlice<'self> for &'self str {
let v: &[u8] = unsafe { cast::transmute(*self) };
v.as_imm_buf(f)
}

/// Builds a new string in which every character of `self` has been
/// replaced by its simple lowercase mapping.
fn to_lower_default(&self) -> ~str {
    // The byte length of the input is used as the initial capacity.
    let mut lowered = with_capacity(self.len());
    for ch in self.iter() {
        let mapped = ch.to_lower_default();
        lowered.push_char(mapped);
    }
    lowered
}

/// Builds a new string in which every character of `self` has been
/// lowercased with the full mapping; the result may be longer than
/// the input.
fn to_lower_full_default(&self) -> ~str {
    let mut lowered = with_capacity(self.len());
    for ch in self.iter() {
        let expansion = ch.to_lower_full_default();
        if !expansion.is_empty() {
            lowered.push_str(expansion);
        } else {
            // No full mapping for this character: use the simple one.
            lowered.push_char(ch.to_lower_default());
        }
    }
    lowered
}

/// Builds a new string in which every character of `self` has been
/// replaced by its simple uppercase mapping.
fn to_upper_default(&self) -> ~str {
    // The byte length of the input is used as the initial capacity.
    let mut raised = with_capacity(self.len());
    for ch in self.iter() {
        let mapped = ch.to_upper_default();
        raised.push_char(mapped);
    }
    raised
}

/// Builds a new string in which every character of `self` has been
/// uppercased with the full mapping; the result may be longer than
/// the input.
fn to_upper_full_default(&self) -> ~str {
    let mut raised = with_capacity(self.len());
    for ch in self.iter() {
        let expansion = ch.to_upper_full_default();
        if !expansion.is_empty() {
            raised.push_str(expansion);
        } else {
            // No full mapping for this character: use the simple one.
            raised.push_char(ch.to_upper_default());
        }
    }
    raised
}

//FIXME #9363: implement to_lower and to_upper which take into account locale
}

#[allow(missing_doc)]
Expand Down Expand Up @@ -3741,6 +3790,27 @@ mod tests {
assert_eq!("abcde".to_send_str(), SendStrStatic("abcde"));
assert_eq!("abcde".to_send_str(), SendStrOwned(~"abcde"));
}

#[test]
fn test_to_lower_default() {
assert_eq!("ŗƲΣꓔ!".to_lower_default(), ~"ŗʋσꓔ!" )
}

#[test]
fn test_to_upper_default() {
assert_eq!("ŗƲΣꓔ!".to_upper_default(), ~"ŖƲΣꓔ!" )
}

#[test]
fn test_to_lower_full_default() {
assert_eq!("İA".to_lower_full_default(), ~"i̇a")
}

#[test]
fn test_to_upper_full_default() {
assert_eq!("ßa".to_upper_full_default(), ~"SSA")
}

}

#[cfg(test)]
Expand Down
Loading