From fecdeb85ed6d1a97813eec9a2d73d0e3ed8b9b2e Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 3 Sep 2012 13:51:54 +0200 Subject: [PATCH 1/7] Small fixes/additions in strings.jl: *) Added Set{Char} to Chars (also added 'contains' as an alias to 'has' for sets, for efficiency) *) Made search(::String, ::Chars) be consistent with search(::String, ::String) *) Added write to all String types *) Aliased has == contains for strings to avoid conversion to Set --- base/ascii.jl | 2 +- base/set.jl | 1 + base/string.jl | 17 ++++++++++++++--- base/utf8.jl | 2 +- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/base/ascii.jl b/base/ascii.jl index aff488b33617f..6f94f56082757 100644 --- a/base/ascii.jl +++ b/base/ascii.jl @@ -70,7 +70,7 @@ end ## outputing ASCII strings ## print(io::IO, s::ASCIIString) = (write(io, s.data);nothing) -write(io, s::ASCIIString) = write(io, s.data) +write(io::IO, s::ASCIIString) = write(io, s.data) ## transcoding to ASCII ## diff --git a/base/set.jl b/base/set.jl index 4b7f4046f605d..f1c23cb9fa0ec 100644 --- a/base/set.jl +++ b/base/set.jl @@ -16,6 +16,7 @@ elements(s::Set) = keys(s.hash) eltype{T}(s::Set{T}) = T has(s::Set, x) = has(s.hash, x) +contains(s::Set, x) = has(s, x) get(s::Set, x, deflt) = get(s.hash, x, false) add(s::Set, x) = (s.hash[x] = true; s) diff --git a/base/string.jl b/base/string.jl index d9aececb9c864..804be0af85b9c 100644 --- a/base/string.jl +++ b/base/string.jl @@ -63,6 +63,7 @@ ref(s::String, v::AbstractVector) = symbol(s::String) = symbol(bytestring(s)) print(io::IO, s::String) = for c in s write(io, c) end +write(io::IO, s::String) = print(io, s) show(io::IO, s::String) = print_quoted(io, s) (*)(s::String...) = strcat(s...) 
@@ -156,7 +157,7 @@ function chr2ind(s::String, i::Integer) end end -typealias Chars Union(Char,AbstractVector{Char}) +typealias Chars Union(Char,AbstractVector{Char},Set{Char}) function strchr(s::String, c::Chars, i::Integer) if i < 1 error("index out of range") end @@ -174,7 +175,15 @@ strchr(s::String, c::Chars) = strchr(s,c,start(s)) contains(s::String, c::Char) = (strchr(s,c)!=0) -search(s::String, c::Chars, i::Integer) = (i=strchr(s,c,i); (i,nextind(s,i))) +function search(s::String, c::Chars, i::Integer) + if isempty(c) + return 1 <= i <= length(s)+1 ? (i,i) : + i == length(s)+2 ? (0,0) : + error("index out of range") + end + i=strchr(s,c,i) + (i, nextind(s,i)) +end search(s::String, c::Chars) = search(s,c,start(s)) function search(s::String, t::String, i::Integer) @@ -447,6 +456,8 @@ strcat(xs...) = string(xs...) # backwards compat print(io::IO, s::RopeString) = print(io, s.head, s.tail) +write(io::IO, s::RopeString) = (write(io, s.head); write(io, s.tail)) + ## transformed strings ## type TransformedString <: String @@ -500,7 +511,7 @@ function filter(f::Function, s::String) takebuf_string(out) end -has(s::String, c::Char) = has(Set(s...), c) +has(s::String, c::Char) = contains(s, c) ## string promotion rules ## diff --git a/base/utf8.jl b/base/utf8.jl index 4e964ed377b7a..29c46bc65f443 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -94,7 +94,7 @@ lcfirst(s::UTF8String) = string(lowercase(s[1]), s[2:]) ## outputing UTF-8 strings ## print(io::IO, s::UTF8String) = (write(io, s.data);nothing) -write(io, s::UTF8String) = write(io, s.data) +write(io::IO, s::UTF8String) = write(io, s.data) ## transcoding to UTF-8 ## From 3caf97ce216c401b88e3b187edda17ddd23705c6 Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 3 Sep 2012 14:54:49 +0200 Subject: [PATCH 2/7] Updated string docs --- base/regex.jl | 3 ++- doc/helpdb.jl | 44 ++++++++++++++++++++++++++++++++++++++------ doc/stdlib/base.rst | 22 +++++++++++++++++----- 3 files changed, 57 insertions(+), 12 
deletions(-) diff --git a/base/regex.jl b/base/regex.jl index 29a84a60d4563..bb9158d359214 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -112,7 +112,8 @@ function search(str::ByteString, re::Regex, idx::Integer) m, n = PCRE.exec(re.regex, re.extra, str, idx-1, opts, true) isempty(m) ? (0,0) : (m[1]+1,m[2]+1) end -search(s::ByteString, r::Regex) = search(s,r,start(s)) +search(s::String, r::Regex, idx::Integer) = error("regex search is only available for bytestrings; use bytestring(s) to convert") +search(s::String, r::Regex) = search(s,r,start(s)) type RegexMatchIterator regex::Regex diff --git a/doc/helpdb.jl b/doc/helpdb.jl index 010d2891cf0fc..b58aeb4482f25 100644 --- a/doc/helpdb.jl +++ b/doc/helpdb.jl @@ -549,21 +549,36 @@ collection[key...] = value "), -(E"Strings",E"ASCIIString",E"ASCIIString(::Array{Uint8, 1}) +(E"Strings",E"ascii",E"ascii(::Array{Uint8, 1}) Create an ASCII string from a byte array. "), -(E"Strings",E"UTF8String",E"UTF8String(::Array{Uint8, 1}) +(E"Strings",E"ascii",E"ascii(s) + + Convert a string to a contiguous ASCII string (all characters must + be valid ASCII characters). + +"), + +(E"Strings",E"utf8",E"utf8(::Array{Uint8, 1}) Create a UTF-8 string from a byte array. "), +(E"Strings",E"utf8",E"utf8(s) + + Convert a string to a contiguous UTF-8 string (all characters must + be valid UTF-8 characters). + +"), + (E"Strings",E"strchr",E"strchr(string, char[, i]) Return the index of 'char' in 'string', giving 0 if not found. The + second argument may also be a vector or a set of characters. The third argument optionally specifies a starting index. "), @@ -582,12 +597,29 @@ collection[key...] = value "), -(E"Strings",E"split",E"split(string, char, include_empty) +(E"Strings",E"search",E"search(string, chars[, start]) + + Search for the given characters within the given string. 
The second + argument may be a single character, a vector or a set of + characters, a string, or a regular expression (but regular + expressions are only allowed on contiguous strings, such as ASCII + or UTF-8 strings). The third argument optionally specifies a + starting index. The return value is a tuple with 2 integers: the + index of the match and the first valid index past the match (or an + index beyond the end of the string if the match is at the end); it + returns '(0,0)' if no match was found, and '(start,start)' if + 'chars' is empty. + +"), + +(E"Strings",E"split",E"split(string, chars[, limit][, include_empty]) Return an array of strings by splitting the given string on - occurrences of the given character delimiter. The second argument - may also be a set of character delimiters to use. The third - argument specifies whether empty fields should be included. + occurrences of the given character delimiters, which may be + specified in any of the formats allowed by 'search''s second + argument. The last two arguments are optional; they are a + maximum size for the result and a flag determining whether empty + fields should be included in the result. "), diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 9121cb5dc17c0..8ff9d7675af9d 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -401,17 +401,25 @@ Strings Convert a string to a contiguous byte array representation appropriate for passing it to C functions. -.. function:: ASCIIString(::Array{Uint8,1}) +.. function:: ascii(::Array{Uint8,1}) Create an ASCII string from a byte array. -.. function:: UTF8String(::Array{Uint8,1}) +.. function:: ascii(s) + + Convert a string to a contiguous ASCII string (all characters must be valid ASCII characters). + +.. function:: utf8(::Array{Uint8,1}) Create a UTF-8 string from a byte array. +.. function:: utf8(s) + + Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters). + .. 
function:: strchr(string, char[, i]) - Return the index of ``char`` in ``string``, giving 0 if not found. The third argument optionally specifies a starting index. + Return the index of ``char`` in ``string``, giving 0 if not found. The second argument may also be a vector or a set of characters. The third argument optionally specifies a starting index. .. function:: lpad(string, n, p) @@ -421,9 +429,13 @@ Strings Make a string at least ``n`` characters long by padding on the right with copies of ``p``. -.. function:: split(string, char, include_empty) +.. function:: search(string, chars[, start]) + + Search for the given characters within the given string. The second argument may be a single character, a vector or a set of characters, a string, or a regular expression (but regular expressions are only allowed on contiguous strings, such as ASCII or UTF-8 strings). The third argument optionally specifies a starting index. The return value is a tuple with 2 integers: the index of the match and the first valid index past the match (or an index beyond the end of the string if the match is at the end); it returns ``(0,0)`` if no match was found, and ``(start,start)`` if ``chars`` is empty. + +.. function:: split(string, chars[, limit][, include_empty]) - Return an array of strings by splitting the given string on occurrences of the given character delimiter. The second argument may also be a set of character delimiters to use. The third argument specifies whether empty fields should be included. + Return an array of strings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by ``search``'s second argument. The last two arguments are optional; they are a maximum size for the result and a flag determining whether empty fields should be included in the result. .. 
function:: strip(string) From 912c39858826606bc1b5fa64b2671d2f5dc213b1 Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 3 Sep 2012 14:39:38 +0200 Subject: [PATCH 3/7] TransformedString tweaks avoid unnecessary layering of transformations by combining them in most common cases. e.g. uppercase(ucfirst(string)) ==> uppercase(string) --- base/string.jl | 66 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/base/string.jl b/base/string.jl index 804be0af85b9c..1db7739902a9a 100644 --- a/base/string.jl +++ b/base/string.jl @@ -462,6 +462,7 @@ write(io::IO, s::RopeString) = (write(io, s.head); write(io, s.tail)) type TransformedString <: String transform::Function + tag::Char string::String end @@ -476,17 +477,74 @@ end ## uppercase and lowercase transformations ## +function _transfunc_tag2func(tag::Char) + if tag == 'U' # all-uppercase + return (c,i)->uppercase(c) + elseif tag == 'L' # all-lowercase + return (c,i)->lowercase(c) + elseif tag == 'u' # uppercase-first + return (c,i)->i==1 ? uppercase(c) : c + elseif tag == 'l' # lowercase-first + return (c,i)->i==1 ? lowercase(c) : c + elseif tag == 'C' # camelcase + return (c,i)->i==1 ? uppercase(c) : lowercase(c) + elseif tag == 'c' # inverse-camelcase + return (c,i)->i==1 ? 
lowercase(c) : uppercase(c) + else + error("unknown tag ", tag) + end +end +TransformedString(tag::Char, s::String) = TransformedString(_transfunc_tag2func(tag), tag, s) + +# Note: comment to disallow custom transform functions +TransformedString(transform::Function, s::String) = TransformedString(transform, 'X', s) + uppercase(c::Char) = ccall(:towupper, Char, (Char,), c) lowercase(c::Char) = ccall(:towlower, Char, (Char,), c) uppercase(c::Uint8) = ccall(:toupper, Uint8, (Uint8,), c) lowercase(c::Uint8) = ccall(:tolower, Uint8, (Uint8,), c) -uppercase(s::String) = TransformedString((c,i)->uppercase(c), s) -lowercase(s::String) = TransformedString((c,i)->lowercase(c), s) +uppercase(s::String) = TransformedString('U', s) +lowercase(s::String) = TransformedString('L', s) + +ucfirst(s::String) = TransformedString('u', s) +lcfirst(s::String) = TransformedString('l', s) -ucfirst(s::String) = TransformedString((c,i)->i==1 ? uppercase(c) : c, s) -lcfirst(s::String) = TransformedString((c,i)->i==1 ? 
lowercase(c) : c, s) +function _transfunc_tag_compose(tag2::Char, tag1::Char) + # Note: comment to disallow custom transform functions + if !contains("ULulCc", tag2) || !contains("ULulCc", tag1) + return 'X' + end + if tag2 == 'U' || tag2 == 'L' || tag2 == 'C' || tag2 == 'c' || + tag2 == tag1 || + (tag2 == 'u' && tag1 == 'l') || + (tag2 == 'l' && tag1 == 'u') + return tag2 + elseif (tag2 == 'u' && (tag1 == 'U' || tag1 == 'C')) || + (tag2 == 'l' && (tag1 == 'L' || tag1 == 'c')) + return tag1 + elseif (tag2 == 'u' && tag1 == 'L') + return 'C' + elseif (tag2 == 'l' && tag1 == 'U') + return 'c' + elseif (tag2 == 'u' && tag1 == 'c') + return 'U' + elseif (tag2 == 'l' && tag1 == 'C') + return 'L' + else + error("invalid transform tags: (", tag2, ",", tag1, ")") + end +end + +function TransformedString(tag::Char, s::TransformedString) + newtag = _transfunc_tag_compose(tag, s.tag) + # Note: comment to disallow custom transform functions + if newtag == 'X' + return TransformedString(_transfunc_tag2func(tag), tag, s) + end + TransformedString(newtag, s.string) +end const uc = uppercase const lc = lowercase From a7c6030e153a253b608702578d7a8e54008724b3 Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 3 Sep 2012 15:38:04 +0200 Subject: [PATCH 4/7] Enforce [rl]pad return type consistency --- base/printf.jl | 6 +++--- base/string.jl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/base/printf.jl b/base/printf.jl index d5a243a0349dd..800d97a7345ae 100644 --- a/base/printf.jl +++ b/base/printf.jl @@ -139,9 +139,9 @@ function _special_handler(flags::ASCIIString, width::Int) pos = contains(flags,'+') ? "+" : contains(flags,' ') ? " " : "" abn = quote - isnan($x) ? $(bytestring(pad("NaN", width))) : - $x < 0 ? $(bytestring(pad("-Inf", width))) : - $(bytestring(pad("$(pos)Inf", width))) + isnan($x) ? $(pad("NaN", width)) : + $x < 0 ? $(pad("-Inf", width)) : + $(pad("$(pos)Inf", width)) end ex = :(isfinite($x) ? 
$blk : write(out, $abn)) x, ex, blk diff --git a/base/string.jl b/base/string.jl index 1db7739902a9a..c01896e30f222 100644 --- a/base/string.jl +++ b/base/string.jl @@ -933,7 +933,7 @@ function lpad(s::String, n::Integer, p::String) if m <= 0; return s; end l = strlen(p) if l==1 - return p^m * s + return bytestring(p^m * s) end q = div(m,l) r = m - q*l @@ -945,7 +945,7 @@ function rpad(s::String, n::Integer, p::String) if m <= 0; return s; end l = strlen(p) if l==1 - return s * p^m + return bytestring(s * p^m) end q = div(m,l) r = m - q*l From a7ce198bdd91bfd67746e37c1a5fa48754a9c036 Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 3 Sep 2012 18:06:51 +0200 Subject: [PATCH 5/7] Removed tag from TransformedString as per Keno suggestion --- base/string.jl | 90 ++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 54 deletions(-) diff --git a/base/string.jl b/base/string.jl index c01896e30f222..fe9b841283360 100644 --- a/base/string.jl +++ b/base/string.jl @@ -462,7 +462,6 @@ write(io::IO, s::RopeString) = (write(io, s.head); write(io, s.tail)) type TransformedString <: String transform::Function - tag::Char string::String end @@ -477,27 +476,12 @@ end ## uppercase and lowercase transformations ## -function _transfunc_tag2func(tag::Char) - if tag == 'U' # all-uppercase - return (c,i)->uppercase(c) - elseif tag == 'L' # all-lowercase - return (c,i)->lowercase(c) - elseif tag == 'u' # uppercase-first - return (c,i)->i==1 ? uppercase(c) : c - elseif tag == 'l' # lowercase-first - return (c,i)->i==1 ? lowercase(c) : c - elseif tag == 'C' # camelcase - return (c,i)->i==1 ? uppercase(c) : lowercase(c) - elseif tag == 'c' # inverse-camelcase - return (c,i)->i==1 ? 
lowercase(c) : uppercase(c) - else - error("unknown tag ", tag) - end -end -TransformedString(tag::Char, s::String) = TransformedString(_transfunc_tag2func(tag), tag, s) - -# Note: comment to disallow custom transform functions -TransformedString(transform::Function, s::String) = TransformedString(transform, 'X', s) +const _TF_U = (c,i)->uppercase(c) +const _TF_L = (c,i)->lowercase(c) +const _TF_u = (c,i)->i==1 ? uppercase(c) : c +const _TF_l = (c,i)->i==1 ? lowercase(c) : c +const _TF_C = (c,i)->i==1 ? uppercase(c) : lowercase(c) +const _TF_c = (c,i)->i==1 ? lowercase(c) : uppercase(c) uppercase(c::Char) = ccall(:towupper, Char, (Char,), c) lowercase(c::Char) = ccall(:towlower, Char, (Char,), c) @@ -505,45 +489,43 @@ lowercase(c::Char) = ccall(:towlower, Char, (Char,), c) uppercase(c::Uint8) = ccall(:toupper, Uint8, (Uint8,), c) lowercase(c::Uint8) = ccall(:tolower, Uint8, (Uint8,), c) -uppercase(s::String) = TransformedString('U', s) -lowercase(s::String) = TransformedString('L', s) +uppercase(s::String) = TransformedString(_TF_U, s) +lowercase(s::String) = TransformedString(_TF_L, s) -ucfirst(s::String) = TransformedString('u', s) -lcfirst(s::String) = TransformedString('l', s) +ucfirst(s::String) = TransformedString(_TF_u, s) +lcfirst(s::String) = TransformedString(_TF_l, s) -function _transfunc_tag_compose(tag2::Char, tag1::Char) - # Note: comment to disallow custom transform functions - if !contains("ULulCc", tag2) || !contains("ULulCc", tag1) - return 'X' +function _transfunc_compose(f2::Function, f1::Function) + allf = [_TF_U, _TF_L, _TF_u, _TF_l, _TF_C, _TF_c] + if !has(allf, f2) || !has(allf, f1) + return nothing end - if tag2 == 'U' || tag2 == 'L' || tag2 == 'C' || tag2 == 'c' || - tag2 == tag1 || - (tag2 == 'u' && tag1 == 'l') || - (tag2 == 'l' && tag1 == 'u') - return tag2 - elseif (tag2 == 'u' && (tag1 == 'U' || tag1 == 'C')) || - (tag2 == 'l' && (tag1 == 'L' || tag1 == 'c')) - return tag1 - elseif (tag2 == 'u' && tag1 == 'L') - return 'C' - elseif 
(tag2 == 'l' && tag1 == 'U') - return 'c' - elseif (tag2 == 'u' && tag1 == 'c') - return 'U' - elseif (tag2 == 'l' && tag1 == 'C') - return 'L' - else - error("invalid transform tags: (", tag2, ",", tag1, ")") + if f2 == _TF_U || f2 == _TF_L || f2 == _TF_C || f2 == _TF_c || + f2 == f1 || + (f2 == _TF_u && f1 == _TF_l) || + (f2 == _TF_l && f1 == _TF_u) + return f2 + elseif (f2 == _TF_u && (f1 == _TF_U || f1 == _TF_C)) || + (f2 == _TF_l && (f1 == _TF_L || f1 == _TF_c)) + return f1 + elseif (f2 == _TF_u && f1 == _TF_L) + return _TF_C + elseif (f2 == _TF_l && f1 == _TF_U) + return _TF_c + elseif (f2 == _TF_u && f1 == _TF_c) + return _TF_U + elseif (f2 == _TF_l && f1 == _TF_C) + return _TF_L end + error("this is a bug") end -function TransformedString(tag::Char, s::TransformedString) - newtag = _transfunc_tag_compose(tag, s.tag) - # Note: comment to disallow custom transform functions - if newtag == 'X' - return TransformedString(_transfunc_tag2func(tag), tag, s) +function TransformedString(transform::Function, s::TransformedString) + newtf = _transfunc_compose(transform, s.transform) + if newtf === nothing + return invoke(TransformedString, (Function, String), transform, s) end - TransformedString(newtag, s.string) + TransformedString(newtf, s.string) end const uc = uppercase From 0c24233c7281dac053462a77041c3076eab186b9 Mon Sep 17 00:00:00 2001 From: Carlo Baldassi Date: Mon, 3 Sep 2012 18:29:15 +0200 Subject: [PATCH 6/7] Yet another fix in TransformedString --- base/string.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/string.jl b/base/string.jl index fe9b841283360..6ac97bab5e390 100644 --- a/base/string.jl +++ b/base/string.jl @@ -497,7 +497,7 @@ lcfirst(s::String) = TransformedString(_TF_l, s) function _transfunc_compose(f2::Function, f1::Function) allf = [_TF_U, _TF_L, _TF_u, _TF_l, _TF_C, _TF_c] - if !has(allf, f2) || !has(allf, f1) + if !contains(allf, f2) || !contains(allf, f1) return nothing end if f2 == _TF_U || f2 == _TF_L || f2 == 
_TF_C || f2 == _TF_c || From 24d409162dc6f2f370f6463c1d45c7885a1ba38c Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Mon, 3 Sep 2012 12:44:13 -0700 Subject: [PATCH 7/7] Small update to error message in docs --- doc/manual/metaprogramming.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/manual/metaprogramming.rst b/doc/manual/metaprogramming.rst index 80eb56b7fb63d..5cf565370c1dc 100644 --- a/doc/manual/metaprogramming.rst +++ b/doc/manual/metaprogramming.rst @@ -275,7 +275,7 @@ cause a compile-time error: :: julia> $a + b - not supported + unsupported or misplaced expression $ .. _man-macros: