diff --git a/base/ascii.jl b/base/ascii.jl index aff488b33617f..6f94f56082757 100644 --- a/base/ascii.jl +++ b/base/ascii.jl @@ -70,7 +70,7 @@ end ## outputing ASCII strings ## print(io::IO, s::ASCIIString) = (write(io, s.data);nothing) -write(io, s::ASCIIString) = write(io, s.data) +write(io::IO, s::ASCIIString) = write(io, s.data) ## transcoding to ASCII ## diff --git a/base/printf.jl b/base/printf.jl index d5a243a0349dd..800d97a7345ae 100644 --- a/base/printf.jl +++ b/base/printf.jl @@ -139,9 +139,9 @@ function _special_handler(flags::ASCIIString, width::Int) pos = contains(flags,'+') ? "+" : contains(flags,' ') ? " " : "" abn = quote - isnan($x) ? $(bytestring(pad("NaN", width))) : - $x < 0 ? $(bytestring(pad("-Inf", width))) : - $(bytestring(pad("$(pos)Inf", width))) + isnan($x) ? $(pad("NaN", width)) : + $x < 0 ? $(pad("-Inf", width)) : + $(pad("$(pos)Inf", width)) end ex = :(isfinite($x) ? $blk : write(out, $abn)) x, ex, blk diff --git a/base/regex.jl b/base/regex.jl index 29a84a60d4563..bb9158d359214 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -112,7 +112,8 @@ function search(str::ByteString, re::Regex, idx::Integer) m, n = PCRE.exec(re.regex, re.extra, str, idx-1, opts, true) isempty(m) ? 
(0,0) : (m[1]+1,m[2]+1) end -search(s::ByteString, r::Regex) = search(s,r,start(s)) +search(s::String, r::Regex, idx::Integer) = error("regex search is only available for bytestrings; use bytestring(s) to convert") +search(s::String, r::Regex) = search(s,r,start(s)) type RegexMatchIterator regex::Regex diff --git a/base/set.jl b/base/set.jl index 4b7f4046f605d..f1c23cb9fa0ec 100644 --- a/base/set.jl +++ b/base/set.jl @@ -16,6 +16,7 @@ elements(s::Set) = keys(s.hash) eltype{T}(s::Set{T}) = T has(s::Set, x) = has(s.hash, x) +contains(s::Set, x) = has(s, x) get(s::Set, x, deflt) = get(s.hash, x, false) add(s::Set, x) = (s.hash[x] = true; s) diff --git a/base/string.jl b/base/string.jl index d9aececb9c864..6ac97bab5e390 100644 --- a/base/string.jl +++ b/base/string.jl @@ -63,6 +63,7 @@ ref(s::String, v::AbstractVector) = symbol(s::String) = symbol(bytestring(s)) print(io::IO, s::String) = for c in s write(io, c) end +write(io::IO, s::String) = print(io, s) show(io::IO, s::String) = print_quoted(io, s) (*)(s::String...) = strcat(s...) @@ -156,7 +157,7 @@ function chr2ind(s::String, i::Integer) end end -typealias Chars Union(Char,AbstractVector{Char}) +typealias Chars Union(Char,AbstractVector{Char},Set{Char}) function strchr(s::String, c::Chars, i::Integer) if i < 1 error("index out of range") end @@ -174,7 +175,15 @@ strchr(s::String, c::Chars) = strchr(s,c,start(s)) contains(s::String, c::Char) = (strchr(s,c)!=0) -search(s::String, c::Chars, i::Integer) = (i=strchr(s,c,i); (i,nextind(s,i))) +function search(s::String, c::Chars, i::Integer) + if isempty(c) + return 1 <= i <= length(s)+1 ? (i,i) : + i == length(s)+2 ? (0,0) : + error("index out of range") + end + i=strchr(s,c,i) + (i, nextind(s,i)) +end search(s::String, c::Chars) = search(s,c,start(s)) function search(s::String, t::String, i::Integer) @@ -447,6 +456,8 @@ strcat(xs...) = string(xs...) 
# backwards compat print(io::IO, s::RopeString) = print(io, s.head, s.tail) +write(io::IO, s::RopeString) = (write(io, s.head); write(io, s.tail)) + ## transformed strings ## type TransformedString <: String @@ -465,17 +476,57 @@ end ## uppercase and lowercase transformations ## +const _TF_U = (c,i)->uppercase(c) +const _TF_L = (c,i)->lowercase(c) +const _TF_u = (c,i)->i==1 ? uppercase(c) : c +const _TF_l = (c,i)->i==1 ? lowercase(c) : c +const _TF_C = (c,i)->i==1 ? uppercase(c) : lowercase(c) +const _TF_c = (c,i)->i==1 ? lowercase(c) : uppercase(c) + uppercase(c::Char) = ccall(:towupper, Char, (Char,), c) lowercase(c::Char) = ccall(:towlower, Char, (Char,), c) uppercase(c::Uint8) = ccall(:toupper, Uint8, (Uint8,), c) lowercase(c::Uint8) = ccall(:tolower, Uint8, (Uint8,), c) -uppercase(s::String) = TransformedString((c,i)->uppercase(c), s) -lowercase(s::String) = TransformedString((c,i)->lowercase(c), s) +uppercase(s::String) = TransformedString(_TF_U, s) +lowercase(s::String) = TransformedString(_TF_L, s) -ucfirst(s::String) = TransformedString((c,i)->i==1 ? uppercase(c) : c, s) -lcfirst(s::String) = TransformedString((c,i)->i==1 ? 
lowercase(c) : c, s) +ucfirst(s::String) = TransformedString(_TF_u, s) +lcfirst(s::String) = TransformedString(_TF_l, s) + +function _transfunc_compose(f2::Function, f1::Function) + allf = [_TF_U, _TF_L, _TF_u, _TF_l, _TF_C, _TF_c] + if !contains(allf, f2) || !contains(allf, f1) + return nothing + end + if f2 == _TF_U || f2 == _TF_L || f2 == _TF_C || f2 == _TF_c || + f2 == f1 || + (f2 == _TF_u && f1 == _TF_l) || + (f2 == _TF_l && f1 == _TF_u) + return f2 + elseif (f2 == _TF_u && (f1 == _TF_U || f1 == _TF_C)) || + (f2 == _TF_l && (f1 == _TF_L || f1 == _TF_c)) + return f1 + elseif (f2 == _TF_u && f1 == _TF_L) + return _TF_C + elseif (f2 == _TF_l && f1 == _TF_U) + return _TF_c + elseif (f2 == _TF_u && f1 == _TF_c) + return _TF_U + elseif (f2 == _TF_l && f1 == _TF_C) + return _TF_L + end + error("this is a bug") +end + +function TransformedString(transform::Function, s::TransformedString) + newtf = _transfunc_compose(transform, s.transform) + if newtf === nothing + return invoke(TransformedString, (Function, String), transform, s) + end + TransformedString(newtf, s.string) +end const uc = uppercase const lc = lowercase @@ -500,7 +551,7 @@ function filter(f::Function, s::String) takebuf_string(out) end -has(s::String, c::Char) = has(Set(s...), c) +has(s::String, c::Char) = contains(s, c) ## string promotion rules ## @@ -864,7 +915,7 @@ function lpad(s::String, n::Integer, p::String) if m <= 0; return s; end l = strlen(p) if l==1 - return p^m * s + return bytestring(p^m * s) end q = div(m,l) r = m - q*l @@ -876,7 +927,7 @@ function rpad(s::String, n::Integer, p::String) if m <= 0; return s; end l = strlen(p) if l==1 - return s * p^m + return bytestring(s * p^m) end q = div(m,l) r = m - q*l diff --git a/base/utf8.jl b/base/utf8.jl index 4e964ed377b7a..29c46bc65f443 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -94,7 +94,7 @@ lcfirst(s::UTF8String) = string(lowercase(s[1]), s[2:]) ## outputing UTF-8 strings ## print(io::IO, s::UTF8String) = (write(io, s.data);nothing) 
-write(io, s::UTF8String) = write(io, s.data) +write(io::IO, s::UTF8String) = write(io, s.data) ## transcoding to UTF-8 ## diff --git a/doc/helpdb.jl b/doc/helpdb.jl index 010d2891cf0fc..b58aeb4482f25 100644 --- a/doc/helpdb.jl +++ b/doc/helpdb.jl @@ -549,21 +549,36 @@ collection[key...] = value "), -(E"Strings",E"ASCIIString",E"ASCIIString(::Array{Uint8, 1}) +(E"Strings",E"ascii",E"ascii(::Array{Uint8, 1}) Create an ASCII string from a byte array. "), -(E"Strings",E"UTF8String",E"UTF8String(::Array{Uint8, 1}) +(E"Strings",E"ascii",E"ascii(s) + + Convert a string to a contiguous ASCII string (all characters must + be valid ASCII characters). + +"), + +(E"Strings",E"utf8",E"utf8(::Array{Uint8, 1}) Create a UTF-8 string from a byte array. "), +(E"Strings",E"utf8",E"utf8(s) + + Convert a string to a contiguous UTF-8 string (all characters must + be valid UTF-8 characters). + +"), + (E"Strings",E"strchr",E"strchr(string, char[, i]) Return the index of 'char' in 'string', giving 0 if not found. The + second argument may also be a vector or a set of characters. The third argument optionally specifies a starting index. "), @@ -582,12 +597,29 @@ collection[key...] = value "), -(E"Strings",E"split",E"split(string, char, include_empty) +(E"Strings",E"search",E"search(string, chars[, start]) + + Search for the given characters within the given string. The second + argument may be a single character, a vector or a set of + characters, a string, or a regular expression (but regular + expressions are only allowed on contiguous strings, such as ASCII + or UTF-8 strings). The third argument optionally specifies a + starting index. The return value is a tuple with 2 integers: the + index of the match and the first valid index past the match (or an + index beyond the end of the string if the match is at the end); it + returns '(0,0)' if no match was found, and '(start,start)' if + 'chars' is empty. 
+ +"), + +(E"Strings",E"split",E"split(string, chars[, limit][, include_empty]) Return an array of strings by splitting the given string on - occurrences of the given character delimiter. The second argument - may also be a set of character delimiters to use. The third - argument specifies whether empty fields should be included. + occurrences of the given character delimiters, which may be + specified in any of the formats allowed by 'search''s second + argument. The last two arguments are optional; they are a + maximum size for the result and a flag determining whether empty + fields should be included in the result. "), diff --git a/doc/manual/metaprogramming.rst b/doc/manual/metaprogramming.rst index 80eb56b7fb63d..5cf565370c1dc 100644 --- a/doc/manual/metaprogramming.rst +++ b/doc/manual/metaprogramming.rst @@ -275,7 +275,7 @@ cause a compile-time error: :: julia> $a + b - not supported + unsupported or misplaced expression $ .. _man-macros: diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 9121cb5dc17c0..8ff9d7675af9d 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -401,17 +401,25 @@ Strings Convert a string to a contiguous byte array representation appropriate for passing it to C functions. -.. function:: ASCIIString(::Array{Uint8,1}) +.. function:: ascii(::Array{Uint8,1}) Create an ASCII string from a byte array. -.. function:: UTF8String(::Array{Uint8,1}) +.. function:: ascii(s) + + Convert a string to a contiguous ASCII string (all characters must be valid ASCII characters). + +.. function:: utf8(::Array{Uint8,1}) Create a UTF-8 string from a byte array. +.. function:: utf8(s) + + Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters). + .. function:: strchr(string, char[, i]) - Return the index of ``char`` in ``string``, giving 0 if not found. The third argument optionally specifies a starting index. + Return the index of ``char`` in ``string``, giving 0 if not found. 
The second argument may also be a vector or a set of characters. The third argument optionally specifies a starting index. .. function:: lpad(string, n, p) @@ -421,9 +429,13 @@ Strings Make a string at least ``n`` characters long by padding on the right with copies of ``p``. -.. function:: split(string, char, include_empty) +.. function:: search(string, chars[, start]) + + Search for the given characters within the given string. The second argument may be a single character, a vector or a set of characters, a string, or a regular expression (but regular expressions are only allowed on contiguous strings, such as ASCII or UTF-8 strings). The third argument optionally specifies a starting index. The return value is a tuple with 2 integers: the index of the match and the first valid index past the match (or an index beyond the end of the string if the match is at the end); it returns ``(0,0)`` if no match was found, and ``(start,start)`` if ``chars`` is empty. + +.. function:: split(string, chars[, limit][, include_empty]) - Return an array of strings by splitting the given string on occurrences of the given character delimiter. The second argument may also be a set of character delimiters to use. The third argument specifies whether empty fields should be included. + Return an array of strings by splitting the given string on occurrences of the given character delimiters, which may be specified in any of the formats allowed by ``search``'s second argument. The last two arguments are optional; they are a maximum size for the result and a flag determining whether empty fields should be included in the result. .. function:: strip(string)