From 1fc7018d477a8bf7d312302d6a2bc45848f3e91a Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 12 Jul 2016 16:21:08 -0400 Subject: [PATCH] export and document transcode (#17323) * export and document transcode from #16974, add transcode(String, x) and transcode(T, ::String) convenience methods * docs * support UTF-32 in transcode * don't use splatting for UTF-32 to String conversion * typo * eliminate method ambiguities * re-run genstdlib * doc clarification * typo --- NEWS.md | 4 ++++ base/c.jl | 37 ++++++++++++++++++++++++++++--------- base/env.jl | 4 ++-- base/exports.jl | 1 + base/file.jl | 4 ++-- base/interactiveutil.jl | 2 +- base/libc.jl | 2 +- base/path.jl | 4 ++-- doc/manual/strings.rst | 8 +++++--- doc/stdlib/strings.rst | 11 ++++++++++- test/misc.jl | 8 +++++++- 11 files changed, 63 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71440f4f7f935..e48f3fe80e22f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -132,6 +132,9 @@ Library improvements `String(s)`, `unsafe_string(ptr)` (formerly `bytestring(ptr)`), and `unsafe_wrap(String, ptr)` (formerly `pointer_to_string`) ([#16731]). + * A `transcode(T, src)` function is now exported for converting data + between UTF-xx Unicode encodings ([#17323]). + * Most of the combinatorics functions have been moved from `Base` to the [Combinatorics.jl package](https://github.com/JuliaLang/Combinatorics.jl) ([#13897]). @@ -334,4 +337,5 @@ Deprecated or removed [#17075]: https://github.com/JuliaLang/julia/issues/17075 [#17266]: https://github.com/JuliaLang/julia/issues/17266 [#17300]: https://github.com/JuliaLang/julia/issues/17300 +[#17323]: https://github.com/JuliaLang/julia/issues/17323 [#17374]: https://github.com/JuliaLang/julia/issues/17374 diff --git a/base/c.jl b/base/c.jl index 97a1815b7d23c..8bd9b85179b24 100644 --- a/base/c.jl +++ b/base/c.jl @@ -128,20 +128,39 @@ function cwstring(s::AbstractString) end end -# transcoding between data in UTF-8 and UTF-16 for Windows APIs +# transcoding between data in UTF-8 and UTF-16 for Windows APIs, +# and also UTF-32 for APIs using Cwchar_t on other platforms. + """ - Base.transcode(T,src::Vector{U}) + transcode(T, src) + +Convert string data between Unicode encodings. `src` is either a +`String` or a `Vector{UIntXX}` of UTF-XX code units, where +`XX` is 8, 16, or 32. `T` indicates the encoding of the return value: +`String` to return a (UTF-8 encoded) `String` or `UIntXX` +to return a `Vector{UIntXX}` of UTF-`XX` data. (The alias `Cwchar_t` +can also be used as the integer type, for converting `wchar_t*` strings +used by external C libraries.) -Transcodes unicode data `src` to a different encoding, where `U` and `T` are the integers -denoting the input and output code units. Currently supported are UTF-8 and UTF-16, which -are denoted by integers `UInt8` and `UInt16`, respectively. +The `transcode` function succeeds as long as the input data can be +reasonably represented in the target encoding; it always succeeds for +conversions between UTF-XX encodings, even for invalid Unicode data. -NULs are handled like any other character (i.e. the output will be NUL-terminated if and -only if the `src` is). +Only conversion to/from UTF-8 is currently supported. """ function transcode end -transcode{T<:Union{UInt8,UInt16}}(::Type{T}, src::Vector{T}) = src -transcode(::Type{Int32}, src::Vector{UInt32}) = reinterpret(Int32, src) + +transcode{T<:Union{UInt8,UInt16,UInt32,Int32}}(::Type{T}, src::Vector{T}) = src +transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::String) = T[T(c) for c in src] +transcode{T<:Union{Int32,UInt32}}(::Type{T}, src::Vector{UInt8}) = transcode(T, String(src)) +function transcode{S<:Union{Int32,UInt32}}(::Type{UInt8}, src::Vector{S}) + buf = IOBuffer() + for c in src; print(buf, Char(c)); end + takebuf_array(buf) +end +transcode(::Type{String}, src::String) = src +transcode(T, src::String) = transcode(T, src.data) +transcode(::Type{String}, src) = String(transcode(UInt8, src)) function transcode(::Type{UInt16}, src::Vector{UInt8}) dst = UInt16[] diff --git a/base/env.jl b/base/env.jl index cb21ecbe0dcf4..0f41a5bbf170e 100644 --- a/base/env.jl +++ b/base/env.jl @@ -19,7 +19,7 @@ function access_env(onError::Function, str::AbstractString) error(string("getenv: ", str, ' ', len, "-1 != ", ret, ": ", Libc.FormatMessage())) end pop!(val) # NUL - return String(transcode(UInt8, val)) + return transcode(String, val) end function _setenv(svar::AbstractString, sval::AbstractString, overwrite::Bool=true) @@ -97,7 +97,7 @@ function next(hash::EnvHash, block::Tuple{Ptr{UInt16},Ptr{UInt16}}) len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos) buf = Array{UInt16}(len) unsafe_copy!(pointer(buf), pos, len) - env = String(transcode(UInt8, buf)) + env = transcode(String, buf) m = match(r"^(=?[^=]+)=(.*)$"s, env) if m === nothing error("malformed environment entry: $env") diff --git a/base/exports.jl b/base/exports.jl index 4cdc9e4739042..dda16f688f0c8 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -874,6 +874,7 @@ export strip, strwidth, summary, + transcode, ucfirst, unescape_string, uppercase, diff --git a/base/file.jl b/base/file.jl index 97aa73a31aee3..396ba739f2cd4 100644 --- a/base/file.jl +++ b/base/file.jl @@ -203,7 +203,7 @@ function tempdir() error("GetTempPath failed: $(Libc.FormatMessage())") end resize!(temppath,lentemppath) - return String(transcode(UInt8, temppath)) + return transcode(String, temppath) end tempname(uunique::UInt32=UInt32(0)) = tempname(tempdir(), uunique) const temp_prefix = cwstring("jl_") @@ -216,7 +216,7 @@ function tempname(temppath::AbstractString,uunique::UInt32) error("GetTempFileName failed: $(Libc.FormatMessage())") end resize!(tname,lentname) - return String(transcode(UInt8, tname)) + return transcode(String, tname) end function mktemp(parent=tempdir()) filename = tempname(parent, UInt32(0)) diff --git a/base/interactiveutil.jl b/base/interactiveutil.jl index ce4274d2551a1..6ac54783f30e4 100644 --- a/base/interactiveutil.jl +++ b/base/interactiveutil.jl @@ -150,7 +150,7 @@ elseif is_windows() len = 0 while unsafe_load(plock, len+1) != 0; len += 1; end # get Vector{UInt16}, transcode data to UTF-8, make a String of it - s = String(transcode(UInt8, unsafe_wrap(Array, plock, len))) + s = transcode(String, unsafe_wrap(Array, plock, len)) systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{UInt16},), plock)) return s end diff --git a/base/libc.jl b/base/libc.jl index 6943d457ebde7..8020147d42ce1 100644 --- a/base/libc.jl +++ b/base/libc.jl @@ -277,7 +277,7 @@ if is_windows() buf = Array{UInt16}(len) unsafe_copy!(pointer(buf), p, len) ccall(:LocalFree,stdcall,Ptr{Void},(Ptr{Void},),p) - return String(transcode(UInt8, buf)) + return transcode(String, buf) end end diff --git a/base/path.jl b/base/path.jl index 494250dc29683..d19779734913a 100644 --- a/base/path.jl +++ b/base/path.jl @@ -136,7 +136,7 @@ function realpath(path::AbstractString) systemerror(:realpath, n == 0) x = n < length(buf) # is the buffer big enough? resize!(buf, n) # shrink if x, grow if !x - x && return String(transcode(UInt8, buf)) + x && return transcode(String, buf) end end @@ -150,7 +150,7 @@ function longpath(path::AbstractString) systemerror(:longpath, n == 0) x = n < length(buf) # is the buffer big enough? resize!(buf, n) # shrink if x, grow if !x - x && return String(transcode(UInt8, buf)) + x && return transcode(String, buf) end end diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst index 4adec34080742..5628c1d415e1d 100644 --- a/doc/manual/strings.rst +++ b/doc/manual/strings.rst @@ -352,14 +352,16 @@ exception handling required: y -Julia uses UTF-8 encoding by default, and support for new encodings can +Julia uses the UTF-8 encoding by default, and support for new encodings can be added by packages. For example, the `LegacyStrings.jl `_ package implements ``UTF16String`` and ``UTF32String`` types. Additional discussion of other encodings and how to implement support for them is beyond the scope of this document for the time being. For further discussion of UTF-8 encoding issues, -see the section below on `byte array literals <#Byte+Array+Literals>`_, -which goes into some greater detail. +see the section below on `byte array literals <#Byte+Array+Literals>`_. +The :func:`transcode` function is provided to convert data between +the various UTF-xx encodings, primarily for working with external +data and libraries. .. _man-string-interpolation: diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index b27e7616a1e1e..1e6cd2a46b5cb 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -56,6 +56,16 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 bytes. This representation is often appropriate for passing strings to C. +.. function:: transcode(T, src) + + .. Docstring generated from Julia source + + Convert string data between Unicode encodings. ``src`` is either a ``String`` or a ``Vector{UIntXX}`` of UTF-XX code units, where ``XX`` is 8, 16, or 32. ``T`` indicates the encoding of the return value: ``String`` to return a (UTF-8 encoded) ``String`` or ``UIntXX`` to return a ``Vector{UIntXX}`` of UTF-``XX`` data. (The alias ``Cwchar_t`` can also be used as the integer type, for converting ``wchar_t*`` strings used by external C libraries.) + + The ``transcode`` function succeeds as long as the input data can be reasonably represented in the target encoding; it always succeeds for conversions between UTF-XX encodings, even for invalid Unicode data. + + Only conversion to/from UTF-8 is currently supported. + .. function:: unsafe_string(p::Ptr{UInt8}, [length::Integer]) .. Docstring generated from Julia source @@ -472,4 +482,3 @@ .. Docstring generated from Julia source General unescaping of traditional C and Unicode escape sequences. Reverse of :func:`escape_string`\ . See also :func:`unescape_string`\ . - diff --git a/test/misc.jl b/test/misc.jl index e3ae6b72e8ce6..16066830498e9 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -209,7 +209,6 @@ whos(IOBuffer(), Tmp14173) # warm up @test @allocated(whos(IOBuffer(), Tmp14173)) < 10000 ## test conversion from UTF-8 to UTF-16 (for Windows APIs) -import Base.Libc: transcode # empty arrays @test transcode(UInt16, UInt8[]) == UInt16[] @@ -376,6 +375,13 @@ for (X,Y,Z) in ((V16,V16,V16), (I16,V16,I16), (V16,I16,V16), (V16,V16,I16), (I16 end end +let s = "abcα🐨\0x\0" + for T in (UInt8, UInt16, UInt32, Int32) + @test transcode(T, s) == transcode(T, s.data) + @test transcode(String, transcode(T, s)) == s + end +end + # clipboard functionality if is_windows() for str in ("Hello, world.", "∀ x ∃ y", "")