From bdda16419bc71c6e9cce045f2a3bcb352a425ace Mon Sep 17 00:00:00 2001 From: christian Date: Wed, 4 Aug 2021 14:02:34 +0200 Subject: [PATCH] ARROW-12946: [C++] String swap case kernel This PR adds `swapcase` compute kernel for string. It is similar to `Python str.swapcase()` Closes #10855 from Christian8491/ARROW-12946-String-swap-case-kernel Authored-by: christian Signed-off-by: Antoine Pitrou --- .../arrow/compute/kernels/scalar_string.cc | 349 +++++++++++------- .../compute/kernels/scalar_string_test.cc | 28 ++ docs/source/cpp/compute.rst | 4 + docs/source/python/api/compute.rst | 2 + 4 files changed, 240 insertions(+), 143 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 6ef08a7d2bb5b..5359567fc125d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -81,6 +81,51 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { : utf8_code_unit; } +static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'a') && (ascii_character <= 'z'); +} + +static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'A') && (ascii_character <= 'Z'); +} + +static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { + return IsLowerCaseCharacterAscii(ascii_character) || + IsUpperCaseCharacterAscii(ascii_character); +} + +static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // same +} + +static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')) || + ((ascii_character >= 'a') && (ascii_character <= 'z')) || + ((ascii_character >= 'A') && (ascii_character <= 'Z')); +} + +static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')); +} + +static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || + (ascii_character == ' '); +} + +static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= ' ') && (ascii_character <= '~')); +} + +static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { + if (IsLowerCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit -= 32; + } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit += 32; + } + return utf8_code_unit; +} + template static inline bool IsAsciiCharacter(T character) { return character < 128; @@ -109,17 +154,130 @@ constexpr uint32_t kMaxCodepointLookup = 0xffff; // up to this codepoint is in a lookup table std::vector lut_upper_codepoint; std::vector lut_lower_codepoint; +std::vector lut_swapcase_codepoint; std::vector lut_category; std::once_flag flag_case_luts; +// IsAlpha/Digit etc + +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) { + utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup + ? lut_category[codepoint] + : utf8proc_category(codepoint); + uint32_t general_category_bit = 1 << general_category; + // for e.g. undefined (but valid) codepoints, general_category == 0 == + // UTF8PROC_CATEGORY_CN + return (general_category != UTF8PROC_CATEGORY_CN) && + ((general_category_bit & mask) != 0); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, static_cast(1u << category), + categories...); +} + +static inline bool IsCasedCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) || + ((static_cast(utf8proc_toupper(codepoint)) != codepoint) || + (static_cast(utf8proc_tolower(codepoint)) != codepoint)); +} + +static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) { + // although this trick seems to work for upper case, this is not enough for lower case + // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the + // best we can do + return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) || + ((static_cast(utf8proc_toupper(codepoint)) != codepoint) && + (static_cast(utf8proc_tolower(codepoint)) == codepoint))) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); +} + +static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) { + // this seems to be a good workaround for utf8proc not having case information + // https://github.com/JuliaStrings/utf8proc/issues/195 + return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) || + ((static_cast(utf8proc_toupper(codepoint)) == codepoint) && + (static_cast(utf8proc_tolower(codepoint)) != codepoint))) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); +} + +static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory( + codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO); +} + +static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { + // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. + // utf8proc has no support for this, this is the best we can do: + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { + // Formally this is not correct, but utf8proc does not allow us to query for Numerical + // properties, e.g. Numeric_Value and Numeric_Type + // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or + // Numeric_Type=Numeric. + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) { + auto property = utf8proc_get_property(codepoint); + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) || + property->bidi_class == UTF8PROC_BIDI_CLASS_WS || + property->bidi_class == UTF8PROC_BIDI_CLASS_B || + property->bidi_class == UTF8PROC_BIDI_CLASS_S; +} + +static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { + uint32_t general_category = utf8proc_category(codepoint); + return (general_category != UTF8PROC_CATEGORY_CN) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC, + UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, + UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS, + UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP); +} + void EnsureLookupTablesFilled() { std::call_once(flag_case_luts, []() { lut_upper_codepoint.reserve(kMaxCodepointLookup + 1); lut_lower_codepoint.reserve(kMaxCodepointLookup + 1); + lut_swapcase_codepoint.reserve(kMaxCodepointLookup + 1); for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) { lut_upper_codepoint.push_back(utf8proc_toupper(i)); lut_lower_codepoint.push_back(utf8proc_tolower(i)); lut_category.push_back(utf8proc_category(i)); + + if (IsLowerCaseCharacterUnicode(i)) { + lut_swapcase_codepoint.push_back(utf8proc_toupper(i)); + } else if (IsUpperCaseCharacterUnicode(i)) { + lut_swapcase_codepoint.push_back(utf8proc_tolower(i)); + } else { + lut_swapcase_codepoint.push_back(i); + } } }); } @@ -318,6 +476,26 @@ struct UTF8LowerTransform : public CaseMappingTransform { template using UTF8Lower = StringTransformExec>; +struct UTF8SwapCaseTransform : public CaseMappingTransform { + static uint32_t TransformCodepoint(uint32_t codepoint) { + if (codepoint <= kMaxCodepointLookup) { + return lut_swapcase_codepoint[codepoint]; + } else { + if (IsLowerCaseCharacterUnicode(codepoint)) { + return utf8proc_toupper(codepoint); + } else if (IsUpperCaseCharacterUnicode(codepoint)) { + return utf8proc_tolower(codepoint); + } + } + + return codepoint; + } +}; + +template +using UTF8SwapCase = + StringTransformExec>; + #endif // ARROW_WITH_UTF8PROC struct AsciiReverseTransform : public StringTransformBase { @@ -443,6 +621,17 @@ struct AsciiLower { } }; +void TransformAsciiSwapCase(const uint8_t* input, int64_t length, uint8_t* output) { + std::transform(input, input + length, output, ascii_swapcase); +} + +template +struct AsciiSwapCase { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return StringDataTransform(ctx, batch, TransformAsciiSwapCase, out); + } +}; + // ---------------------------------------------------------------------- // exact pattern detection @@ -1351,149 +1540,6 @@ void AddSlice(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } -// IsAlpha/Digit etc - -#ifdef ARROW_WITH_UTF8PROC - -static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) { - utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup - ? lut_category[codepoint] - : utf8proc_category(codepoint); - uint32_t general_category_bit = 1 << general_category; - // for e.g. undefined (but valid) codepoints, general_category == 0 == - // UTF8PROC_CATEGORY_CN - return (general_category != UTF8PROC_CATEGORY_CN) && - ((general_category_bit & mask) != 0); -} - -template -static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask, - utf8proc_category_t category, - Categories... categories) { - return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...); -} - -template -static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, - utf8proc_category_t category, - Categories... categories) { - return HasAnyUnicodeGeneralCategory(codepoint, static_cast(1u << category), - categories...); -} - -static inline bool IsCasedCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, - UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) || - ((static_cast(utf8proc_toupper(codepoint)) != codepoint) || - (static_cast(utf8proc_tolower(codepoint)) != codepoint)); -} - -static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) { - // although this trick seems to work for upper case, this is not enough for lower case - // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the - // best we can do - return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) || - ((static_cast(utf8proc_toupper(codepoint)) != codepoint) && - (static_cast(utf8proc_tolower(codepoint)) == codepoint))) && - !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); -} - -static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) { - // this seems to be a good workaround for utf8proc not having case information - // https://github.com/JuliaStrings/utf8proc/issues/195 - return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) || - ((static_cast(utf8proc_toupper(codepoint)) == codepoint) && - (static_cast(utf8proc_tolower(codepoint)) != codepoint))) && - !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); -} - -static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory( - codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, - UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND, - UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); -} - -static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, - UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, - UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO); -} - -static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); -} - -static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { - // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. - // utf8proc has no support for this, this is the best we can do: - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); -} - -static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { - // Formally this is not correct, but utf8proc does not allow us to query for Numerical - // properties, e.g. Numeric_Value and Numeric_Type - // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or - // Numeric_Type=Numeric. - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, - UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); -} - -static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) { - auto property = utf8proc_get_property(codepoint); - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) || - property->bidi_class == UTF8PROC_BIDI_CLASS_WS || - property->bidi_class == UTF8PROC_BIDI_CLASS_B || - property->bidi_class == UTF8PROC_BIDI_CLASS_S; -} - -static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { - uint32_t general_category = utf8proc_category(codepoint); - return (general_category != UTF8PROC_CATEGORY_CN) && - !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC, - UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, - UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS, - UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP); -} - -#endif - -static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { - return (ascii_character >= 'a') && (ascii_character <= 'z'); -} - -static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { - return (ascii_character >= 'A') && (ascii_character <= 'Z'); -} - -static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { - return IsLowerCaseCharacterAscii(ascii_character) || - IsUpperCaseCharacterAscii(ascii_character); -} - -static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { - return IsCasedCharacterAscii(ascii_character); // same -} - -static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= '0') && (ascii_character <= '9')) || - ((ascii_character >= 'a') && (ascii_character <= 'z')) || - ((ascii_character >= 'A') && (ascii_character <= 'Z')); -} - -static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= '0') && (ascii_character <= '9')); -} - -static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || - (ascii_character == ' '); -} - -static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= ' ') && (ascii_character <= '~')); -} - template struct CharacterPredicateUnicode { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, @@ -4020,6 +4066,14 @@ const FunctionDoc ascii_lower_doc( "non-ASCII characters, use \"utf8_lower\" instead."), {"strings"}); +const FunctionDoc ascii_swapcase_doc( + "Transform ASCII input lowercase characters to uppercase and uppercase characters to " + "lowercase", + ("For each string in `strings`, return a string with opposite casing.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_swapcase\" instead."), + {"strings"}); + const FunctionDoc utf8_upper_doc( "Transform input to uppercase", ("For each string in `strings`, return an uppercase version."), {"strings"}); @@ -4028,6 +4082,11 @@ const FunctionDoc utf8_lower_doc( "Transform input to lowercase", ("For each string in `strings`, return a lowercase version."), {"strings"}); +const FunctionDoc utf8_swapcase_doc( + "Transform input lowercase characters to uppercase and uppercase characters to " + "lowercase", + ("For each string in `strings`, return an opposite case version."), {"strings"}); + const FunctionDoc ascii_reverse_doc( "Reverse ASCII input", ("For each ASCII string in `strings`, return a reversed version.\n\n" @@ -4052,6 +4111,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_lower", registry, &ascii_lower_doc, MemAllocation::NO_PREALLOCATE); + MakeUnaryStringBatchKernel( + "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_trim_whitespace", registry, &ascii_trim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_ltrim_whitespace", registry, @@ -4095,6 +4156,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { #ifdef ARROW_WITH_UTF8PROC MakeUnaryStringUTF8TransformKernel("utf8_upper", registry, &utf8_upper_doc); MakeUnaryStringUTF8TransformKernel("utf8_lower", registry, &utf8_lower_doc); + MakeUnaryStringUTF8TransformKernel("utf8_swapcase", registry, + &utf8_swapcase_doc); MakeUnaryStringBatchKernel("utf8_trim_whitespace", registry, &utf8_trim_whitespace_doc); MakeUnaryStringBatchKernel("utf8_ltrim_whitespace", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 785c82ca04494..3aa6f5368d2f5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -395,6 +395,14 @@ TYPED_TEST(TestStringKernels, AsciiLower) { "[\"aaazzæÆ&\", null, \"\", \"bbb\"]"); } +TYPED_TEST(TestStringKernels, AsciiSwapCase) { + this->CheckUnary("ascii_swapcase", "[]", this->type(), "[]"); + this->CheckUnary("ascii_swapcase", "[\"aAazZæÆ&\", null, \"\", \"BbB\"]", this->type(), + "[\"AaAZzæÆ&\", null, \"\", \"bBb\"]"); + this->CheckUnary("ascii_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), + "[\"HeLLo, wOrLD!\", \"$. a35?\"]"); +} + TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), @@ -493,6 +501,26 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { CallFunction("utf8_lower", {invalid_input})); } +TYPED_TEST(TestStringKernels, Utf8SwapCase) { + this->CheckUnary("utf8_swapcase", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), + "[\"AaAZzÆæ&\", null, \"\", \"B\"]"); + + // test varying encoding lengths and thus changing indices/offsets + this->CheckUnary("utf8_swapcase", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", this->type(), + "[\"ɑⱤɽOw\", null, \"Ii\", \"b\"]"); + + // test maximum buffer growth + this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]"); + + this->CheckUnary("ascii_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), + "[\"HeLLo, wOrLD!\", \"$. a35?\"]"); + + // Test invalid data + auto invalid_input = ArrayFromJSON(this->type(), "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]"); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"), + CallFunction("utf8_swapcase", {invalid_input})); +} + TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index b389b43c02e46..01dc1d92e17ad 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -591,6 +591,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_reverse | Unary | String-like | String-like | | \(2) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_swapcase | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_upper | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | binary_length | Unary | Binary- or String-like | Int32 or Int64 | | \(3) | @@ -609,6 +611,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_reverse | Unary | String-like | String-like | | \(9) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| utf8_swapcase | Unary | String-like | String-like | | \(8) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_upper | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 2fd0bad07e763..c503cba319ca1 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -259,6 +259,7 @@ String Transforms ascii_rpad ascii_rtrim ascii_rtrim_whitespace + ascii_swapcase ascii_trim ascii_upper binary_length @@ -276,6 +277,7 @@ String Transforms utf8_rpad utf8_rtrim utf8_rtrim_whitespace + utf8_swapcase utf8_trim utf8_upper