From e5f3e983d6c6ae83b5f4b3e45e655be97c068428 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 2 Aug 2021 15:43:25 -0500 Subject: [PATCH 1/9] Kernel implementation for asciistring swapcase --- .../arrow/compute/kernels/scalar_string.cc | 101 +++++++++++------- .../compute/kernels/scalar_string_test.cc | 6 ++ 2 files changed, 71 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 6ef08a7d2bb5b..0c67e856a1cf7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -81,6 +81,51 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { : utf8_code_unit; } +static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'a') && (ascii_character <= 'z'); +} + +static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'A') && (ascii_character <= 'Z'); +} + +static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { + return IsLowerCaseCharacterAscii(ascii_character) || + IsUpperCaseCharacterAscii(ascii_character); +} + +static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // same +} + +static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')) || + ((ascii_character >= 'a') && (ascii_character <= 'z')) || + ((ascii_character >= 'A') && (ascii_character <= 'Z')); +} + +static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')); +} + +static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || + (ascii_character == ' '); +} + +static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= ' ') && 
(ascii_character <= '~')); +} + +static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { + if (IsLowerCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit -= 32; + } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit += 32; + } + return utf8_code_unit; +} + template static inline bool IsAsciiCharacter(T character) { return character < 128; @@ -443,6 +488,17 @@ struct AsciiLower { } }; +void TransformAsciiSwapCase(const uint8_t* input, int64_t length, uint8_t* output) { + std::transform(input, input + length, output, ascii_swapcase); +} + +template +struct AsciiSwapCase { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return StringDataTransform(ctx, batch, TransformAsciiSwapCase, out); + } +}; + // ---------------------------------------------------------------------- // exact pattern detection @@ -1458,42 +1514,6 @@ static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { #endif -static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { - return (ascii_character >= 'a') && (ascii_character <= 'z'); -} - -static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { - return (ascii_character >= 'A') && (ascii_character <= 'Z'); -} - -static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { - return IsLowerCaseCharacterAscii(ascii_character) || - IsUpperCaseCharacterAscii(ascii_character); -} - -static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { - return IsCasedCharacterAscii(ascii_character); // same -} - -static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= '0') && (ascii_character <= '9')) || - ((ascii_character >= 'a') && (ascii_character <= 'z')) || - ((ascii_character >= 'A') && (ascii_character <= 'Z')); -} - -static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= '0') && (ascii_character <= '9')); -} - -static inline bool 
IsSpaceCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || - (ascii_character == ' '); -} - -static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= ' ') && (ascii_character <= '~')); -} - template struct CharacterPredicateUnicode { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, @@ -4020,6 +4040,13 @@ const FunctionDoc ascii_lower_doc( "non-ASCII characters, use \"utf8_lower\" instead."), {"strings"}); +const FunctionDoc ascii_swapcase_doc( + "Transform ASCII input lowercase characters to uppercase and uppercase characters to lowercase", + ("For each string in `strings`, return a string with opposite casing.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_swapcase\" instead."), + {"strings"}); + const FunctionDoc utf8_upper_doc( "Transform input to uppercase", ("For each string in `strings`, return an uppercase version."), {"strings"}); @@ -4052,6 +4079,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_lower", registry, &ascii_lower_doc, MemAllocation::NO_PREALLOCATE); + MakeUnaryStringBatchKernel("ascii_swapcase", registry, &ascii_swapcase_doc, + MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_trim_whitespace", registry, &ascii_trim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_ltrim_whitespace", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 785c82ca04494..5eddc94124f07 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -395,6 +395,12 @@ TYPED_TEST(TestStringKernels, AsciiLower) { "[\"aaazzæÆ&\", null, \"\", \"bbb\"]"); } +TYPED_TEST(TestStringKernels, AsciiSwapCase) { + 
this->CheckUnary("ascii_swapcase", "[]", this->type(), "[]"); + this->CheckUnary("ascii_swapcase", "[\"aAazZæÆ&\", null, \"\", \"BbB\"]", this->type(), + "[\"AaAZzæÆ&\", null, \"\", \"bBb\"]"); +} + TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), From 0d0decf1b80b6ffd2064b54cef20a654bec51496 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 2 Aug 2021 15:56:44 -0500 Subject: [PATCH 2/9] Apply clang-format for swapcase --- cpp/src/arrow/compute/kernels/scalar_string.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 0c67e856a1cf7..8568d4ffe2d68 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -4041,7 +4041,8 @@ const FunctionDoc ascii_lower_doc( {"strings"}); const FunctionDoc ascii_swapcase_doc( - "Transform ASCII input lowercase characters to uppercase and uppercase characters to lowercase", + "Transform ASCII input lowercase characters to uppercase and uppercase characters to " + "lowercase", ("For each string in `strings`, return a string with opposite casing.\n\n" "This function assumes the input is fully ASCII. 
If it may contain\n" "non-ASCII characters, use \"utf8_swapcase\" instead."), @@ -4079,8 +4080,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_lower", registry, &ascii_lower_doc, MemAllocation::NO_PREALLOCATE); - MakeUnaryStringBatchKernel("ascii_swapcase", registry, &ascii_swapcase_doc, - MemAllocation::NO_PREALLOCATE); + MakeUnaryStringBatchKernel( + "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_trim_whitespace", registry, &ascii_trim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_ltrim_whitespace", registry, From 77948a03f8d851cfc01467f59d1f8d75bb50f1f0 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 2 Aug 2021 16:17:21 -0500 Subject: [PATCH 3/9] Add docs for swapcase string compute kernel --- docs/source/cpp/compute.rst | 2 ++ docs/source/python/api/compute.rst | 1 + 2 files changed, 3 insertions(+) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index b389b43c02e46..d2674f6cd36ee 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -591,6 +591,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_reverse | Unary | String-like | String-like | | \(2) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_swapcase | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_upper | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | binary_length | Unary | Binary- or String-like | Int32 or Int64 | | \(3) | diff --git 
a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 2fd0bad07e763..faffe826569af 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -259,6 +259,7 @@ String Transforms ascii_rpad ascii_rtrim ascii_rtrim_whitespace + ascii_swapcase ascii_trim ascii_upper binary_length From 9df4b5dec9ebeb5d8db254d971f9bb0b8a6b1316 Mon Sep 17 00:00:00 2001 From: christian Date: Mon, 2 Aug 2021 23:10:49 -0500 Subject: [PATCH 4/9] kernel implementation for utf8 swapcase --- .../arrow/compute/kernels/scalar_string.cc | 37 +++++++++++++++++++ .../compute/kernels/scalar_string_test.cc | 17 +++++++++ docs/source/cpp/compute.rst | 2 + docs/source/python/api/compute.rst | 1 + 4 files changed, 57 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 8568d4ffe2d68..8b21392b2eb6b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -154,6 +154,7 @@ constexpr uint32_t kMaxCodepointLookup = 0xffff; // up to this codepoint is in a lookup table std::vector lut_upper_codepoint; std::vector lut_lower_codepoint; +std::vector lut_swapcase_codepoint; std::vector lut_category; std::once_flag flag_case_luts; @@ -161,10 +162,19 @@ void EnsureLookupTablesFilled() { std::call_once(flag_case_luts, []() { lut_upper_codepoint.reserve(kMaxCodepointLookup + 1); lut_lower_codepoint.reserve(kMaxCodepointLookup + 1); + lut_swapcase_codepoint.reserve(kMaxCodepointLookup + 1); for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) { lut_upper_codepoint.push_back(utf8proc_toupper(i)); lut_lower_codepoint.push_back(utf8proc_tolower(i)); lut_category.push_back(utf8proc_category(i)); + + if (utf8proc_islower(i)) { + lut_swapcase_codepoint.push_back(utf8proc_toupper(i)); + } else if (utf8proc_isupper(i)) { + lut_swapcase_codepoint.push_back(utf8proc_tolower(i)); + } else { + lut_swapcase_codepoint.push_back(i); + } } }); } 
@@ -363,6 +373,26 @@ struct UTF8LowerTransform : public CaseMappingTransform { template using UTF8Lower = StringTransformExec>; +struct UTF8SwapCaseTransform : public CaseMappingTransform { + static uint32_t TransformCodepoint(uint32_t codepoint) { + if (codepoint <= kMaxCodepointLookup) { + return lut_swapcase_codepoint[codepoint]; + } else { + if (utf8proc_islower(codepoint)) { + return utf8proc_toupper(codepoint); + } else if (utf8proc_isupper(codepoint)) { + return utf8proc_tolower(codepoint); + } else { + return codepoint; + } + } + } +}; + +template +using UTF8SwapCase = + StringTransformExec>; + #endif // ARROW_WITH_UTF8PROC struct AsciiReverseTransform : public StringTransformBase { @@ -4056,6 +4086,11 @@ const FunctionDoc utf8_lower_doc( "Transform input to lowercase", ("For each string in `strings`, return a lowercase version."), {"strings"}); +const FunctionDoc utf8_swapcase_doc( + "Transform input lowercase characters to uppercase and uppercase characters to " + "lowercase", + ("For each string in `strings`, return an opposite case version."), {"strings"}); + const FunctionDoc ascii_reverse_doc( "Reverse ASCII input", ("For each ASCII string in `strings`, return a reversed version.\n\n" @@ -4125,6 +4160,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { #ifdef ARROW_WITH_UTF8PROC MakeUnaryStringUTF8TransformKernel("utf8_upper", registry, &utf8_upper_doc); MakeUnaryStringUTF8TransformKernel("utf8_lower", registry, &utf8_lower_doc); + MakeUnaryStringUTF8TransformKernel("utf8_swapcase", registry, + &utf8_swapcase_doc); MakeUnaryStringBatchKernel("utf8_trim_whitespace", registry, &utf8_trim_whitespace_doc); MakeUnaryStringBatchKernel("utf8_ltrim_whitespace", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 5eddc94124f07..246aa07701b16 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ 
-499,6 +499,23 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { CallFunction("utf8_lower", {invalid_input})); } +TYPED_TEST(TestStringKernels, Utf8SwapCase) { + this->CheckUnary("utf8_swapcase", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), + "[\"AaAZzÆæ&\", null, \"\", \"B\"]"); + + // test varying encoding lengths and thus changing indices/offsets + this->CheckUnary("utf8_swapcase", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", this->type(), + "[\"ɑⱤɽOw\", null, \"Ii\", \"b\"]"); + + // test maximum buffer growth + this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]"); + + // Test invalid data + auto invalid_input = ArrayFromJSON(this->type(), "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]"); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"), + CallFunction("utf8_swapcase", {invalid_input})); +} + TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index d2674f6cd36ee..01dc1d92e17ad 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -611,6 +611,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_reverse | Unary | String-like | String-like | | \(9) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| utf8_swapcase | Unary | String-like | String-like | | \(8) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_upper | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ diff --git a/docs/source/python/api/compute.rst 
b/docs/source/python/api/compute.rst index faffe826569af..c503cba319ca1 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -277,6 +277,7 @@ String Transforms utf8_rpad utf8_rtrim utf8_rtrim_whitespace + utf8_swapcase utf8_trim utf8_upper From 8e07c09682f7415df063980cc4957158af15681c Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 3 Aug 2021 10:32:34 -0500 Subject: [PATCH 5/9] Add swapcase tests without casing --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 246aa07701b16..3aa6f5368d2f5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -399,6 +399,8 @@ TYPED_TEST(TestStringKernels, AsciiSwapCase) { this->CheckUnary("ascii_swapcase", "[]", this->type(), "[]"); this->CheckUnary("ascii_swapcase", "[\"aAazZæÆ&\", null, \"\", \"BbB\"]", this->type(), "[\"AaAZzæÆ&\", null, \"\", \"bBb\"]"); + this->CheckUnary("ascii_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), + "[\"HeLLo, wOrLD!\", \"$. a35?\"]"); } TYPED_TEST(TestStringKernels, AsciiReverse) { @@ -510,6 +512,9 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { // test maximum buffer growth this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]"); + this->CheckUnary("utf8_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), + "[\"HeLLo, wOrLD!\", \"$.
a35?\"]"); + // Test invalid data auto invalid_input = ArrayFromJSON(this->type(), "[\"Ⱥa\xFFⱭ\", \"Ɽ\xe1\xbdⱤaA\"]"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"), From 2e154aa26c3b5129b98219cf31592c994a89b631 Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 3 Aug 2021 13:40:54 -0500 Subject: [PATCH 6/9] moving return statement to be the last one --- cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 8b21392b2eb6b..f7963da7cab13 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -382,10 +382,10 @@ struct UTF8SwapCaseTransform : public CaseMappingTransform { return utf8proc_toupper(codepoint); } else if (utf8proc_isupper(codepoint)) { return utf8proc_tolower(codepoint); - } else { - return codepoint; } } + + return codepoint; } }; From 191d09d2fbe2f0c73214d20bc0a54035bff83069 Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 3 Aug 2021 14:44:23 -0500 Subject: [PATCH 7/9] Reusing helper functions for isLower and isUpper unicode --- .../arrow/compute/kernels/scalar_string.cc | 218 +++++++++--------- 1 file changed, 107 insertions(+), 111 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index f7963da7cab13..5359567fc125d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -158,6 +158,109 @@ std::vector lut_swapcase_codepoint; std::vector lut_category; std::once_flag flag_case_luts; +// IsAlpha/Digit etc + +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) { + utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup + ? 
lut_category[codepoint] + : utf8proc_category(codepoint); + uint32_t general_category_bit = 1 << general_category; + // for e.g. undefined (but valid) codepoints, general_category == 0 == + // UTF8PROC_CATEGORY_CN + return (general_category != UTF8PROC_CATEGORY_CN) && + ((general_category_bit & mask) != 0); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, static_cast(1u << category), + categories...); +} + +static inline bool IsCasedCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) || + ((static_cast(utf8proc_toupper(codepoint)) != codepoint) || + (static_cast(utf8proc_tolower(codepoint)) != codepoint)); +} + +static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) { + // although this trick seems to work for upper case, this is not enough for lower case + // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . 
But currently the + // best we can do + return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) || + ((static_cast(utf8proc_toupper(codepoint)) != codepoint) && + (static_cast(utf8proc_tolower(codepoint)) == codepoint))) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); +} + +static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) { + // this seems to be a good workaround for utf8proc not having case information + // https://github.com/JuliaStrings/utf8proc/issues/195 + return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) || + ((static_cast(utf8proc_toupper(codepoint)) == codepoint) && + (static_cast(utf8proc_tolower(codepoint)) != codepoint))) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); +} + +static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory( + codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO); +} + +static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { + // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. + // utf8proc has no support for this, this is the best we can do: + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { + // Formally this is not correct, but utf8proc does not allow us to query for Numerical + // properties, e.g. 
Numeric_Value and Numeric_Type + // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or + // Numeric_Type=Numeric. + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) { + auto property = utf8proc_get_property(codepoint); + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) || + property->bidi_class == UTF8PROC_BIDI_CLASS_WS || + property->bidi_class == UTF8PROC_BIDI_CLASS_B || + property->bidi_class == UTF8PROC_BIDI_CLASS_S; +} + +static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { + uint32_t general_category = utf8proc_category(codepoint); + return (general_category != UTF8PROC_CATEGORY_CN) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC, + UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, + UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS, + UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP); +} + void EnsureLookupTablesFilled() { std::call_once(flag_case_luts, []() { lut_upper_codepoint.reserve(kMaxCodepointLookup + 1); @@ -168,9 +271,9 @@ void EnsureLookupTablesFilled() { lut_lower_codepoint.push_back(utf8proc_tolower(i)); lut_category.push_back(utf8proc_category(i)); - if (utf8proc_islower(i)) { + if (IsLowerCaseCharacterUnicode(i)) { lut_swapcase_codepoint.push_back(utf8proc_toupper(i)); - } else if (utf8proc_isupper(i)) { + } else if (IsUpperCaseCharacterUnicode(i)) { lut_swapcase_codepoint.push_back(utf8proc_tolower(i)); } else { lut_swapcase_codepoint.push_back(i); @@ -378,9 +481,9 @@ struct UTF8SwapCaseTransform : public CaseMappingTransform { if (codepoint <= kMaxCodepointLookup) { return lut_swapcase_codepoint[codepoint]; } else { - if (utf8proc_islower(codepoint)) { + if (IsLowerCaseCharacterUnicode(codepoint)) { return utf8proc_toupper(codepoint); - } else if (utf8proc_isupper(codepoint)) { + } else if (IsUpperCaseCharacterUnicode(codepoint)) { return 
utf8proc_tolower(codepoint); } } @@ -1437,113 +1540,6 @@ void AddSlice(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } -// IsAlpha/Digit etc - -#ifdef ARROW_WITH_UTF8PROC - -static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) { - utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup - ? lut_category[codepoint] - : utf8proc_category(codepoint); - uint32_t general_category_bit = 1 << general_category; - // for e.g. undefined (but valid) codepoints, general_category == 0 == - // UTF8PROC_CATEGORY_CN - return (general_category != UTF8PROC_CATEGORY_CN) && - ((general_category_bit & mask) != 0); -} - -template -static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask, - utf8proc_category_t category, - Categories... categories) { - return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...); -} - -template -static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, - utf8proc_category_t category, - Categories... categories) { - return HasAnyUnicodeGeneralCategory(codepoint, static_cast(1u << category), - categories...); -} - -static inline bool IsCasedCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, - UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) || - ((static_cast(utf8proc_toupper(codepoint)) != codepoint) || - (static_cast(utf8proc_tolower(codepoint)) != codepoint)); -} - -static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) { - // although this trick seems to work for upper case, this is not enough for lower case - // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . 
But currently the - // best we can do - return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) || - ((static_cast(utf8proc_toupper(codepoint)) != codepoint) && - (static_cast(utf8proc_tolower(codepoint)) == codepoint))) && - !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); -} - -static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) { - // this seems to be a good workaround for utf8proc not having case information - // https://github.com/JuliaStrings/utf8proc/issues/195 - return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) || - ((static_cast(utf8proc_toupper(codepoint)) == codepoint) && - (static_cast(utf8proc_tolower(codepoint)) != codepoint))) && - !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); -} - -static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory( - codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, - UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND, - UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); -} - -static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, - UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, - UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO); -} - -static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); -} - -static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { - // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. - // utf8proc has no support for this, this is the best we can do: - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); -} - -static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { - // Formally this is not correct, but utf8proc does not allow us to query for Numerical - // properties, e.g. 
Numeric_Value and Numeric_Type - // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or - // Numeric_Type=Numeric. - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, - UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); -} - -static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) { - auto property = utf8proc_get_property(codepoint); - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) || - property->bidi_class == UTF8PROC_BIDI_CLASS_WS || - property->bidi_class == UTF8PROC_BIDI_CLASS_B || - property->bidi_class == UTF8PROC_BIDI_CLASS_S; -} - -static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { - uint32_t general_category = utf8proc_category(codepoint); - return (general_category != UTF8PROC_CATEGORY_CN) && - !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC, - UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, - UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS, - UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP); -} - -#endif - template struct CharacterPredicateUnicode { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, From 66a364a36f6f5d7229a67a5d7b371a962bfb3d23 Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 3 Aug 2021 15:56:42 -0500 Subject: [PATCH 8/9] Macro for check utf8proc version --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 3aa6f5368d2f5..92e39d8b166b2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -501,6 +501,9 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { CallFunction("utf8_lower", {invalid_input})); } +// Older versions of utf8proc fail +#if !(UTF8PROC_VERSION_MAJOR <= 2 && UTF8PROC_VERSION_MINOR < 5) + TYPED_TEST(TestStringKernels, Utf8SwapCase) { this->CheckUnary("utf8_swapcase", 
"[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), "[\"AaAZzÆæ&\", null, \"\", \"B\"]"); @@ -521,6 +524,8 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { CallFunction("utf8_swapcase", {invalid_input})); } +#endif // UTF8PROC_VERSION_MINOR >= 5 + TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO From ae28e31b3a3fba1ae186dbb04bd5c780f9262cd6 Mon Sep 17 00:00:00 2001 From: christian Date: Tue, 3 Aug 2021 19:43:06 -0500 Subject: [PATCH 9/9] Remove last check for swapcase as not needed --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 92e39d8b166b2..3aa6f5368d2f5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -501,9 +501,6 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { CallFunction("utf8_lower", {invalid_input})); } -// Older versions of utf8proc fail -#if !(UTF8PROC_VERSION_MAJOR <= 2 && UTF8PROC_VERSION_MINOR < 5) - TYPED_TEST(TestStringKernels, Utf8SwapCase) { this->CheckUnary("utf8_swapcase", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), "[\"AaAZzÆæ&\", null, \"\", \"B\"]"); @@ -524,8 +521,6 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { CallFunction("utf8_swapcase", {invalid_input})); } -#endif // UTF8PROC_VERSION_MINOR >= 5 - TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO