From 12f9a4da884ade3562c53bc2b91256ded94bd445 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 4 Aug 2021 17:05:23 +0200 Subject: [PATCH] ARROW-12944: [C++] String capitalize kernel This PR adds scalar compute functions for string capitalization, namely "ascii_capitalize" and "utf8_capitalize". Closes #10857 from edponce/ARROW-12944-String-capitalize-kernel Authored-by: Eduardo Ponce Signed-off-by: Antoine Pitrou --- .../arrow/compute/kernels/scalar_string.cc | 82 ++++++++++++++++--- .../compute/kernels/scalar_string_test.cc | 22 ++++- docs/source/cpp/compute.rst | 4 + docs/source/python/api/compute.rst | 2 + 4 files changed, 99 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 5359567fc125d..8d8152744794e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -446,10 +446,10 @@ struct StringTransformCodepoint : public StringTransformBase { // struct CaseMappingMixin { struct CaseMappingTransform { static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) { - // Section 5.18 of the Unicode spec claim that the number of codepoints for case + // Section 5.18 of the Unicode spec claims that the number of codepoints for case // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes // However, since we don't support all casings (SpecialCasing.txt) the growth - // in bytes iss actually only at max 3/2 (as covered by the unittest). + // in bytes is actually only at max 3/2 (as covered by the unittest). // Note that rounding down the 3/2 is ok, since only codepoints encoded by // two code units (even) can grow to 3 code units. 
return static_cast(input_ncodeunits) * 3 / 2; @@ -496,6 +496,37 @@ template using UTF8SwapCase = StringTransformExec>; +struct Utf8CapitalizeTransform : public StringTransformBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + uint8_t* output_start = output; + if (input_string_ncodeunits > 0) { + // Get number of code units in first code point + uint32_t codepoint = 0; + const uint8_t* i = input; + if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&i, &codepoint))) { + return kTransformError; + } + int64_t codepoint_ncodeunits = + std::min(static_cast(i - input), input_string_ncodeunits); + if (ARROW_PREDICT_FALSE( + !util::UTF8Transform(input, input + codepoint_ncodeunits, &output, + UTF8UpperTransform::TransformCodepoint))) { + return kTransformError; + } + if (ARROW_PREDICT_FALSE(!util::UTF8Transform( + input + codepoint_ncodeunits, input + input_string_ncodeunits, &output, + UTF8LowerTransform::TransformCodepoint))) { + return kTransformError; + } + } + return output - output_start; + } +}; + +template +using Utf8Capitalize = StringTransformExec; + #endif // ARROW_WITH_UTF8PROC struct AsciiReverseTransform : public StringTransformBase { @@ -632,6 +663,20 @@ struct AsciiSwapCase { } }; +struct AsciiCapitalizeTransform : public StringTransformBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + if (input_string_ncodeunits > 0) { + *output = ascii_toupper(*input); + TransformAsciiLower(input + 1, input_string_ncodeunits - 1, output + 1); + } + return input_string_ncodeunits; + } +}; + +template +using AsciiCapitalize = StringTransformExec; + // ---------------------------------------------------------------------- // exact pattern detection @@ -4074,6 +4119,20 @@ const FunctionDoc ascii_swapcase_doc( "non-ASCII characters, use \"utf8_swapcase\" instead."), {"strings"}); +const FunctionDoc ascii_capitalize_doc( + "Capitalize the first character of ASCII input", + ("For 
each string in `strings`, return a capitalized version.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_capitalize\" instead."), + {"strings"}); + +const FunctionDoc ascii_reverse_doc( + "Reverse ASCII input", + ("For each ASCII string in `strings`, return a reversed version.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_reverse\" instead."), + {"strings"}); + const FunctionDoc utf8_upper_doc( "Transform input to uppercase", ("For each string in `strings`, return an uppercase version."), {"strings"}); @@ -4087,17 +4146,16 @@ const FunctionDoc utf8_swapcase_doc( "lowercase", ("For each string in `strings`, return an opposite case version."), {"strings"}); -const FunctionDoc ascii_reverse_doc( - "Reverse ASCII input", - ("For each ASCII string in `strings`, return a reversed version.\n\n" - "This function assumes the input is fully ASCII. If it may contain\n" - "non-ASCII characters, use \"utf8_reverse\" instead."), +const FunctionDoc utf8_capitalize_doc( + "Capitalize the first character of input", + ("For each string in `strings`, return a capitalized version,\n" + "with the first character uppercased and the others lowercased."), {"strings"}); const FunctionDoc utf8_reverse_doc( - "Reverse utf8 input", - ("For each utf8 string in `strings`, return a reversed version.\n\n" - "This function operates on codepoints/UTF-8 code units, not grapheme\n" + "Reverse input", + ("For each string in `strings`, return a reversed version.\n\n" + "This function operates on Unicode codepoints, not grapheme\n" "clusters. 
Hence, it will not correctly reverse grapheme clusters\n" "composed of multiple codepoints."), {"strings"}); @@ -4113,6 +4171,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel( "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE); + MakeUnaryStringBatchKernel("ascii_capitalize", registry, + &ascii_capitalize_doc); MakeUnaryStringBatchKernel("ascii_trim_whitespace", registry, &ascii_trim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_ltrim_whitespace", registry, @@ -4158,6 +4218,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { MakeUnaryStringUTF8TransformKernel("utf8_lower", registry, &utf8_lower_doc); MakeUnaryStringUTF8TransformKernel("utf8_swapcase", registry, &utf8_swapcase_doc); + MakeUnaryStringBatchKernel("utf8_capitalize", registry, + &utf8_capitalize_doc); MakeUnaryStringBatchKernel("utf8_trim_whitespace", registry, &utf8_trim_whitespace_doc); MakeUnaryStringBatchKernel("utf8_ltrim_whitespace", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 3aa6f5368d2f5..920197ca3c337 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -403,6 +403,16 @@ TYPED_TEST(TestStringKernels, AsciiSwapCase) { "[\"HeLLo, wOrLD!\", \"$. a35?\"]"); } +TYPED_TEST(TestStringKernels, AsciiCapitalize) { + this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]"); + this->CheckUnary("ascii_capitalize", + "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", " + "\"!hELlo, wORLd!\"]", + this->type(), + "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, world!\", \"$. 
a3\", " + "\"!hello, world!\"]"); +} + TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), @@ -462,7 +472,7 @@ TYPED_TEST(TestStringKernels, Utf8Upper) { this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), "[\"AAAZZÆÆ&\", null, \"\", \"B\"]"); - // test varying encoding lenghts and thus changing indices/offsets + // test varying encoding lengths and thus changing indices/offsets this->CheckUnary("utf8_upper", "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]", this->type(), "[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]"); @@ -521,6 +531,16 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { CallFunction("utf8_swapcase", {invalid_input})); } +TYPED_TEST(TestStringKernels, Utf8Capitalize) { + this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]"); + this->CheckUnary("utf8_capitalize", + "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", " + "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]", + this->type(), + "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", " + "\"Hello, world!\", \"$. 
a3\", \"!ɑɽɽow\"]"); +} + TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 01dc1d92e17ad..b12d0f2efde07 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -587,6 +587,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +=========================+=======+========================+========================+===================================+=======+ +| ascii_capitalize | Unary | String-like | String-like | | | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_lower | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_reverse | Unary | String-like | String-like | | \(2) | @@ -603,6 +605,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| utf8_capitalize | Unary | String-like | String-like | | | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_length | Unary | String-like | Int32 or Int64 | | \(7) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_lower | Unary | 
String-like | String-like | | \(8) | diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index c503cba319ca1..b3ab086899ae9 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -250,6 +250,7 @@ String Transforms .. autosummary:: :toctree: ../generated/ + ascii_capitalize ascii_center ascii_lpad ascii_ltrim @@ -266,6 +267,7 @@ String Transforms binary_replace_slice replace_substring replace_substring_regex + utf8_capitalize utf8_center utf8_length utf8_lower