ARROW-12944: [C++] String capitalize kernel

This PR adds scalar compute functions for string capitalization, namely "ascii_capitalize" and "utf8_capitalize". Closes apache#10857 from edponce/ARROW-12944-String-capitalize-kernel Authored-by: Eduardo Ponce <edponce00@gmail.com> Signed-off-by: Antoine Pitrou <antoine@python.org>
michalursa · Aug 17, 2021 · 12f9a4d · 12f9a4d
1 parent d484e9b
commit 12f9a4d
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 11 deletions.
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -446,10 +446,10 @@ struct StringTransformCodepoint : public StringTransformBase {
 // struct CaseMappingMixin {
 struct CaseMappingTransform {
   static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
-    // Section 5.18 of the Unicode spec claim that the number of codepoints for case
+    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
     // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
     // However, since we don't support all casings (SpecialCasing.txt) the growth
-    // in bytes iss actually only at max 3/2 (as covered by the unittest).
+    // in bytes is actually only at max 3/2 (as covered by the unittest).
     // Note that rounding down the 3/2 is ok, since only codepoints encoded by
     // two code units (even) can grow to 3 code units.
     return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
@@ -496,6 +496,37 @@ template <typename Type>
 using UTF8SwapCase =
     StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;
 
+// Capitalization transform for UTF-8 strings: the first codepoint is mapped with
+// UTF8UpperTransform::TransformCodepoint and every following codepoint with
+// UTF8LowerTransform::TransformCodepoint.
+//
+// NOTE(review): the codepoint transforms may rely on case-mapping lookup tables
+// being initialized before the first call -- verify that this is guaranteed for
+// this kernel as it is for the other UTF-8 case kernels.
+struct Utf8CapitalizeTransform : public StringTransformBase {
+  // Upper bound on the total output size in code units (bytes).
+  //
+  // Case mapping can change the encoded length of a codepoint (a codepoint
+  // encoded by two code units can grow to three), so the output may be larger
+  // than the input.  Reuse the bound computed by CaseMappingTransform.
+  // NOTE(review): assumes StringTransformBase declares a virtual MaxCodeunits
+  // with this exact signature whose default does not allow for growth -- confirm.
+  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+    return CaseMappingTransform::MaxCodeunits(ninputs, input_ncodeunits);
+  }
+
+  // Writes the capitalized form of the `input_string_ncodeunits` code units at
+  // `input` into `output`.
+  //
+  // Returns the number of code units written, or kTransformError if decoding
+  // the first codepoint or transforming either part of the string fails.
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    uint8_t* output_start = output;
+    if (input_string_ncodeunits > 0) {
+      // Get number of code units in first code point
+      uint32_t codepoint = 0;
+      const uint8_t* i = input;
+      if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&i, &codepoint))) {
+        return kTransformError;
+      }
+      // Clamp so that the split point never lies past the end of the string.
+      int64_t codepoint_ncodeunits =
+          std::min(static_cast<int64_t>(i - input), input_string_ncodeunits);
+      // Uppercase the first codepoint...
+      if (ARROW_PREDICT_FALSE(
+              !util::UTF8Transform(input, input + codepoint_ncodeunits, &output,
+                                   UTF8UpperTransform::TransformCodepoint))) {
+        return kTransformError;
+      }
+      // ...and lowercase everything after it.
+      if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
+              input + codepoint_ncodeunits, input + input_string_ncodeunits, &output,
+              UTF8LowerTransform::TransformCodepoint))) {
+        return kTransformError;
+      }
+    }
+    return output - output_start;
+  }
+};
+
+// Kernel executor applying Utf8CapitalizeTransform to arrays of string type `Type`.
+template <typename Type>
+using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;
+
 #endif  // ARROW_WITH_UTF8PROC
 
 struct AsciiReverseTransform : public StringTransformBase {
@@ -632,6 +663,20 @@ struct AsciiSwapCase {
   }
 };
 
+// Capitalization transform for ASCII strings: uppercases the first byte and
+// lowercases all remaining bytes.  The output has the same length as the input.
+struct AsciiCapitalizeTransform : public StringTransformBase {
+  // Writes the capitalized form of the `input_string_ncodeunits` bytes at
+  // `input` into `output` and returns the number of bytes in the string.
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    // Nothing to write for an empty string.
+    if (!(input_string_ncodeunits > 0)) {
+      return input_string_ncodeunits;
+    }
+    output[0] = ascii_toupper(input[0]);
+    const int64_t tail_ncodeunits = input_string_ncodeunits - 1;
+    TransformAsciiLower(input + 1, tail_ncodeunits, output + 1);
+    return input_string_ncodeunits;
+  }
+};
+
+// Kernel executor applying AsciiCapitalizeTransform to arrays of string type `Type`.
+template <typename Type>
+using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;
+
 // ----------------------------------------------------------------------
 // exact pattern detection
 
@@ -4074,6 +4119,20 @@ const FunctionDoc ascii_swapcase_doc(
      "non-ASCII characters, use \"utf8_swapcase\" instead."),
     {"strings"});
 
+// Documentation for "ascii_capitalize".  The description spells out that the
+// remaining characters are lowercased, matching the wording of
+// "utf8_capitalize" and the behavior of the kernel.
+const FunctionDoc ascii_capitalize_doc(
+    "Capitalize the first character of ASCII input",
+    ("For each string in `strings`, return a capitalized version,\n"
+     "with the first character uppercased and the others lowercased.\n\n"
+     "This function assumes the input is fully ASCII.  If it may contain\n"
+     "non-ASCII characters, use \"utf8_capitalize\" instead."),
+    {"strings"});
+
+const FunctionDoc ascii_reverse_doc(  // doc strings for ASCII string reversal
+    "Reverse ASCII input",
+    ("For each ASCII string in `strings`, return a reversed version.\n\n"
+     "This function assumes the input is fully ASCII.  If it may contain\n"
+     "non-ASCII characters, use \"utf8_reverse\" instead."),
+    {"strings"});
+
 const FunctionDoc utf8_upper_doc(
     "Transform input to uppercase",
     ("For each string in `strings`, return an uppercase version."), {"strings"});
@@ -4087,17 +4146,16 @@ const FunctionDoc utf8_swapcase_doc(
     "lowercase",
     ("For each string in `strings`, return an opposite case version."), {"strings"});
 
-const FunctionDoc ascii_reverse_doc(
-    "Reverse ASCII input",
-    ("For each ASCII string in `strings`, return a reversed version.\n\n"
-     "This function assumes the input is fully ASCII.  If it may contain\n"
-     "non-ASCII characters, use \"utf8_reverse\" instead."),
+const FunctionDoc utf8_capitalize_doc(  // doc strings for string capitalization
+    "Capitalize the first character of input",
+    ("For each string in `strings`, return a capitalized version,\n"
+     "with the first character uppercased and the others lowercased."),
     {"strings"});
 
 const FunctionDoc utf8_reverse_doc(
-    "Reverse utf8 input",
-    ("For each utf8 string in `strings`, return a reversed version.\n\n"
-     "This function operates on codepoints/UTF-8 code units, not grapheme\n"
+    "Reverse input",
+    ("For each string in `strings`, return a reversed version.\n\n"
+     "This function operates on Unicode codepoints, not grapheme\n"
      "clusters. Hence, it will not correctly reverse grapheme clusters\n"
      "composed of multiple codepoints."),
     {"strings"});
@@ -4113,6 +4171,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
                                          MemAllocation::NO_PREALLOCATE);
   MakeUnaryStringBatchKernel<AsciiSwapCase>(
       "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE);
+  MakeUnaryStringBatchKernel<AsciiCapitalize>("ascii_capitalize", registry,
+                                              &ascii_capitalize_doc);
   MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
                                                   &ascii_trim_whitespace_doc);
   MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
@@ -4158,6 +4218,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
   MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
   MakeUnaryStringUTF8TransformKernel<UTF8SwapCase>("utf8_swapcase", registry,
                                                    &utf8_swapcase_doc);
+  MakeUnaryStringBatchKernel<Utf8Capitalize>("utf8_capitalize", registry,
+                                             &utf8_capitalize_doc);
   MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
                                                  &utf8_trim_whitespace_doc);
   MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -403,6 +403,16 @@ TYPED_TEST(TestStringKernels, AsciiSwapCase) {
                    "[\"HeLLo, wOrLD!\", \"$. a35?\"]");
 }
 
+// "ascii_capitalize": the first byte is uppercased, the remaining ASCII bytes are
+// lowercased, and non-ASCII bytes as well as nulls are passed through unchanged.
+TYPED_TEST(TestStringKernels, AsciiCapitalize) {
+  this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_capitalize",
+                   R"(["aAazZæÆ&", null, "", "bBB", "hEllO, WoRld!", "$. A3", )"
+                   R"("!hELlo, wORLd!"])",
+                   this->type(),
+                   R"(["AaazzæÆ&", null, "", "Bbb", "Hello, world!", "$. a3", )"
+                   R"("!hello, world!"])");
+}
+
 TYPED_TEST(TestStringKernels, AsciiReverse) {
   this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
   this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
@@ -462,7 +472,7 @@ TYPED_TEST(TestStringKernels, Utf8Upper) {
   this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(),
                    "[\"AAAZZÆÆ&\", null, \"\", \"B\"]");
 
-  // test varying encoding lenghts and thus changing indices/offsets
+  // test varying encoding lengths and thus changing indices/offsets
   this->CheckUnary("utf8_upper", "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]", this->type(),
                    "[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]");
 
@@ -521,6 +531,16 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) {
                                   CallFunction("utf8_swapcase", {invalid_input}));
 }
 
+// "utf8_capitalize": the first codepoint is uppercased and the others lowercased,
+// including codepoints whose encoded length changes with case.
+TYPED_TEST(TestStringKernels, Utf8Capitalize) {
+  // The empty-array check must exercise the UTF-8 kernel, not the ASCII one.
+  this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_capitalize",
+                   "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
+                   "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",
+                   this->type(),
+                   "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", "
+                   "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]");
+}
+
 TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
   // U+08BE (utf8: 	\xE0\xA2\xBE) is undefined, but utf8proc thinks it is
   // UTF8PROC_CATEGORY_LO

diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
@@ -587,6 +587,8 @@ String transforms
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | Function name           | Arity | Input types            | Output type            | Options class                     | Notes |
 +=========================+=======+========================+========================+===================================+=======+
+| ascii_capitalize        | Unary | String-like            | String-like            |                                   |       |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | ascii_lower             | Unary | String-like            | String-like            |                                   | \(1)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | ascii_reverse           | Unary | String-like            | String-like            |                                   | \(2)  |
@@ -603,6 +605,8 @@ String transforms
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | replace_substring_regex | Unary | String-like            | String-like            | :struct:`ReplaceSubstringOptions` | \(6)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
+| utf8_capitalize         | Unary | String-like            | String-like            |                                   |       |
++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | utf8_length             | Unary | String-like            | Int32 or Int64         |                                   | \(7)  |
 +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
 | utf8_lower              | Unary | String-like            | String-like            |                                   | \(8)  |

diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
@@ -250,6 +250,7 @@ String Transforms
 .. autosummary::
    :toctree: ../generated/
 
+   ascii_capitalize
    ascii_center
    ascii_lpad
    ascii_ltrim
@@ -266,6 +267,7 @@ String Transforms
    binary_replace_slice
    replace_substring
    replace_substring_regex
+   utf8_capitalize
    utf8_center
    utf8_length
    utf8_lower