Skip to content

Commit

Permalink
ARROW-12944: [C++] String capitalize kernel
Browse files Browse the repository at this point in the history
This PR adds scalar compute functions for string capitalization, namely "ascii_capitalize" and "utf8_capitalize".

Closes apache#10857 from edponce/ARROW-12944-String-capitalize-kernel

Authored-by: Eduardo Ponce <edponce00@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
edponce authored and michalursa committed Aug 17, 2021
1 parent d484e9b commit 12f9a4d
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 11 deletions.
82 changes: 72 additions & 10 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -446,10 +446,10 @@ struct StringTransformCodepoint : public StringTransformBase {
// struct CaseMappingMixin {
struct CaseMappingTransform {
static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
// Section 5.18 of the Unicode spec claim that the number of codepoints for case
// Section 5.18 of the Unicode spec claims that the number of codepoints for case
// mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
// However, since we don't support all casings (SpecialCasing.txt) the growth
// in bytes iss actually only at max 3/2 (as covered by the unittest).
// in bytes is actually only at max 3/2 (as covered by the unittest).
// Note that rounding down the 3/2 is ok, since only codepoints encoded by
// two code units (even) can grow to 3 code units.
return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
Expand Down Expand Up @@ -496,6 +496,37 @@ template <typename Type>
using UTF8SwapCase =
StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;

struct Utf8CapitalizeTransform : public StringTransformBase {
int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
uint8_t* output) {
uint8_t* output_start = output;
if (input_string_ncodeunits > 0) {
// Get number of code units in first code point
uint32_t codepoint = 0;
const uint8_t* i = input;
if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&i, &codepoint))) {
return kTransformError;
}
int64_t codepoint_ncodeunits =
std::min(static_cast<int64_t>(i - input), input_string_ncodeunits);
if (ARROW_PREDICT_FALSE(
!util::UTF8Transform(input, input + codepoint_ncodeunits, &output,
UTF8UpperTransform::TransformCodepoint))) {
return kTransformError;
}
if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
input + codepoint_ncodeunits, input + input_string_ncodeunits, &output,
UTF8LowerTransform::TransformCodepoint))) {
return kTransformError;
}
}
return output - output_start;
}
};

// Kernel exec registered as "utf8_capitalize": StringTransformExec instantiated
// with Utf8CapitalizeTransform for the string type `Type`.
template <typename Type>
using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;

#endif // ARROW_WITH_UTF8PROC

struct AsciiReverseTransform : public StringTransformBase {
Expand Down Expand Up @@ -632,6 +663,20 @@ struct AsciiSwapCase {
}
};

// String transform behind "ascii_capitalize": uppercases the first byte and
// lowercases every remaining byte of the input.
struct AsciiCapitalizeTransform : public StringTransformBase {
  // Writes the capitalized form of `input` to `output` and returns the number
  // of code units produced, which always equals `input_string_ncodeunits`.
  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output) {
    // Nothing to write for an empty string.
    if (input_string_ncodeunits <= 0) {
      return input_string_ncodeunits;
    }
    // Leading byte goes to uppercase...
    output[0] = ascii_toupper(input[0]);
    // ...and the tail (possibly empty) goes to lowercase.
    const int64_t tail_ncodeunits = input_string_ncodeunits - 1;
    TransformAsciiLower(input + 1, tail_ncodeunits, output + 1);
    return input_string_ncodeunits;
  }
};

// Kernel exec registered as "ascii_capitalize": StringTransformExec instantiated
// with AsciiCapitalizeTransform for the string type `Type`.
template <typename Type>
using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;

// ----------------------------------------------------------------------
// exact pattern detection

Expand Down Expand Up @@ -4074,6 +4119,20 @@ const FunctionDoc ascii_swapcase_doc(
"non-ASCII characters, use \"utf8_swapcase\" instead."),
{"strings"});

// User-facing documentation for the "ascii_capitalize" compute function.
// Note: the kernel (AsciiCapitalizeTransform) uppercases the first character
// and lowercases all the others.
const FunctionDoc ascii_capitalize_doc(
"Capitalize the first character of ASCII input",
("For each string in `strings`, return a capitalized version.\n\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_capitalize\" instead."),
{"strings"});

// User-facing documentation for the ASCII string reversal function; points
// callers with non-ASCII data to "utf8_reverse".
const FunctionDoc ascii_reverse_doc(
"Reverse ASCII input",
("For each ASCII string in `strings`, return a reversed version.\n\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_reverse\" instead."),
{"strings"});

// User-facing documentation for the uppercase transform on (non-ASCII-only)
// string input.
const FunctionDoc utf8_upper_doc(
"Transform input to uppercase",
("For each string in `strings`, return an uppercase version."), {"strings"});
Expand All @@ -4087,17 +4146,16 @@ const FunctionDoc utf8_swapcase_doc(
"lowercase",
("For each string in `strings`, return an opposite case version."), {"strings"});

const FunctionDoc ascii_reverse_doc(
"Reverse ASCII input",
("For each ASCII string in `strings`, return a reversed version.\n\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_reverse\" instead."),
const FunctionDoc utf8_capitalize_doc(
"Capitalize the first character of input",
("For each string in `strings`, return a capitalized version,\n"
"with the first character uppercased and the others lowercased."),
{"strings"});

const FunctionDoc utf8_reverse_doc(
"Reverse utf8 input",
("For each utf8 string in `strings`, return a reversed version.\n\n"
"This function operates on codepoints/UTF-8 code units, not grapheme\n"
"Reverse input",
("For each string in `strings`, return a reversed version.\n\n"
"This function operates on Unicode codepoints, not grapheme\n"
"clusters. Hence, it will not correctly reverse grapheme clusters\n"
"composed of multiple codepoints."),
{"strings"});
Expand All @@ -4113,6 +4171,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MemAllocation::NO_PREALLOCATE);
MakeUnaryStringBatchKernel<AsciiSwapCase>(
"ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE);
MakeUnaryStringBatchKernel<AsciiCapitalize>("ascii_capitalize", registry,
&ascii_capitalize_doc);
MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
&ascii_trim_whitespace_doc);
MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
Expand Down Expand Up @@ -4158,6 +4218,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
MakeUnaryStringUTF8TransformKernel<UTF8SwapCase>("utf8_swapcase", registry,
&utf8_swapcase_doc);
MakeUnaryStringBatchKernel<Utf8Capitalize>("utf8_capitalize", registry,
&utf8_capitalize_doc);
MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
&utf8_trim_whitespace_doc);
MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
Expand Down
22 changes: 21 additions & 1 deletion cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,16 @@ TYPED_TEST(TestStringKernels, AsciiSwapCase) {
"[\"HeLLo, wOrLD!\", \"$. a35?\"]");
}

// Checks "ascii_capitalize" on an empty array and on a mix of ASCII strings,
// a null, an empty string, and a string containing non-ASCII characters
// (which the ASCII kernel is expected to leave untouched).
TYPED_TEST(TestStringKernels, AsciiCapitalize) {
  // Empty array in, empty array out.
  this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]");

  const char* const input_json =
      "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", "
      "\"!hELlo, wORLd!\"]";
  const char* const expected_json =
      "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, world!\", \"$. a3\", "
      "\"!hello, world!\"]";
  this->CheckUnary("ascii_capitalize", input_json, this->type(), expected_json);
}

TYPED_TEST(TestStringKernels, AsciiReverse) {
this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
Expand Down Expand Up @@ -462,7 +472,7 @@ TYPED_TEST(TestStringKernels, Utf8Upper) {
this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(),
"[\"AAAZZÆÆ&\", null, \"\", \"B\"]");

// test varying encoding lenghts and thus changing indices/offsets
// test varying encoding lengths and thus changing indices/offsets
this->CheckUnary("utf8_upper", "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]", this->type(),
"[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]");

Expand Down Expand Up @@ -521,6 +531,16 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) {
CallFunction("utf8_swapcase", {invalid_input}));
}

// Checks "utf8_capitalize" on an empty array and on strings mixing ASCII and
// multi-byte codepoints, including codepoints whose upper/lower forms have
// different encoded lengths.
TYPED_TEST(TestStringKernels, Utf8Capitalize) {
  // The empty-array case must exercise the UTF-8 kernel itself (this check
  // previously called "ascii_capitalize" by mistake).
  this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]");
  this->CheckUnary("utf8_capitalize",
                   "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
                   "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",
                   this->type(),
                   "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", "
                   "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]");
}

TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
// U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
// UTF8PROC_CATEGORY_LO
Expand Down
4 changes: 4 additions & 0 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,8 @@ String transforms
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+=========================+=======+========================+========================+===================================+=======+
| ascii_capitalize | Unary | String-like | String-like | | |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| ascii_lower | Unary | String-like | String-like | | \(1) |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| ascii_reverse | Unary | String-like | String-like | | \(2) |
Expand All @@ -603,6 +605,8 @@ String transforms
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| utf8_capitalize | Unary | String-like | String-like | | |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| utf8_length | Unary | String-like | Int32 or Int64 | | \(7) |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| utf8_lower | Unary | String-like | String-like | | \(8) |
Expand Down
2 changes: 2 additions & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ String Transforms
.. autosummary::
:toctree: ../generated/

ascii_capitalize
ascii_center
ascii_lpad
ascii_ltrim
Expand All @@ -266,6 +267,7 @@ String Transforms
binary_replace_slice
replace_substring
replace_substring_regex
utf8_capitalize
utf8_center
utf8_length
utf8_lower
Expand Down

0 comments on commit 12f9a4d

Please sign in to comment.