Skip to content

Commit

Permalink
Remove string encoding enum (#183)
Browse files Browse the repository at this point in the history
Summary:
With the recent ascii compute changes (See: #32), the StringEncoding enum is redundant and no longer required.

Pull Request resolved: #183

Reviewed By: mbasmanova

Differential Revision: D30847517

Pulled By: kgpai

fbshipit-source-id: 7c850f2536d17af179d343138b154ba934fb9c28
  • Loading branch information
kgpai authored and facebook-github-bot committed Sep 10, 2021
1 parent 35ea790 commit 0243740
Show file tree
Hide file tree
Showing 12 changed files with 127 additions and 350 deletions.
3 changes: 0 additions & 3 deletions velox/expression/ControlExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@

namespace facebook::velox::exec {

using functions::stringCore::maxEncoding;
using functions::stringCore::StringEncodingMode;

void ConstantExpr::evalSpecialForm(
const SelectivityVector& rows,
EvalCtx* context,
Expand Down
3 changes: 0 additions & 3 deletions velox/expression/Expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ DEFINE_bool(

namespace facebook::velox::exec {

using functions::stringCore::maxEncoding;
using functions::stringCore::StringEncodingMode;

namespace {

bool isMember(
Expand Down
13 changes: 5 additions & 8 deletions velox/functions/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,15 @@ Note the symbol udf_round is automatically generated from the symbol round in th
For functions dealing with string inputs and outputs, please use the mechanisms provided for optimizing for ASCII and
Unicode cases. For example here's how upper function is implemented `functions/lib/string/StringImpl.h`

By checking `isAscii<stringEncoding>` we know the string is ascii and so run the extremely simpler version of the
function compared to unicode (utf8) implementation using `utf8proc` library.
By passing asciiness via the template variable we know the string is ascii and so run the extremely simpler version of the
function compared to unicode (utf8) implementation using `utf8proc` library. Note that the template variabe `ascii` having
value `true` indicates that the vector has no unicode character in it.

```c++
/// Perform upper for a UTF8 string
template <
StringEncodingMode stringEncoding,
typename TOutString,
typename TInString>
template <bool ascii, typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) {
bool isAsciiInput = isAscii<stringEncoding>(input.data(), input.size());
if (isAsciiInput) {
if constexpr (ascii) {
output.resize(input.size());
upperAscii(output.data(), input.data(), input.size());
} else {
Expand Down
37 changes: 13 additions & 24 deletions velox/functions/lib/StringEncodingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,38 +23,27 @@ namespace facebook::velox::functions {
using namespace stringCore;

/// Return the string encoding of a vector, if not set UTF8 is returned
static StringEncodingMode getStringEncodingOrUTF8(
BaseVector* vector,
const SelectivityVector& rows) {
static bool isAscii(BaseVector* vector, const SelectivityVector& rows) {
if (auto simpleVector = vector->template as<SimpleVector<StringView>>()) {
auto ascii = simpleVector->isAscii(rows);
return ascii && ascii.value() ? StringEncodingMode::ASCII
: StringEncodingMode::UTF8;
return ascii.has_value() && ascii.value();
}
VELOX_UNREACHABLE();
return StringEncodingMode::UTF8;
return false;
}

/// Wrap an input function with the appropriate string encoding instantiation.
/// Func is a struct templated on StringEncodingMode with a static function
/// apply. The wrapper will call Func::apply<EncodingMode> with the correct
/// StringEncodingMode instantiation.
template <template <StringEncodingMode> typename Func>
/// Wrap an input function with the appropriate ascii instantiation.
/// Func is a struct templated on boolean with a static function
/// apply. The wrapper will call Func::apply<bool> based on truth value of the
/// boolean.
template <template <bool> typename Func>
struct StringEncodingTemplateWrapper {
template <typename... Params>
static void apply(const StringEncodingMode mode, Params... args) {
switch (mode) {
case StringEncodingMode::UTF8:
Func<StringEncodingMode::UTF8>::apply(args...);
return;

case StringEncodingMode::ASCII:
Func<StringEncodingMode::ASCII>::apply(args...);
return;

case StringEncodingMode::MOSTLY_ASCII:
Func<StringEncodingMode::MOSTLY_ASCII>::apply(args...);
return;
static void apply(const bool isAscii, Params... args) {
if (isAscii) {
Func<true>::apply(args...);
} else {
Func<false>::apply(args...);
}
}
};
Expand Down
46 changes: 3 additions & 43 deletions velox/functions/lib/string/StringCore.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,36 +43,10 @@
namespace facebook::velox::functions {
namespace stringCore {

/// Encoding of input string
enum class StringEncodingMode {
UTF8 = 0,
ASCII = 1,
MOSTLY_ASCII = 2,
};

// Define ordering on string encodings UTF8 > MOSTLY_ASCII > ASCII
FOLLY_ALWAYS_INLINE StringEncodingMode
maxEncoding(const StringEncodingMode& lhs, const StringEncodingMode& rhs) {
if (lhs == StringEncodingMode::UTF8 || rhs == StringEncodingMode::UTF8) {
return StringEncodingMode::UTF8;
}

if (lhs == StringEncodingMode::MOSTLY_ASCII ||
rhs == StringEncodingMode::MOSTLY_ASCII) {
return StringEncodingMode::MOSTLY_ASCII;
}

return StringEncodingMode::ASCII;
};

/// Check if a given string is ascii
template <StringEncodingMode mode = StringEncodingMode::MOSTLY_ASCII>
static bool isAscii(const char* str, size_t length);

template <>
FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::MOSTLY_ASCII>(
const char* str,
size_t length) {
FOLLY_ALWAYS_INLINE bool isAscii(const char* str, size_t length) {
for (auto i = 0; i < length; i++) {
if (str[i] & 0x80) {
return false;
Expand All @@ -81,20 +55,6 @@ FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::MOSTLY_ASCII>(
return true;
}

template <>
FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::UTF8>(
const char* str,
size_t length) {
return false;
}

template <>
FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::ASCII>(
const char* str,
size_t length) {
return true;
}

/// Perform upper for ascii string input
FOLLY_ALWAYS_INLINE static void
upperAscii(char* output, const char* input, size_t length) {
Expand Down Expand Up @@ -369,14 +329,14 @@ inline static size_t replace(
/// Given a utf8 string, a starting position and length returns the
/// corresponding underlying byte range [startByteIndex, endByteIndex).
/// Byte indicies starts from 0, UTF8 character positions starts from 1.
template <StringEncodingMode mode>
template <bool isAscii>
static inline std::pair<size_t, size_t>
getByteRange(const char* str, size_t startCharPosition, size_t length) {
if (startCharPosition < 1 && length > 0) {
throw std::invalid_argument(
"start position must be >= 1 and length must be > 0");
}
if constexpr (mode == StringEncodingMode::ASCII) {
if constexpr (isAscii) {
return std::make_pair(
startCharPosition - 1, startCharPosition + length - 1);
} else {
Expand Down
26 changes: 9 additions & 17 deletions velox/functions/lib/string/StringImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,9 @@ namespace stringImpl {
using namespace stringCore;

/// Perform upper for a UTF8 string
template <
StringEncodingMode stringEncoding,
typename TOutString,
typename TInString>
template <bool ascii, typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) {
bool isAsciiInput = isAscii<stringEncoding>(input.data(), input.size());
if (isAsciiInput) {
if constexpr (ascii) {
output.resize(input.size());
upperAscii(output.data(), input.data(), input.size());
} else {
Expand All @@ -63,13 +59,9 @@ FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) {
}

/// Perform lower for a UTF8 string
template <
StringEncodingMode stringEncoding,
typename TOutString,
typename TInString>
template <bool ascii, typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE bool lower(TOutString& output, const TInString& input) {
bool isAsciiInput = isAscii<stringEncoding>(input.data(), input.size());
if (isAsciiInput) {
if constexpr (ascii) {
output.resize(input.size());
lowerAscii(output.data(), input.data(), input.size());
} else {
Expand Down Expand Up @@ -133,9 +125,9 @@ void concatDynamic(TOutString& output, const std::vector<TInString>& inputs) {
}

/// Return length of the input string in chars
template <StringEncodingMode stringEncoding, typename T>
template <bool isAscii, typename T>
FOLLY_ALWAYS_INLINE int64_t length(const T& input) {
if constexpr (stringEncoding == StringEncodingMode::ASCII) {
if constexpr (isAscii) {
return input.size();
} else {
return lengthUnicode(input.data(), input.size());
Expand Down Expand Up @@ -165,7 +157,7 @@ FOLLY_ALWAYS_INLINE void codePointToString(
/// string. Implements the logic of presto codepoint function.
template <typename T>
FOLLY_ALWAYS_INLINE int32_t charToCodePoint(const T& inputString) {
auto length = stringImpl::length<StringEncodingMode::UTF8>(inputString);
auto length = stringImpl::length</*isAscii*/ false>(inputString);
VELOX_USER_CHECK_EQ(
length,
1,
Expand All @@ -180,7 +172,7 @@ FOLLY_ALWAYS_INLINE int32_t charToCodePoint(const T& inputString) {
/// Returns the starting position in characters of the Nth instance of the
/// substring in string. Positions start with 1. If not found, 0 is returned. If
/// subString is empty result is 1.
template <StringEncodingMode stringEncoding, typename T>
template <bool isAscii, typename T>
FOLLY_ALWAYS_INLINE int64_t
stringPosition(const T& string, const T& subString, int64_t instance = 0) {
if (subString.size() == 0) {
Expand All @@ -200,7 +192,7 @@ stringPosition(const T& string, const T& subString, int64_t instance = 0) {

// Return the number of characters from the beginning of the string to
// byteIndex.
return length<stringEncoding>(std::string_view(string.data(), byteIndex)) + 1;
return length<isAscii>(std::string_view(string.data(), byteIndex)) + 1;
}

/// Replace replaced with replacement in inputString and write results to
Expand Down
46 changes: 20 additions & 26 deletions velox/functions/lib/string/tests/StringImplTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ TEST_F(StringImplTest, upperAscii) {
auto& expectedUpper = std::get<1>(testCase);

std::string upperOutput;
upper<StringEncodingMode::ASCII>(upperOutput, input);
upper</*ascii*/ true>(upperOutput, input);
ASSERT_EQ(upperOutput, expectedUpper);

upperOutput.clear();
upper<StringEncodingMode::MOSTLY_ASCII>(upperOutput, input);
upper</*ascii*/ false>(upperOutput, input);
ASSERT_EQ(upperOutput, expectedUpper);
}
}
Expand All @@ -84,11 +84,11 @@ TEST_F(StringImplTest, lowerAscii) {
auto& expectedLower = std::get<1>(testCase);

std::string lowerOutput;
lower<StringEncodingMode::ASCII>(lowerOutput, input);
lower</*ascii*/ true>(lowerOutput, input);
ASSERT_EQ(lowerOutput, expectedLower);

lowerOutput.clear();
lower<StringEncodingMode::MOSTLY_ASCII>(lowerOutput, input);
lower</*ascii*/ false>(lowerOutput, input);
ASSERT_EQ(lowerOutput, expectedLower);
}
}
Expand All @@ -99,11 +99,11 @@ TEST_F(StringImplTest, upperUnicode) {
auto& expectedUpper = std::get<1>(testCase);

std::string upperOutput;
upper<StringEncodingMode::UTF8>(upperOutput, input);
upper</*ascii*/ false>(upperOutput, input);
ASSERT_EQ(upperOutput, expectedUpper);

upperOutput.clear();
upper<StringEncodingMode::MOSTLY_ASCII>(upperOutput, input);
upper</*ascii*/ false>(upperOutput, input);
ASSERT_EQ(upperOutput, expectedUpper);
}
}
Expand All @@ -114,11 +114,11 @@ TEST_F(StringImplTest, lowerUnicode) {
auto& expectedLower = std::get<1>(testCase);

std::string lowerOutput;
lower<StringEncodingMode::UTF8>(lowerOutput, input);
lower</*ascii*/ false>(lowerOutput, input);
ASSERT_EQ(lowerOutput, expectedLower);

lowerOutput.clear();
lower<StringEncodingMode::MOSTLY_ASCII>(lowerOutput, input);
lower</*ascii*/ false>(lowerOutput, input);
ASSERT_EQ(lowerOutput, expectedLower);
}
}
Expand Down Expand Up @@ -183,24 +183,20 @@ TEST_F(StringImplTest, length) {
for (const auto& test : getUpperAsciiTestData()) {
auto& inputString = std::get<0>(test);

ASSERT_EQ(
length<StringEncodingMode::ASCII>(inputString), inputString.size());
ASSERT_EQ(
length<StringEncodingMode::UTF8>(inputString), inputString.size());
ASSERT_EQ(
length<StringEncodingMode::MOSTLY_ASCII>(inputString),
inputString.size());
ASSERT_EQ(length</*isAscii*/ true>(inputString), inputString.size());
ASSERT_EQ(length</*isAscii*/ false>(inputString), inputString.size());
ASSERT_EQ(length</*isAscii*/ false>(inputString), inputString.size());
}

// Test unicode inputs
for (auto& test : getLowerUnicodeTestData()) {
auto& inputString = std::get<0>(test);

ASSERT_EQ(
length<StringEncodingMode::UTF8>(inputString),
length</*isAscii*/ false>(inputString),
lengthUtf8Ref(inputString.data(), inputString.size()));
ASSERT_EQ(
length<StringEncodingMode::MOSTLY_ASCII>(inputString),
length</*isAscii*/ false>(inputString),
lengthUtf8Ref(inputString.data(), inputString.size()));
}
}
Expand Down Expand Up @@ -263,11 +259,11 @@ TEST_F(StringImplTest, stringPosition) {
const int64_t instance,
const int64_t expectedPosition) {
ASSERT_EQ(
stringPosition<StringEncodingMode::ASCII>(
stringPosition</*isAscii*/ true>(
StringView(string), StringView(substr), instance),
expectedPosition);
ASSERT_EQ(
stringPosition<StringEncodingMode::MOSTLY_ASCII>(
stringPosition</*isAscii*/ false>(
StringView(string), StringView(substr), instance),
expectedPosition);
};
Expand All @@ -277,11 +273,11 @@ TEST_F(StringImplTest, stringPosition) {
const int64_t instance,
const int64_t expectedPosition) {
ASSERT_EQ(
stringPosition<StringEncodingMode::UTF8>(
stringPosition</*isAscii*/ false>(
StringView(string), StringView(substr), instance),
expectedPosition);
ASSERT_EQ(
stringPosition<StringEncodingMode::MOSTLY_ASCII>(
stringPosition</*isAscii*/ false>(
StringView(string), StringView(substr), instance),
expectedPosition);
};
Expand All @@ -307,7 +303,7 @@ TEST_F(StringImplTest, stringPosition) {
testValidInputUnicode("abc/xyz/foo/bar", "/", 4, 0L);

EXPECT_THROW(
stringPosition<StringEncodingMode::MOSTLY_ASCII>(
stringPosition</*isAscii*/ false>(
StringView("foobar"), StringView("foobar"), 0),
VeloxUserError);
}
Expand Down Expand Up @@ -396,14 +392,12 @@ TEST_F(StringImplTest, getByteRange) {
auto expectedEndByteIndex = strlen(unicodeString);

// Find the byte range of unicodeString[i, end]
auto range =
getByteRange<StringEncodingMode::UTF8>(unicodeString, i, 6 - i + 1);
auto range = getByteRange</*isAscii*/ false>(unicodeString, i, 6 - i + 1);

EXPECT_EQ(expectedStartByteIndex, range.first);
EXPECT_EQ(expectedEndByteIndex, range.second);

range = getByteRange<StringEncodingMode::MOSTLY_ASCII>(
unicodeString, i, 6 - i + 1);
range = getByteRange</*isAscii*/ false>(unicodeString, i, 6 - i + 1);

EXPECT_EQ(expectedStartByteIndex, range.first);
EXPECT_EQ(expectedEndByteIndex, range.second);
Expand Down
Loading

0 comments on commit 0243740

Please sign in to comment.