Remove string encoding enum (#183)

Summary: With the recent ascii compute changes (See: #32), the StringEncoding enum is redundant and no longer required. Pull Request resolved: #183 Reviewed By: mbasmanova Differential Revision: D30847517 Pulled By: kgpai fbshipit-source-id: 7c850f2536d17af179d343138b154ba934fb9c28
facebookincubator · Sep 10, 2021 · 0243740 · 0243740
1 parent 35ea790
commit 0243740
Show file tree

Hide file tree

Showing 12 changed files with 127 additions and 350 deletions.
diff --git a/velox/expression/ControlExpr.cpp b/velox/expression/ControlExpr.cpp
@@ -21,9 +21,6 @@
 
 namespace facebook::velox::exec {
 
-using functions::stringCore::maxEncoding;
-using functions::stringCore::StringEncodingMode;
-
 void ConstantExpr::evalSpecialForm(
     const SelectivityVector& rows,
     EvalCtx* context,

diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp
@@ -29,9 +29,6 @@ DEFINE_bool(
 
 namespace facebook::velox::exec {
 
-using functions::stringCore::maxEncoding;
-using functions::stringCore::StringEncodingMode;
-
 namespace {
 
 bool isMember(

diff --git a/velox/functions/README.md b/velox/functions/README.md
@@ -41,18 +41,15 @@ Note the symbol udf_round is automatically generated from the symbol round in th
 For functions dealing with string inputs and outputs, please use the mechanisms provided for optimizing for ASCII and
 Unicode cases. For example here's how upper function is implemented `functions/lib/string/StringImpl.h`
 
-By checking `isAscii<stringEncoding>` we know the string is ascii and so run the extremely simpler version of the
-function compared to unicode (utf8) implementation using `utf8proc` library.
+By passing asciiness via the template variable we know the string is ascii and so run the extremely simpler version of the
+function compared to unicode (utf8) implementation using `utf8proc` library. Note that the template variabe `ascii` having
+value `true` indicates that the vector has no unicode character in it.
 
 ```c++
 /// Perform upper for a UTF8 string
-template <
-    StringEncodingMode stringEncoding,
-    typename TOutString,
-    typename TInString>
+template <bool ascii, typename TOutString, typename TInString>
 FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) {
-  bool isAsciiInput = isAscii<stringEncoding>(input.data(), input.size());
-  if (isAsciiInput) {
+  if constexpr (ascii) {
     output.resize(input.size());
     upperAscii(output.data(), input.data(), input.size());
   } else {

diff --git a/velox/functions/lib/StringEncodingUtils.h b/velox/functions/lib/StringEncodingUtils.h
@@ -23,38 +23,27 @@ namespace facebook::velox::functions {
 using namespace stringCore;
 
 /// Return the string encoding of a vector, if not set UTF8 is returned
-static StringEncodingMode getStringEncodingOrUTF8(
-    BaseVector* vector,
-    const SelectivityVector& rows) {
+static bool isAscii(BaseVector* vector, const SelectivityVector& rows) {
   if (auto simpleVector = vector->template as<SimpleVector<StringView>>()) {
     auto ascii = simpleVector->isAscii(rows);
-    return ascii && ascii.value() ? StringEncodingMode::ASCII
-                                  : StringEncodingMode::UTF8;
+    return ascii.has_value() && ascii.value();
   }
   VELOX_UNREACHABLE();
-  return StringEncodingMode::UTF8;
+  return false;
 }
 
-/// Wrap an input function with the appropriate string encoding instantiation.
-/// Func is a struct templated on StringEncodingMode with a static function
-/// apply. The wrapper will call Func::apply<EncodingMode> with the correct
-/// StringEncodingMode instantiation.
-template <template <StringEncodingMode> typename Func>
+/// Wrap an input function with the appropriate ascii instantiation.
+/// Func is a struct templated on boolean with a static function
+/// apply. The wrapper will call Func::apply<bool> based on truth value of the
+/// boolean.
+template <template <bool> typename Func>
 struct StringEncodingTemplateWrapper {
   template <typename... Params>
-  static void apply(const StringEncodingMode mode, Params... args) {
-    switch (mode) {
-      case StringEncodingMode::UTF8:
-        Func<StringEncodingMode::UTF8>::apply(args...);
-        return;
-
-      case StringEncodingMode::ASCII:
-        Func<StringEncodingMode::ASCII>::apply(args...);
-        return;
-
-      case StringEncodingMode::MOSTLY_ASCII:
-        Func<StringEncodingMode::MOSTLY_ASCII>::apply(args...);
-        return;
+  static void apply(const bool isAscii, Params... args) {
+    if (isAscii) {
+      Func<true>::apply(args...);
+    } else {
+      Func<false>::apply(args...);
     }
   }
 };

diff --git a/velox/functions/lib/string/StringCore.h b/velox/functions/lib/string/StringCore.h
@@ -43,36 +43,10 @@
 namespace facebook::velox::functions {
 namespace stringCore {
 
-/// Encoding of input string
-enum class StringEncodingMode {
-  UTF8 = 0,
-  ASCII = 1,
-  MOSTLY_ASCII = 2,
-};
-
-// Define ordering on string encodings UTF8 > MOSTLY_ASCII > ASCII
-FOLLY_ALWAYS_INLINE StringEncodingMode
-maxEncoding(const StringEncodingMode& lhs, const StringEncodingMode& rhs) {
-  if (lhs == StringEncodingMode::UTF8 || rhs == StringEncodingMode::UTF8) {
-    return StringEncodingMode::UTF8;
-  }
-
-  if (lhs == StringEncodingMode::MOSTLY_ASCII ||
-      rhs == StringEncodingMode::MOSTLY_ASCII) {
-    return StringEncodingMode::MOSTLY_ASCII;
-  }
-
-  return StringEncodingMode::ASCII;
-};
-
 /// Check if a given string is ascii
-template <StringEncodingMode mode = StringEncodingMode::MOSTLY_ASCII>
 static bool isAscii(const char* str, size_t length);
 
-template <>
-FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::MOSTLY_ASCII>(
-    const char* str,
-    size_t length) {
+FOLLY_ALWAYS_INLINE bool isAscii(const char* str, size_t length) {
   for (auto i = 0; i < length; i++) {
     if (str[i] & 0x80) {
       return false;
@@ -81,20 +55,6 @@ FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::MOSTLY_ASCII>(
   return true;
 }
 
-template <>
-FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::UTF8>(
-    const char* str,
-    size_t length) {
-  return false;
-}
-
-template <>
-FOLLY_ALWAYS_INLINE bool isAscii<StringEncodingMode::ASCII>(
-    const char* str,
-    size_t length) {
-  return true;
-}
-
 /// Perform upper for ascii string input
 FOLLY_ALWAYS_INLINE static void
 upperAscii(char* output, const char* input, size_t length) {
@@ -369,14 +329,14 @@ inline static size_t replace(
 /// Given a utf8 string, a starting position and length returns the
 /// corresponding underlying byte range [startByteIndex, endByteIndex).
 /// Byte indicies starts from 0, UTF8 character positions starts from 1.
-template <StringEncodingMode mode>
+template <bool isAscii>
 static inline std::pair<size_t, size_t>
 getByteRange(const char* str, size_t startCharPosition, size_t length) {
   if (startCharPosition < 1 && length > 0) {
     throw std::invalid_argument(
         "start position must be >= 1 and length must be > 0");
   }
-  if constexpr (mode == StringEncodingMode::ASCII) {
+  if constexpr (isAscii) {
     return std::make_pair(
         startCharPosition - 1, startCharPosition + length - 1);
   } else {

diff --git a/velox/functions/lib/string/StringImpl.h b/velox/functions/lib/string/StringImpl.h
@@ -44,13 +44,9 @@ namespace stringImpl {
 using namespace stringCore;
 
 /// Perform upper for a UTF8 string
-template <
-    StringEncodingMode stringEncoding,
-    typename TOutString,
-    typename TInString>
+template <bool ascii, typename TOutString, typename TInString>
 FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) {
-  bool isAsciiInput = isAscii<stringEncoding>(input.data(), input.size());
-  if (isAsciiInput) {
+  if constexpr (ascii) {
     output.resize(input.size());
     upperAscii(output.data(), input.data(), input.size());
   } else {
@@ -63,13 +59,9 @@ FOLLY_ALWAYS_INLINE bool upper(TOutString& output, const TInString& input) {
 }
 
 /// Perform lower for a UTF8 string
-template <
-    StringEncodingMode stringEncoding,
-    typename TOutString,
-    typename TInString>
+template <bool ascii, typename TOutString, typename TInString>
 FOLLY_ALWAYS_INLINE bool lower(TOutString& output, const TInString& input) {
-  bool isAsciiInput = isAscii<stringEncoding>(input.data(), input.size());
-  if (isAsciiInput) {
+  if constexpr (ascii) {
     output.resize(input.size());
     lowerAscii(output.data(), input.data(), input.size());
   } else {
@@ -133,9 +125,9 @@ void concatDynamic(TOutString& output, const std::vector<TInString>& inputs) {
 }
 
 /// Return length of the input string in chars
-template <StringEncodingMode stringEncoding, typename T>
+template <bool isAscii, typename T>
 FOLLY_ALWAYS_INLINE int64_t length(const T& input) {
-  if constexpr (stringEncoding == StringEncodingMode::ASCII) {
+  if constexpr (isAscii) {
     return input.size();
   } else {
     return lengthUnicode(input.data(), input.size());
@@ -165,7 +157,7 @@ FOLLY_ALWAYS_INLINE void codePointToString(
 /// string. Implements the logic of presto codepoint function.
 template <typename T>
 FOLLY_ALWAYS_INLINE int32_t charToCodePoint(const T& inputString) {
-  auto length = stringImpl::length<StringEncodingMode::UTF8>(inputString);
+  auto length = stringImpl::length</*isAscii*/ false>(inputString);
   VELOX_USER_CHECK_EQ(
       length,
       1,
@@ -180,7 +172,7 @@ FOLLY_ALWAYS_INLINE int32_t charToCodePoint(const T& inputString) {
 /// Returns the starting position in characters of the Nth instance of the
 /// substring in string. Positions start with 1. If not found, 0 is returned. If
 /// subString is empty result is 1.
-template <StringEncodingMode stringEncoding, typename T>
+template <bool isAscii, typename T>
 FOLLY_ALWAYS_INLINE int64_t
 stringPosition(const T& string, const T& subString, int64_t instance = 0) {
   if (subString.size() == 0) {
@@ -200,7 +192,7 @@ stringPosition(const T& string, const T& subString, int64_t instance = 0) {
 
   // Return the number of characters from the beginning of the string to
   // byteIndex.
-  return length<stringEncoding>(std::string_view(string.data(), byteIndex)) + 1;
+  return length<isAscii>(std::string_view(string.data(), byteIndex)) + 1;
 }
 
 /// Replace replaced with replacement in inputString and write results to

diff --git a/velox/functions/lib/string/tests/StringImplTest.cpp b/velox/functions/lib/string/tests/StringImplTest.cpp
@@ -69,11 +69,11 @@ TEST_F(StringImplTest, upperAscii) {
     auto& expectedUpper = std::get<1>(testCase);
 
     std::string upperOutput;
-    upper<StringEncodingMode::ASCII>(upperOutput, input);
+    upper</*ascii*/ true>(upperOutput, input);
     ASSERT_EQ(upperOutput, expectedUpper);
 
     upperOutput.clear();
-    upper<StringEncodingMode::MOSTLY_ASCII>(upperOutput, input);
+    upper</*ascii*/ false>(upperOutput, input);
     ASSERT_EQ(upperOutput, expectedUpper);
   }
 }
@@ -84,11 +84,11 @@ TEST_F(StringImplTest, lowerAscii) {
     auto& expectedLower = std::get<1>(testCase);
 
     std::string lowerOutput;
-    lower<StringEncodingMode::ASCII>(lowerOutput, input);
+    lower</*ascii*/ true>(lowerOutput, input);
     ASSERT_EQ(lowerOutput, expectedLower);
 
     lowerOutput.clear();
-    lower<StringEncodingMode::MOSTLY_ASCII>(lowerOutput, input);
+    lower</*ascii*/ false>(lowerOutput, input);
     ASSERT_EQ(lowerOutput, expectedLower);
   }
 }
@@ -99,11 +99,11 @@ TEST_F(StringImplTest, upperUnicode) {
     auto& expectedUpper = std::get<1>(testCase);
 
     std::string upperOutput;
-    upper<StringEncodingMode::UTF8>(upperOutput, input);
+    upper</*ascii*/ false>(upperOutput, input);
     ASSERT_EQ(upperOutput, expectedUpper);
 
     upperOutput.clear();
-    upper<StringEncodingMode::MOSTLY_ASCII>(upperOutput, input);
+    upper</*ascii*/ false>(upperOutput, input);
     ASSERT_EQ(upperOutput, expectedUpper);
   }
 }
@@ -114,11 +114,11 @@ TEST_F(StringImplTest, lowerUnicode) {
     auto& expectedLower = std::get<1>(testCase);
 
     std::string lowerOutput;
-    lower<StringEncodingMode::UTF8>(lowerOutput, input);
+    lower</*ascii*/ false>(lowerOutput, input);
     ASSERT_EQ(lowerOutput, expectedLower);
 
     lowerOutput.clear();
-    lower<StringEncodingMode::MOSTLY_ASCII>(lowerOutput, input);
+    lower</*ascii*/ false>(lowerOutput, input);
     ASSERT_EQ(lowerOutput, expectedLower);
   }
 }
@@ -183,24 +183,20 @@ TEST_F(StringImplTest, length) {
   for (const auto& test : getUpperAsciiTestData()) {
     auto& inputString = std::get<0>(test);
 
-    ASSERT_EQ(
-        length<StringEncodingMode::ASCII>(inputString), inputString.size());
-    ASSERT_EQ(
-        length<StringEncodingMode::UTF8>(inputString), inputString.size());
-    ASSERT_EQ(
-        length<StringEncodingMode::MOSTLY_ASCII>(inputString),
-        inputString.size());
+    ASSERT_EQ(length</*isAscii*/ true>(inputString), inputString.size());
+    ASSERT_EQ(length</*isAscii*/ false>(inputString), inputString.size());
+    ASSERT_EQ(length</*isAscii*/ false>(inputString), inputString.size());
   }
 
   // Test unicode inputs
   for (auto& test : getLowerUnicodeTestData()) {
     auto& inputString = std::get<0>(test);
 
     ASSERT_EQ(
-        length<StringEncodingMode::UTF8>(inputString),
+        length</*isAscii*/ false>(inputString),
         lengthUtf8Ref(inputString.data(), inputString.size()));
     ASSERT_EQ(
-        length<StringEncodingMode::MOSTLY_ASCII>(inputString),
+        length</*isAscii*/ false>(inputString),
         lengthUtf8Ref(inputString.data(), inputString.size()));
   }
 }
@@ -263,11 +259,11 @@ TEST_F(StringImplTest, stringPosition) {
                                 const int64_t instance,
                                 const int64_t expectedPosition) {
     ASSERT_EQ(
-        stringPosition<StringEncodingMode::ASCII>(
+        stringPosition</*isAscii*/ true>(
             StringView(string), StringView(substr), instance),
         expectedPosition);
     ASSERT_EQ(
-        stringPosition<StringEncodingMode::MOSTLY_ASCII>(
+        stringPosition</*isAscii*/ false>(
             StringView(string), StringView(substr), instance),
         expectedPosition);
   };
@@ -277,11 +273,11 @@ TEST_F(StringImplTest, stringPosition) {
                                   const int64_t instance,
                                   const int64_t expectedPosition) {
     ASSERT_EQ(
-        stringPosition<StringEncodingMode::UTF8>(
+        stringPosition</*isAscii*/ false>(
             StringView(string), StringView(substr), instance),
         expectedPosition);
     ASSERT_EQ(
-        stringPosition<StringEncodingMode::MOSTLY_ASCII>(
+        stringPosition</*isAscii*/ false>(
             StringView(string), StringView(substr), instance),
         expectedPosition);
   };
@@ -307,7 +303,7 @@ TEST_F(StringImplTest, stringPosition) {
   testValidInputUnicode("abc/xyz/foo/bar", "/", 4, 0L);
 
   EXPECT_THROW(
-      stringPosition<StringEncodingMode::MOSTLY_ASCII>(
+      stringPosition</*isAscii*/ false>(
           StringView("foobar"), StringView("foobar"), 0),
       VeloxUserError);
 }
@@ -396,14 +392,12 @@ TEST_F(StringImplTest, getByteRange) {
     auto expectedEndByteIndex = strlen(unicodeString);
 
     // Find the byte range of unicodeString[i, end]
-    auto range =
-        getByteRange<StringEncodingMode::UTF8>(unicodeString, i, 6 - i + 1);
+    auto range = getByteRange</*isAscii*/ false>(unicodeString, i, 6 - i + 1);
 
     EXPECT_EQ(expectedStartByteIndex, range.first);
     EXPECT_EQ(expectedEndByteIndex, range.second);
 
-    range = getByteRange<StringEncodingMode::MOSTLY_ASCII>(
-        unicodeString, i, 6 - i + 1);
+    range = getByteRange</*isAscii*/ false>(unicodeString, i, 6 - i + 1);
 
     EXPECT_EQ(expectedStartByteIndex, range.first);
     EXPECT_EQ(expectedEndByteIndex, range.second);