ARROW-1491: [C++] Add casting from strings to numbers and booleans

pitrou · wesm · commit c84aac8653d4 · 2018-08-06T15:44:53.000-04:00
The implementation for numbers uses the C standard strto* functions.
This makes casting a bit lenient (it will accept leading whitespace).
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
@@ -769,6 +769,109 @@ TEST_F(TestCast, OffsetOutputBuffer) {
                                                                 int16(), e3);
 }
 
+TEST_F(TestCast, StringToBoolean) {
+  CastOptions options;
+  // Element 1 is flagged invalid in both cases below
+  vector<bool> is_valid = {true, false, true, true, true};
+  // Mixed-case "true"/"false" words (v1) and "0"/"1" digits (v2), same expected output
+  vector<std::string> v1 = {"False", "true", "true", "True", "false"};
+  vector<std::string> v2 = {"0", "1", "1", "1", "0"};
+  vector<bool> e = {false, true, true, true, false};
+  CheckCase<StringType, std::string, BooleanType, bool>(utf8(), v1, is_valid, boolean(),
+                                                        e, options);
+  CheckCase<StringType, std::string, BooleanType, bool>(utf8(), v2, is_valid, boolean(),
+                                                        e, options);
+}
+
+TEST_F(TestCast, StringToBooleanErrors) {
+  CastOptions options;
+  // One valid element per case; CheckFails presumably expects the cast to error out
+  vector<bool> is_valid = {true};
+  // Inputs: "false" with a trailing space, and the single-letter abbreviation "T"
+  CheckFails<StringType, std::string>(utf8(), {"false "}, is_valid, boolean(), options);
+  CheckFails<StringType, std::string>(utf8(), {"T"}, is_valid, boolean(), options);
+}
+
+TEST_F(TestCast, StringToNumber) {
+  CastOptions options;
+  // Element 1 is flagged invalid for every input vector in this test
+  vector<bool> is_valid = {true, false, true, true, true};
+
+  // string to int: the same small values are checked against every signed width
+  vector<std::string> v_int = {"0", "1", "127", "-1", "0"};
+  vector<int8_t> e_int8 = {0, 1, 127, -1, 0};
+  vector<int16_t> e_int16 = {0, 1, 127, -1, 0};
+  vector<int32_t> e_int32 = {0, 1, 127, -1, 0};
+  vector<int64_t> e_int64 = {0, 1, 127, -1, 0};
+  CheckCase<StringType, std::string, Int8Type, int8_t>(utf8(), v_int, is_valid, int8(),
+                                                       e_int8, options);
+  CheckCase<StringType, std::string, Int16Type, int16_t>(utf8(), v_int, is_valid, int16(),
+                                                         e_int16, options);
+  CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(),
+                                                         e_int32, options);
+  CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(),
+                                                         e_int64, options);
+
+  v_int = {"2147483647", "0", "-2147483648", "0", "0"};  // int32 limits
+  e_int32 = {2147483647, 0, -2147483648LL, 0, 0};
+  CheckCase<StringType, std::string, Int32Type, int32_t>(utf8(), v_int, is_valid, int32(),
+                                                         e_int32, options);
+  v_int = {"9223372036854775807", "0", "-9223372036854775808", "0", "0"};  // int64 limits
+  e_int64 = {9223372036854775807LL, 0, (-9223372036854775807LL - 1), 0, 0};
+  CheckCase<StringType, std::string, Int64Type, int64_t>(utf8(), v_int, is_valid, int64(),
+                                                         e_int64, options);
+
+  // string to uint: the same small values are checked against every unsigned width
+  vector<std::string> v_uint = {"0", "1", "127", "255", "0"};
+  vector<uint8_t> e_uint8 = {0, 1, 127, 255, 0};
+  vector<uint16_t> e_uint16 = {0, 1, 127, 255, 0};
+  vector<uint32_t> e_uint32 = {0, 1, 127, 255, 0};
+  vector<uint64_t> e_uint64 = {0, 1, 127, 255, 0};
+  CheckCase<StringType, std::string, UInt8Type, uint8_t>(utf8(), v_uint, is_valid,
+                                                         uint8(), e_uint8, options);
+  CheckCase<StringType, std::string, UInt16Type, uint16_t>(utf8(), v_uint, is_valid,
+                                                           uint16(), e_uint16, options);
+  CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid,
+                                                           uint32(), e_uint32, options);
+  CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid,
+                                                           uint64(), e_uint64, options);
+
+  v_uint = {"4294967295", "0", "0", "0", "0"};  // uint32 maximum
+  e_uint32 = {4294967295, 0, 0, 0, 0};
+  CheckCase<StringType, std::string, UInt32Type, uint32_t>(utf8(), v_uint, is_valid,
+                                                           uint32(), e_uint32, options);
+  v_uint = {"18446744073709551615", "0", "0", "0", "0"};  // uint64 maximum
+  e_uint64 = {18446744073709551615ULL, 0, 0, 0, 0};
+  CheckCase<StringType, std::string, UInt64Type, uint64_t>(utf8(), v_uint, is_valid,
+                                                           uint64(), e_uint64, options);
+
+  // string to float: the same decimal inputs are checked as float32 and as float64
+  vector<std::string> v_float = {"0.1", "1.2", "127.3", "200.4", "0.5"};
+  vector<float> e_float = {0.1f, 1.2f, 127.3f, 200.4f, 0.5f};
+  vector<double> e_double = {0.1, 1.2, 127.3, 200.4, 0.5};
+  CheckCase<StringType, std::string, FloatType, float>(utf8(), v_float, is_valid,
+                                                       float32(), e_float, options);
+  CheckCase<StringType, std::string, DoubleType, double>(utf8(), v_float, is_valid,
+                                                         float64(), e_double, options);
+}
+
+TEST_F(TestCast, StringToNumberErrors) {
+  CastOptions options;
+
+  vector<bool> is_valid = {true};
+  // int8: non-numeric, trailing garbage, above max, below min, fractional input
+  CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, int8(), options);
+  CheckFails<StringType, std::string>(utf8(), {"12 z"}, is_valid, int8(), options);
+  CheckFails<StringType, std::string>(utf8(), {"128"}, is_valid, int8(), options);
+  CheckFails<StringType, std::string>(utf8(), {"-129"}, is_valid, int8(), options);
+  CheckFails<StringType, std::string>(utf8(), {"0.5"}, is_valid, int8(), options);
+  // uint8: above max, negative input
+  CheckFails<StringType, std::string>(utf8(), {"256"}, is_valid, uint8(), options);
+  CheckFails<StringType, std::string>(utf8(), {"-1"}, is_valid, uint8(), options);
+  // float32: non-numeric input
+  CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, float32(), options);
+}
+
 template <typename TestType>
 class TestDictionaryCast : public TestCast {};
 
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
@@ -17,7 +17,9 @@
 
 #include "arrow/compute/kernels/cast.h"
 
+#include <cerrno>
 #include <cstdint>
+#include <cstdlib>
 #include <cstring>
 #include <functional>
 #include <limits>
@@ -727,6 +729,191 @@ struct CastFunctor<T, DictionaryType,
   }
 };
 
+// ----------------------------------------------------------------------
+// String to Number
+
+// Overloads wrapping strtof() / strtod(); the type of `out` selects the converter
+static void StringToFloat(const char* str, char** str_end, float* out) {
+  *out = strtof(str, str_end);
+}
+// Double-precision overload: same parameters, backed by strtod()
+static void StringToFloat(const char* str, char** str_end, double* out) {
+  *out = strtod(str, str_end);
+}
+
+// Cast the `length` bytes at `str` to a floating-point number stored in `*out`.
+// Returns true only if the whole, non-empty string was consumed by strtof()/strtod()
+// without a range error (those functions skip leading whitespace); false otherwise.
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value,
+                        bool>::type static CastStringToNumber(const char* str,
+                                                              size_t length, T* out) {
+  std::string null_terminated(str, length);  // C converters need a null terminator
+  str = null_terminated.data();
+  char* str_end;
+  errno = 0;  // range errors are only reported through errno: start from a clean state
+  StringToFloat(str, &str_end, out);
+  return (length > 0 && errno == 0 && static_cast<size_t>(str_end - str) == length);
+}
+
+// Cast the `length` bytes at `str` to a signed integer stored in `*out`.
+// Returns true only if the whole, non-empty string is a base-10 integer that
+// fits in T; leading whitespace is tolerated because strtoll() skips it.
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
+                        bool>::type static CastStringToNumber(const char* str,
+                                                              size_t length, T* out) {
+  static_assert(sizeof(T) <= sizeof(long long), "T too wide for strtoll");  // NOLINT
+  static constexpr T min_value = std::numeric_limits<T>::min();
+  static constexpr T max_value = std::numeric_limits<T>::max();
+
+  // Need a null-terminated copy to pass to the C library converter
+  std::string null_terminated(str, length);
+  str = null_terminated.data();
+  char* str_end;
+  // strtoll() reports overflow through errno only, so start from a clean state
+  errno = 0;
+  // long long is at least as wide as any T accepted here; narrower types are
+  // range-checked explicitly below.
+  const long long res = std::strtoll(str, &str_end, 10);  // NOLINT
+  *out = static_cast<T>(res);  // may downcast
+  if (res < min_value || res > max_value) {
+    return false;
+  }
+  // Reject range errors, the empty string and any unparsed trailing characters
+  return (length > 0 && errno == 0 && static_cast<size_t>(str_end - str) == length);
+}
+
+// Cast the `length` bytes at `str` to an unsigned integer stored in `*out`.
+// Returns true only if the whole, non-empty string is a non-negative base-10
+// integer that fits in T; leading whitespace is tolerated as strtoull() skips it.
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value,
+                        bool>::type static CastStringToNumber(const char* str,
+                                                              size_t length, T* out) {
+  static_assert(sizeof(T) <= sizeof(unsigned long long), "T too wide");  // NOLINT
+  static constexpr T max_value = std::numeric_limits<T>::max();
+  // Need a null-terminated copy to pass to the C library converter
+  std::string null_terminated(str, length);
+  // strtoull() accepts a minus sign and negates modulo 2^N, so "-1" would silently
+  // become the maximum value: reject any minus sign up front.
+  if (null_terminated.find('-') != std::string::npos) {
+    return false;
+  }
+  str = null_terminated.data();
+  char* str_end;
+  errno = 0;  // strtoull() reports overflow through errno only
+  const unsigned long long res = std::strtoull(str, &str_end, 10);  // NOLINT
+  *out = static_cast<T>(res);  // may downcast
+  if (res > max_value) {
+    return false;
+  }
+  return (length > 0 && errno == 0 && static_cast<size_t>(str_end - str) == length);
+}
+
+template <typename O>
+struct CastFunctor<O, StringType, typename std::enable_if<is_number<O>::value>::type> {
+  void operator()(FunctionContext* ctx, const CastOptions& options,
+                  const ArrayData& input, ArrayData* output) {
+    using out_type = typename O::c_type;
+    // Parse every non-null string into the matching output slot; stop at first failure
+    StringArray input_array(input.Copy());
+    auto out_data = GetMutableValues<out_type>(output, 1);
+    errno = 0;  // CastStringToNumber() inspects errno after calling the C converters
+
+    for (int64_t i = 0; i < input.length; ++i) {
+      if (input_array.IsNull(i)) {
+        out_data++;  // skip the slot of a null input without writing to it
+        continue;
+      }
+      int32_t length = -1;
+      auto str = input_array.GetValue(i, &length);
+      if (!CastStringToNumber(reinterpret_cast<const char*>(str),
+                              static_cast<size_t>(length), out_data)) {
+        std::stringstream ss;
+        ss << "Failed to cast String '" << input_array.GetString(i) << "' into "
+           << output->type->ToString();
+        ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
+        return;  // error reported through ctx; remaining slots are not converted
+      }
+      ++out_data;
+    }
+  }
+};
+
+// ----------------------------------------------------------------------
+// String to Boolean
+
+// Helper function to cast a string of `length` bytes to a boolean.  Accepts "0", "1"
+// and case-insensitive "true" / "false"; returns true on success, false on error
+// (in which case `*out` may already have been written and must be ignored).
+static bool CastStringtoBoolean(const char* s, size_t length, bool* out) {
+  if (length == 1) {
+    // "0" or "1"?
+    if (s[0] == '0') {
+      *out = false;
+      return true;
+    }
+    if (s[0] == '1') {
+      *out = true;
+      return true;
+    }
+    return false;
+  }
+  if (length == 4) {
+    // "true"?
+    *out = true;  // only meaningful if the comparison below succeeds
+    return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
+            (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
+  }
+  if (length == 5) {
+    // "false"?
+    *out = false;  // only meaningful if the comparison below succeeds
+    return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
+            (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
+            (s[4] == 'e' || s[4] == 'E'));
+  }
+  return false;
+}
+
+template <typename O>
+struct CastFunctor<O, StringType,
+                   typename std::enable_if<std::is_same<BooleanType, O>::value>::type> {
+  void operator()(FunctionContext* ctx, const CastOptions& options,
+                  const ArrayData& input, ArrayData* output) {
+    StringArray input_array(input.Copy());
+    internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(),
+                                           output->offset, input.length);
+    // One writer step per input slot; null inputs only advance the writer
+    for (int64_t i = 0; i < input.length; ++i) {
+      if (input_array.IsNull(i)) {
+        writer.Next();
+        continue;
+      }
+
+      int32_t length = -1;
+      auto str = input_array.GetValue(i, &length);
+      bool value;
+      if (!CastStringtoBoolean(reinterpret_cast<const char*>(str),
+                               static_cast<size_t>(length), &value)) {
+        std::stringstream ss;
+        ss << "Failed to cast String '" << input_array.GetString(i) << "' into "
+           << output->type->ToString();
+        ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
+        return;  // error reported through ctx; writer.Finish() is not reached
+      }
+
+      if (value) {
+        writer.Set();
+      } else {
+        writer.Clear();
+      }
+      writer.Next();
+    }
+    writer.Finish();
+  }
+};
+
 // ----------------------------------------------------------------------
 
 typedef std::function<void(FunctionContext*, const CastOptions& options, const ArrayData&,
@@ -905,6 +1092,20 @@ class CastKernel : public UnaryKernel {
   FN(TimestampType, Date64Type);     \
   FN(TimestampType, Int64Type);
 
+#define STRING_CASES(FN, IN_TYPE) /* FN per cast target; IN_TYPE unused */ \
+  FN(StringType, StringType);     \
+  FN(StringType, BooleanType);    \
+  FN(StringType, UInt8Type);      \
+  FN(StringType, Int8Type);       \
+  FN(StringType, UInt16Type);     \
+  FN(StringType, Int16Type);      \
+  FN(StringType, UInt32Type);     \
+  FN(StringType, Int32Type);      \
+  FN(StringType, UInt64Type);     \
+  FN(StringType, Int64Type);      \
+  FN(StringType, FloatType);      \
+  FN(StringType, DoubleType);
+
 #define DICTIONARY_CASES(FN, IN_TYPE) \
   FN(IN_TYPE, NullType);              \
   FN(IN_TYPE, Time32Type);            \
@@ -962,6 +1163,7 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type);
 GET_CAST_FUNCTION(TIME32_CASES, Time32Type);
 GET_CAST_FUNCTION(TIME64_CASES, Time64Type);
 GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType);
+GET_CAST_FUNCTION(STRING_CASES, StringType);
 GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType);
 
 #define CAST_FUNCTION_CASE(InType)                      \
@@ -1009,6 +1211,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>&
     CAST_FUNCTION_CASE(Time32Type);
     CAST_FUNCTION_CASE(Time64Type);
     CAST_FUNCTION_CASE(TimestampType);
+    CAST_FUNCTION_CASE(StringType);
     CAST_FUNCTION_CASE(DictionaryType);
     case Type::LIST:
       RETURN_NOT_OK(GetListCastFunc(in_type, out_type, options, kernel));