diff --git a/cpp/src/arrow/util/parsing-util-test.cc b/cpp/src/arrow/util/parsing-util-test.cc index b126b8211b0..9fa5ffb7a91 100644 --- a/cpp/src/arrow/util/parsing-util-test.cc +++ b/cpp/src/arrow/util/parsing-util-test.cc @@ -119,13 +119,16 @@ TEST(StringConversion, ToInt8) { AssertConversion(converter, "0", 0); AssertConversion(converter, "127", 127); + AssertConversion(converter, "0127", 127); AssertConversion(converter, "-128", -128); + AssertConversion(converter, "-00128", -128); // Non-representable values AssertConversionFails(converter, "128"); AssertConversionFails(converter, "-129"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -134,13 +137,18 @@ TEST(StringConversion, ToUInt8) { StringConverter converter; AssertConversion(converter, "0", 0); + AssertConversion(converter, "26", 26); AssertConversion(converter, "255", 255); + AssertConversion(converter, "0255", 255); // Non-representable values - // AssertConversionFails(converter, "-1"); + AssertConversionFails(converter, "-1"); AssertConversionFails(converter, "256"); + AssertConversionFails(converter, "260"); + AssertConversionFails(converter, "1234"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -150,13 +158,16 @@ TEST(StringConversion, ToInt16) { AssertConversion(converter, "0", 0); AssertConversion(converter, "32767", 32767); + AssertConversion(converter, "032767", 32767); AssertConversion(converter, "-32768", -32768); + AssertConversion(converter, "-0032768", -32768); // Non-representable values AssertConversionFails(converter, "32768"); AssertConversionFails(converter, "-32769"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -165,13 +176,17 @@ TEST(StringConversion, ToUInt16) { StringConverter converter; AssertConversion(converter, "0", 0); + AssertConversion(converter, "6660", 6660); AssertConversion(converter, "65535", 65535); + AssertConversion(converter, "065535", 65535); // Non-representable values - // AssertConversionFails(converter, "-1"); + AssertConversionFails(converter, "-1"); AssertConversionFails(converter, "65536"); + AssertConversionFails(converter, "123456"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -181,13 +196,16 @@ TEST(StringConversion, ToInt32) { AssertConversion(converter, "0", 0); AssertConversion(converter, "2147483647", 2147483647); + AssertConversion(converter, "02147483647", 2147483647); AssertConversion(converter, "-2147483648", -2147483648LL); + AssertConversion(converter, "-002147483648", -2147483648LL); // Non-representable values AssertConversionFails(converter, "2147483648"); AssertConversionFails(converter, "-2147483649"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -196,13 +214,17 @@ TEST(StringConversion, ToUInt32) { StringConverter converter; AssertConversion(converter, "0", 0); + AssertConversion(converter, "432198765", 432198765UL); AssertConversion(converter, "4294967295", 4294967295UL); + AssertConversion(converter, "04294967295", 4294967295UL); // Non-representable values - // AssertConversionFails(converter, "-1"); + AssertConversionFails(converter, "-1"); AssertConversionFails(converter, "4294967296"); + AssertConversionFails(converter, "12345678901"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -212,13 +234,16 @@ TEST(StringConversion, ToInt64) { AssertConversion(converter, "0", 0); AssertConversion(converter, "9223372036854775807", 9223372036854775807LL); + AssertConversion(converter, "09223372036854775807", 9223372036854775807LL); AssertConversion(converter, "-9223372036854775808", -9223372036854775807LL - 1); + AssertConversion(converter, "-009223372036854775808", -9223372036854775807LL - 1); // Non-representable values AssertConversionFails(converter, "9223372036854775808"); AssertConversionFails(converter, "-9223372036854775809"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } @@ -230,10 +255,11 @@ TEST(StringConversion, ToUInt64) { AssertConversion(converter, "18446744073709551615", 18446744073709551615ULL); // Non-representable values - // AssertConversionFails(converter, "-1"); + AssertConversionFails(converter, "-1"); AssertConversionFails(converter, "18446744073709551616"); AssertConversionFails(converter, ""); + AssertConversionFails(converter, "-"); AssertConversionFails(converter, "0.0"); AssertConversionFails(converter, "e"); } diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index 8efc6143c8b..4f2dc7894ab 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "arrow/type.h" #include "arrow/type_traits.h" @@ -108,81 +109,213 @@ class StringConverter : public StringToFloatConverterMixinfloat conversion library +namespace detail { + +#define PARSE_UNSIGNED_ITERATION(C_TYPE) \ + if (length > 0) { \ + uint8_t digit = static_cast(*s++ - '0'); \ + result = static_cast(result * 10U); \ + length--; \ + if (ARROW_PREDICT_FALSE(digit > 9U)) { \ + /* Non-digit */ \ + return false; \ + } \ + result = static_cast(result + digit); \ + } + +#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \ + if (length > 0) { \ + if (ARROW_PREDICT_FALSE(result > std::numeric_limits::max() / 10U)) { \ + /* Overflow */ \ + return false; \ + } \ + uint8_t digit = static_cast(*s++ - '0'); \ + result = static_cast(result * 10U); \ + C_TYPE new_result = static_cast(result + digit); \ + if (ARROW_PREDICT_FALSE(--length > 0)) { \ + /* Too many digits */ \ + return false; \ + } \ + if (ARROW_PREDICT_FALSE(digit > 9U)) { \ + /* Non-digit */ \ + return false; \ + } \ + if (ARROW_PREDICT_FALSE(new_result < result)) { \ + /* Overflow */ \ + return false; \ + } \ + result = new_result; \ + } + +inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) { + uint8_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint8_t); + PARSE_UNSIGNED_ITERATION(uint8_t); + PARSE_UNSIGNED_ITERATION_LAST(uint8_t); + *out = result; + return true; +} + +inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) { + uint16_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION_LAST(uint16_t); + *out = result; + return true; +} + +inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) { + uint32_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + + PARSE_UNSIGNED_ITERATION_LAST(uint32_t); + *out = result; + return true; +} + +inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { + uint64_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION_LAST(uint64_t); + *out = result; + return true; +} + +#undef PARSE_UNSIGNED_ITERATION +#undef PARSE_UNSIGNED_ITERATION_LAST + +} // namespace detail + template -class StringConverter> { +class StringToUnsignedIntConverterMixin { public: using value_type = typename ARROW_TYPE::c_type; - StringConverter() { ibuf.imbue(std::locale::classic()); } - bool operator()(const char* s, size_t length, value_type* out) { - static constexpr bool need_long_long = sizeof(value_type) > sizeof(long); // NOLINT - static constexpr value_type min_value = std::numeric_limits::min(); - static constexpr value_type max_value = std::numeric_limits::max(); - - ibuf.clear(); - ibuf.str(std::string(s, length)); - if (need_long_long) { - long long res; // NOLINT - ibuf >> res; - *out = static_cast(res); // may downcast - if (res < min_value || res > max_value) { - return false; - } - } else { - long res; // NOLINT - ibuf >> res; - *out = static_cast(res); // may downcast - if (res < min_value || res > max_value) { - return false; - } + if (ARROW_PREDICT_FALSE(length == 0)) { + return false; } - // XXX Should we reset errno on failure? - return !ibuf.fail() && ibuf.eof(); + // Skip leading zeros + while (length > 0 && *s == '0') { + length--; + s++; + } + return detail::ParseUnsigned(s, length, out); } +}; - protected: - std::istringstream ibuf; +template <> +class StringConverter : public StringToUnsignedIntConverterMixin {}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { +}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { +}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { }; template -class StringConverter> { +class StringToSignedIntConverterMixin { public: using value_type = typename ARROW_TYPE::c_type; - - StringConverter() { ibuf.imbue(std::locale::classic()); } + using unsigned_type = typename std::make_unsigned::type; bool operator()(const char* s, size_t length, value_type* out) { - static constexpr bool need_long_long = - sizeof(value_type) > sizeof(unsigned long); // NOLINT - static constexpr value_type max_value = std::numeric_limits::max(); + static constexpr unsigned_type max_positive = + static_cast(std::numeric_limits::max()); + // Assuming two's complement + static constexpr unsigned_type max_negative = max_positive + 1; + bool negative = false; + unsigned_type unsigned_value = 0; - ibuf.clear(); - ibuf.str(std::string(s, length)); - // XXX The following unfortunately allows negative input values - if (need_long_long) { - unsigned long long res; // NOLINT - ibuf >> res; - *out = static_cast(res); // may downcast - if (res > max_value) { + if (ARROW_PREDICT_FALSE(length == 0)) { + return false; + } + if (*s == '-') { + negative = true; + s++; + if (--length == 0) { return false; } + } + // Skip leading zeros + while (length > 0 && *s == '0') { + length--; + s++; + } + if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) { + return false; + } + if (negative) { + if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { + return false; + } + *out = static_cast(-static_cast(unsigned_value)); } else { - unsigned long res; // NOLINT - ibuf >> res; - *out = static_cast(res); // may downcast - if (res > max_value) { + if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { return false; } + *out = static_cast(unsigned_value); } - // XXX Should we reset errno on failure? - return !ibuf.fail() && ibuf.eof(); + return true; } - - protected: - std::istringstream ibuf; }; +template <> +class StringConverter : public StringToSignedIntConverterMixin {}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin {}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin {}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin {}; + } // namespace internal } // namespace arrow