diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 13a3dacac6f..0af9c83c45f 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include // IWYU pragma: keep #include @@ -46,6 +47,8 @@ namespace arrow { using internal::checked_cast; +namespace { + class PrettyPrinter { public: PrettyPrinter(const PrettyPrintOptions& options, std::ostream* sink) @@ -90,11 +93,6 @@ void PrettyPrinter::CloseArray(const Array& array) { void PrettyPrinter::Write(const char* data) { (*sink_) << data; } void PrettyPrinter::Write(const std::string& data) { (*sink_) << data; } -void PrettyPrinter::WriteIndented(const char* data) { - Indent(); - Write(data); -} - void PrettyPrinter::WriteIndented(const std::string& data) { Indent(); Write(data); @@ -167,29 +165,36 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_date WriteDataValues(const T& array) { + enable_if_time WriteDataValues(const T& array) { const auto data = array.raw_values(); - using unit = typename std::conditional::value, - arrow_vendored::date::days, - std::chrono::milliseconds>::type; - WriteValues(array, [&](int64_t i) { FormatDateTime("%F", data[i], true); }); + const auto& type = checked_cast(*array.type()); + WriteValues(array, + [&](int64_t i) { FormatDateTime(type.unit(), "%T", data[i], false); }); return Status::OK(); } - template - enable_if_time WriteDataValues(const T& array) { + Status WriteDataValues(const Date32Array& array) { const auto data = array.raw_values(); - const auto type = static_cast(array.type().get()); - WriteValues(array, - [&](int64_t i) { FormatDateTime(type->unit(), "%T", data[i], false); }); + WriteValues(array, [&](int64_t i) { + FormatDateTime("%F", arrow_vendored::date::days{data[i]}, true); + }); + return Status::OK(); + } + + Status WriteDataValues(const Date64Array& array) { + const auto data = array.raw_values(); + WriteValues(array, [&](int64_t i) { + FormatDateTime("%F", std::chrono::milliseconds{data[i]}, true); + }); return Status::OK(); } Status WriteDataValues(const TimestampArray& array) { const int64_t* data = array.raw_values(); - const auto type = static_cast(array.type().get()); + const auto& type = checked_cast(*array.type()); + WriteValues(array, - [&](int64_t i) { FormatDateTime(type->unit(), "%F %T", data[i], true); }); + [&](int64_t i) { FormatDateTime(type.unit(), "%F %T", data[i], true); }); return Status::OK(); } @@ -429,11 +434,47 @@ class ArrayPrinter : public PrettyPrinter { private: template - void FormatDateTime(const char* fmt, int64_t value, bool add_epoch) { + void FormatDateTime(const char* fmt, Unit duration, bool add_epoch) { + // NOTE about bounds checking: + // + // While we catch exceptions below, some out-of-bound values would result + // in successful but erroneous printing because of silent integer wraparound + // in the `arrow_vendored::date` library. + // + // To avoid such misprinting, we must therefore check the bounds explicitly. + // The bounds correspond to start of year -32767 and end of year 32767, + // respectively (-32768 is an invalid year value in `arrow_vendored::date`). + // + // Note these values are the same as documented for C++20: + // https://en.cppreference.com/w/cpp/chrono/year_month_day/operator_days + constexpr Unit kMinIncl = + std::chrono::duration_cast(arrow_vendored::date::days{-12687428}); + constexpr Unit kMaxExcl = + std::chrono::duration_cast(arrow_vendored::date::days{11248738}); + if (duration >= kMinIncl && duration < kMaxExcl) { + try { + if (add_epoch) { + (*sink_) << arrow_vendored::date::format(fmt, epoch_ + duration); + } else { + (*sink_) << arrow_vendored::date::format(fmt, duration); + } + return; + } catch (std::ios::failure&) { + // Fall back below + } + } + WriteOutOfRange(duration.count()); + } + + // FormatDateTime specialization for nanoseconds: a 64-bit number of + // nanoseconds cannot represent years outside of the [-32767, 32767] + // range, and the {kMinIncl, kMaxExcl} constants above would overflow. + void FormatDateTime(const char* fmt, std::chrono::nanoseconds duration, + bool add_epoch) { if (add_epoch) { - (*sink_) << arrow_vendored::date::format(fmt, epoch_ + Unit{value}); + (*sink_) << arrow_vendored::date::format(fmt, epoch_ + duration); } else { - (*sink_) << arrow_vendored::date::format(fmt, Unit{value}); + (*sink_) << arrow_vendored::date::format(fmt, duration); } } @@ -441,24 +482,29 @@ class ArrayPrinter : public PrettyPrinter { bool add_epoch) { switch (unit) { case TimeUnit::NANO: - FormatDateTime(fmt, value, add_epoch); + FormatDateTime(fmt, std::chrono::nanoseconds{value}, add_epoch); break; case TimeUnit::MICRO: - FormatDateTime(fmt, value, add_epoch); + FormatDateTime(fmt, std::chrono::microseconds{value}, add_epoch); break; case TimeUnit::MILLI: - FormatDateTime(fmt, value, add_epoch); + FormatDateTime(fmt, std::chrono::milliseconds{value}, add_epoch); break; case TimeUnit::SECOND: - FormatDateTime(fmt, value, add_epoch); + FormatDateTime(fmt, std::chrono::seconds{value}, add_epoch); break; } } - static arrow_vendored::date::sys_days epoch_; + template + void WriteOutOfRange(const V& value) { + (*sink_) << ""; + } + + static const arrow_vendored::date::sys_days epoch_; }; -arrow_vendored::date::sys_days ArrayPrinter::epoch_ = +const arrow_vendored::date::sys_days ArrayPrinter::epoch_ = arrow_vendored::date::sys_days{arrow_vendored::date::jan / 1 / 1970}; Status ArrayPrinter::WriteValidityBitmap(const Array& array) { @@ -477,6 +523,8 @@ Status ArrayPrinter::WriteValidityBitmap(const Array& array) { } } +} // namespace + Status PrettyPrint(const Array& arr, int indent, std::ostream* sink) { PrettyPrintOptions options; options.indent = indent; @@ -608,6 +656,8 @@ Status DebugPrint(const Array& arr, int indent) { return PrettyPrint(arr, indent, &std::cerr); } +namespace { + class SchemaPrinter : public PrettyPrinter { public: SchemaPrinter(const Schema& schema, const PrettyPrintOptions& options, @@ -709,6 +759,8 @@ Status SchemaPrinter::PrintField(const Field& field) { return Status::OK(); } +} // namespace + Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, std::ostream* sink) { SchemaPrinter printer(schema, options, sink); diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index e06efa20ae1..d01e5377d07 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -54,6 +55,7 @@ void CheckStream(const T& obj, const PrettyPrintOptions& options, const char* ex void CheckArray(const Array& arr, const PrettyPrintOptions& options, const char* expected, bool check_operator = true) { + ARROW_SCOPED_TRACE("For datatype: ", arr.type()->ToString()); CheckStream(arr, options, expected); if (options.indent == 0 && check_operator) { @@ -309,6 +311,161 @@ TEST_F(TestPrettyPrint, TestIntervalTypes) { } } +TEST_F(TestPrettyPrint, DateTimeTypesWithOutOfRangeValues) { + // Our vendored date library allows years within [-32767, 32767], + // which limits the range of values which can be displayed. + const int32_t min_int32 = std::numeric_limits::min(); + const int32_t max_int32 = std::numeric_limits::max(); + const int64_t min_int64 = std::numeric_limits::min(); + const int64_t max_int64 = std::numeric_limits::max(); + + const int32_t min_date32 = -12687428; + const int32_t max_date32 = 11248737; + const int64_t min_date64 = 86400000LL * min_date32; + const int64_t max_date64 = 86400000LL * (max_date32 + 1) - 1; + const int64_t min_timestamp_seconds = -1096193779200LL; + const int64_t max_timestamp_seconds = 971890963199LL; + const int64_t min_timestamp_millis = min_timestamp_seconds * 1000; + const int64_t max_timestamp_millis = max_timestamp_seconds * 1000 + 999; + const int64_t min_timestamp_micros = min_timestamp_millis * 1000; + const int64_t max_timestamp_micros = max_timestamp_millis * 1000 + 999; + + std::vector is_valid = {false, false, false, false, true, + true, true, true, true, true}; + + { + std::vector values = {min_int32, max_int32, min_date32 - 1, max_date32 + 1, + min_int32, max_int32, min_date32 - 1, max_date32 + 1, + min_date32, max_date32}; + static const char* expected = R"expected([ + null, + null, + null, + null, + , + , + , + , + -32767-01-01, + 32767-12-31 +])expected"; + CheckPrimitive({0, 10}, is_valid, values, expected); + } + + { + std::vector values = {min_int64, max_int64, min_date64 - 1, max_date64 + 1, + min_int64, max_int64, min_date64 - 1, max_date64 + 1, + min_date64, max_date64}; + static const char* expected = R"expected([ + null, + null, + null, + null, + , + , + , + , + -32767-01-01, + 32767-12-31 +])expected"; + CheckPrimitive({0, 10}, is_valid, values, expected); + } + + // TODO time32, time64 + + { + std::vector values = {min_int64, + max_int64, + min_timestamp_seconds - 1, + max_timestamp_seconds + 1, + min_int64, + max_int64, + min_timestamp_seconds - 1, + max_timestamp_seconds + 1, + min_timestamp_seconds, + max_timestamp_seconds}; + static const char* expected = R"expected([ + null, + null, + null, + null, + , + , + , + , + -32767-01-01 00:00:00, + 32767-12-31 23:59:59 +])expected"; + CheckPrimitive(timestamp(TimeUnit::SECOND), {0, 10}, is_valid, + values, expected); + } + { + std::vector values = {min_int64, + max_int64, + min_timestamp_millis - 1, + max_timestamp_millis + 1, + min_int64, + max_int64, + min_timestamp_millis - 1, + max_timestamp_millis + 1, + min_timestamp_millis, + max_timestamp_millis}; + static const char* expected = R"expected([ + null, + null, + null, + null, + , + , + , + , + -32767-01-01 00:00:00.000, + 32767-12-31 23:59:59.999 +])expected"; + CheckPrimitive(timestamp(TimeUnit::MILLI), {0, 10}, is_valid, + values, expected); + } + { + std::vector values = {min_int64, + max_int64, + min_timestamp_micros - 1, + max_timestamp_micros + 1, + min_int64, + max_int64, + min_timestamp_micros - 1, + max_timestamp_micros + 1, + min_timestamp_micros, + max_timestamp_micros}; + static const char* expected = R"expected([ + null, + null, + null, + null, + , + , + , + , + -32767-01-01 00:00:00.000000, + 32767-12-31 23:59:59.999999 +])expected"; + CheckPrimitive(timestamp(TimeUnit::MICRO), {0, 10}, is_valid, + values, expected); + } +#ifndef ARROW_UBSAN + // While the values below are legal and correct, they trigger an internal + // signed overflow inside the arrow_vendored::date library. + { + std::vector values = {min_int64, max_int64}; + static const char* expected = R"expected([ + 1677-09-21 00:12:43.145224192, + 2262-04-11 23:47:16.854775807 +])expected"; + CheckPrimitive(timestamp(TimeUnit::NANO), {0, 10}, + {true, true}, values, expected); + } +#endif +} + TEST_F(TestPrettyPrint, StructTypeBasic) { auto simple_1 = field("one", int32()); auto simple_2 = field("two", int32());