Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 78 additions & 26 deletions cpp/src/arrow/pretty_print.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream> // IWYU pragma: keep
#include <string>
Expand All @@ -46,6 +47,8 @@ namespace arrow {

using internal::checked_cast;

namespace {

class PrettyPrinter {
public:
PrettyPrinter(const PrettyPrintOptions& options, std::ostream* sink)
Expand Down Expand Up @@ -90,11 +93,6 @@ void PrettyPrinter::CloseArray(const Array& array) {
// Streams a C string to the output sink.
void PrettyPrinter::Write(const char* data) {
  auto& out = *sink_;
  out << data;
}

// Streams a std::string to the output sink.
void PrettyPrinter::Write(const std::string& data) {
  auto& out = *sink_;
  out << data;
}

void PrettyPrinter::WriteIndented(const char* data) {
Indent();
Write(data);
}

void PrettyPrinter::WriteIndented(const std::string& data) {
Indent();
Write(data);
Expand Down Expand Up @@ -167,29 +165,36 @@ class ArrayPrinter : public PrettyPrinter {
}

// Templated WriteDataValues overload, enabled through the enable_if_* return
// type. Each element is rendered via FormatDateTime inside WriteValues; the
// function always returns Status::OK().
template <typename T>
enable_if_date<typename T::TypeClass, Status> WriteDataValues(const T& array) {
enable_if_time<typename T::TypeClass, Status> WriteDataValues(const T& array) {
const auto data = array.raw_values();
using unit = typename std::conditional<std::is_same<T, Date32Array>::value,
arrow_vendored::date::days,
std::chrono::milliseconds>::type;
WriteValues(array, [&](int64_t i) { FormatDateTime<unit>("%F", data[i], true); });
const auto& type = checked_cast<const TimeType&>(*array.type());
// "%T" with add_epoch=false: the raw value is formatted as a bare duration in
// the unit reported by the array's TimeType.
WriteValues(array,
[&](int64_t i) { FormatDateTime(type.unit(), "%T", data[i], false); });
return Status::OK();
}

template <typename T>
enable_if_time<typename T::TypeClass, Status> WriteDataValues(const T& array) {
// Date32 overload: always returns Status::OK().
Status WriteDataValues(const Date32Array& array) {
const auto data = array.raw_values();
const auto type = static_cast<const TimeType*>(array.type().get());
WriteValues(array,
[&](int64_t i) { FormatDateTime(type->unit(), "%T", data[i], false); });
// Each raw value is wrapped as a count of days and printed with "%F", with the
// epoch added (add_epoch=true).
WriteValues(array, [&](int64_t i) {
FormatDateTime("%F", arrow_vendored::date::days{data[i]}, true);
});
return Status::OK();
}

// Date64 overload: every raw value is wrapped as a count of milliseconds and
// printed with "%F", with the epoch added. Always returns Status::OK().
Status WriteDataValues(const Date64Array& array) {
  const auto raw = array.raw_values();
  WriteValues(array, [&](int64_t index) {
    const std::chrono::milliseconds since_epoch{raw[index]};
    FormatDateTime("%F", since_epoch, true);
  });
  return Status::OK();
}

// Timestamp overload: prints each value as "%F %T" with the epoch added, using
// the unit carried by the array's TimestampType. Always returns Status::OK().
Status WriteDataValues(const TimestampArray& array) {
const int64_t* data = array.raw_values();
const auto type = static_cast<const TimestampType*>(array.type().get());
const auto& type = checked_cast<const TimestampType&>(*array.type());

WriteValues(array,
[&](int64_t i) { FormatDateTime(type->unit(), "%F %T", data[i], true); });
[&](int64_t i) { FormatDateTime(type.unit(), "%F %T", data[i], true); });
return Status::OK();
}

Expand Down Expand Up @@ -429,36 +434,77 @@ class ArrayPrinter : public PrettyPrinter {

private:
// Formats `duration` to the sink using `fmt`. When `add_epoch` is true the
// duration is added to epoch_ before formatting. Durations outside
// [kMinIncl, kMaxExcl), or for which formatting throws std::ios::failure,
// are reported through WriteOutOfRange instead.
template <typename Unit>
void FormatDateTime(const char* fmt, int64_t value, bool add_epoch) {
void FormatDateTime(const char* fmt, Unit duration, bool add_epoch) {
// NOTE about bounds checking:
//
// While we catch exceptions below, some out-of-bound values would result
// in successful but erroneous printing because of silent integer wraparound
// in the `arrow_vendored::date` library.
//
// To avoid such misprinting, we must therefore check the bounds explicitly.
// The bounds correspond to start of year -32767 and end of year 32767,
// respectively (-32768 is an invalid year value in `arrow_vendored::date`).
//
// Note these values are the same as documented for C++20:
// https://en.cppreference.com/w/cpp/chrono/year_month_day/operator_days
constexpr Unit kMinIncl =
std::chrono::duration_cast<Unit>(arrow_vendored::date::days{-12687428});
constexpr Unit kMaxExcl =
std::chrono::duration_cast<Unit>(arrow_vendored::date::days{11248738});
if (duration >= kMinIncl && duration < kMaxExcl) {
try {
if (add_epoch) {
(*sink_) << arrow_vendored::date::format(fmt, epoch_ + duration);
} else {
(*sink_) << arrow_vendored::date::format(fmt, duration);
}
// Formatting succeeded; skip the out-of-range fallback.
return;
} catch (std::ios::failure&) {
// Fall back below
}
}
// Print the raw tick count so the offending value remains visible.
WriteOutOfRange(duration.count());
}

// FormatDateTime overload for nanoseconds: a 64-bit number of
// nanoseconds cannot represent years outside of the [-32767, 32767]
// range, and the {kMinIncl, kMaxExcl} constants above would overflow.
// No bounds check is done here; the value is formatted directly, optionally
// offset by epoch_.
void FormatDateTime(const char* fmt, std::chrono::nanoseconds duration,
bool add_epoch) {
if (add_epoch) {
(*sink_) << arrow_vendored::date::format(fmt, epoch_ + Unit{value});
(*sink_) << arrow_vendored::date::format(fmt, epoch_ + duration);
} else {
(*sink_) << arrow_vendored::date::format(fmt, Unit{value});
(*sink_) << arrow_vendored::date::format(fmt, duration);
}
}

// Dispatches on the runtime `unit`: the raw `value` is interpreted as a count
// of the matching std::chrono duration type and handed to the corresponding
// FormatDateTime together with `fmt` and `add_epoch`.
void FormatDateTime(TimeUnit::type unit, const char* fmt, int64_t value,
bool add_epoch) {
switch (unit) {
case TimeUnit::NANO:
FormatDateTime<std::chrono::nanoseconds>(fmt, value, add_epoch);
FormatDateTime(fmt, std::chrono::nanoseconds{value}, add_epoch);
break;
case TimeUnit::MICRO:
FormatDateTime<std::chrono::microseconds>(fmt, value, add_epoch);
FormatDateTime(fmt, std::chrono::microseconds{value}, add_epoch);
break;
case TimeUnit::MILLI:
FormatDateTime<std::chrono::milliseconds>(fmt, value, add_epoch);
FormatDateTime(fmt, std::chrono::milliseconds{value}, add_epoch);
break;
case TimeUnit::SECOND:
FormatDateTime<std::chrono::seconds>(fmt, value, add_epoch);
FormatDateTime(fmt, std::chrono::seconds{value}, add_epoch);
break;
}
}

static arrow_vendored::date::sys_days epoch_;
// Writes a placeholder of the form "<value out of range: VALUE>" to the sink,
// where VALUE is `value` streamed with operator<<.
template <typename V>
void WriteOutOfRange(const V& value) {
  auto& out = *sink_;
  out << "<value out of range: ";
  out << value;
  out << ">";
}

static const arrow_vendored::date::sys_days epoch_;
};

// Day-precision time point for 1970-01-01; FormatDateTime adds it to a
// duration when called with add_epoch set.
arrow_vendored::date::sys_days ArrayPrinter::epoch_ =
const arrow_vendored::date::sys_days ArrayPrinter::epoch_ =
arrow_vendored::date::sys_days{arrow_vendored::date::jan / 1 / 1970};

Status ArrayPrinter::WriteValidityBitmap(const Array& array) {
Expand All @@ -477,6 +523,8 @@ Status ArrayPrinter::WriteValidityBitmap(const Array& array) {
}
}

} // namespace

Status PrettyPrint(const Array& arr, int indent, std::ostream* sink) {
PrettyPrintOptions options;
options.indent = indent;
Expand Down Expand Up @@ -608,6 +656,8 @@ Status DebugPrint(const Array& arr, int indent) {
return PrettyPrint(arr, indent, &std::cerr);
}

namespace {

class SchemaPrinter : public PrettyPrinter {
public:
SchemaPrinter(const Schema& schema, const PrettyPrintOptions& options,
Expand Down Expand Up @@ -709,6 +759,8 @@ Status SchemaPrinter::PrintField(const Field& field) {
return Status::OK();
}

} // namespace

Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
std::ostream* sink) {
SchemaPrinter printer(schema, options, sink);
Expand Down
157 changes: 157 additions & 0 deletions cpp/src/arrow/pretty_print_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
Expand Down Expand Up @@ -54,6 +55,7 @@ void CheckStream(const T& obj, const PrettyPrintOptions& options, const char* ex

void CheckArray(const Array& arr, const PrettyPrintOptions& options, const char* expected,
bool check_operator = true) {
ARROW_SCOPED_TRACE("For datatype: ", arr.type()->ToString());
CheckStream(arr, options, expected);

if (options.indent == 0 && check_operator) {
Expand Down Expand Up @@ -309,6 +311,161 @@ TEST_F(TestPrettyPrint, TestIntervalTypes) {
}
}

// Checks pretty-printing of date and timestamp arrays at, and just beyond, the
// range of values that can be displayed. In every 10-element case below the
// first four slots are marked invalid (expected "null"), the next four hold
// out-of-range values (expected "<value out of range: N>"), and the last two
// hold the smallest and largest values expected to print as dates.
TEST_F(TestPrettyPrint, DateTimeTypesWithOutOfRangeValues) {
// Our vendored date library allows years within [-32767, 32767],
// which limits the range of values which can be displayed.
const int32_t min_int32 = std::numeric_limits<int32_t>::min();
const int32_t max_int32 = std::numeric_limits<int32_t>::max();
const int64_t min_int64 = std::numeric_limits<int64_t>::min();
const int64_t max_int64 = std::numeric_limits<int64_t>::max();

// Inclusive bounds of the printable range for each type/unit. The date64 and
// sub-second bounds are derived from the day/second bounds so that the maximum
// is the last tick before the next day/second.
const int32_t min_date32 = -12687428;
const int32_t max_date32 = 11248737;
const int64_t min_date64 = 86400000LL * min_date32;
const int64_t max_date64 = 86400000LL * (max_date32 + 1) - 1;
const int64_t min_timestamp_seconds = -1096193779200LL;
const int64_t max_timestamp_seconds = 971890963199LL;
const int64_t min_timestamp_millis = min_timestamp_seconds * 1000;
const int64_t max_timestamp_millis = max_timestamp_seconds * 1000 + 999;
const int64_t min_timestamp_micros = min_timestamp_millis * 1000;
const int64_t max_timestamp_micros = max_timestamp_millis * 1000 + 999;

// Validity mask shared by the 10-element cases: four nulls, six valid values.
std::vector<bool> is_valid = {false, false, false, false, true,
true, true, true, true, true};

// Date32
{
std::vector<int32_t> values = {min_int32, max_int32, min_date32 - 1, max_date32 + 1,
min_int32, max_int32, min_date32 - 1, max_date32 + 1,
min_date32, max_date32};
static const char* expected = R"expected([
null,
null,
null,
null,
<value out of range: -2147483648>,
<value out of range: 2147483647>,
<value out of range: -12687429>,
<value out of range: 11248738>,
-32767-01-01,
32767-12-31
])expected";
CheckPrimitive<Date32Type, int32_t>({0, 10}, is_valid, values, expected);
}

// Date64
{
std::vector<int64_t> values = {min_int64, max_int64, min_date64 - 1, max_date64 + 1,
min_int64, max_int64, min_date64 - 1, max_date64 + 1,
min_date64, max_date64};
static const char* expected = R"expected([
null,
null,
null,
null,
<value out of range: -9223372036854775808>,
<value out of range: 9223372036854775807>,
<value out of range: -1096193779200001>,
<value out of range: 971890963200000>,
-32767-01-01,
32767-12-31
])expected";
CheckPrimitive<Date64Type, int64_t>({0, 10}, is_valid, values, expected);
}

// TODO time32, time64

// Timestamp, second resolution
{
std::vector<int64_t> values = {min_int64,
max_int64,
min_timestamp_seconds - 1,
max_timestamp_seconds + 1,
min_int64,
max_int64,
min_timestamp_seconds - 1,
max_timestamp_seconds + 1,
min_timestamp_seconds,
max_timestamp_seconds};
static const char* expected = R"expected([
null,
null,
null,
null,
<value out of range: -9223372036854775808>,
<value out of range: 9223372036854775807>,
<value out of range: -1096193779201>,
<value out of range: 971890963200>,
-32767-01-01 00:00:00,
32767-12-31 23:59:59
])expected";
CheckPrimitive<TimestampType, int64_t>(timestamp(TimeUnit::SECOND), {0, 10}, is_valid,
values, expected);
}
// Timestamp, millisecond resolution
{
std::vector<int64_t> values = {min_int64,
max_int64,
min_timestamp_millis - 1,
max_timestamp_millis + 1,
min_int64,
max_int64,
min_timestamp_millis - 1,
max_timestamp_millis + 1,
min_timestamp_millis,
max_timestamp_millis};
static const char* expected = R"expected([
null,
null,
null,
null,
<value out of range: -9223372036854775808>,
<value out of range: 9223372036854775807>,
<value out of range: -1096193779200001>,
<value out of range: 971890963200000>,
-32767-01-01 00:00:00.000,
32767-12-31 23:59:59.999
])expected";
CheckPrimitive<TimestampType, int64_t>(timestamp(TimeUnit::MILLI), {0, 10}, is_valid,
values, expected);
}
// Timestamp, microsecond resolution
{
std::vector<int64_t> values = {min_int64,
max_int64,
min_timestamp_micros - 1,
max_timestamp_micros + 1,
min_int64,
max_int64,
min_timestamp_micros - 1,
max_timestamp_micros + 1,
min_timestamp_micros,
max_timestamp_micros};
static const char* expected = R"expected([
null,
null,
null,
null,
<value out of range: -9223372036854775808>,
<value out of range: 9223372036854775807>,
<value out of range: -1096193779200000001>,
<value out of range: 971890963200000000>,
-32767-01-01 00:00:00.000000,
32767-12-31 23:59:59.999999
])expected";
CheckPrimitive<TimestampType, int64_t>(timestamp(TimeUnit::MICRO), {0, 10}, is_valid,
values, expected);
}
#ifndef ARROW_UBSAN
// While the values below are legal and correct, they trigger an internal
// signed overflow inside the arrow_vendored::date library.
// Timestamp, nanosecond resolution: the int64 extremes are expected to print
// as dates rather than as out-of-range placeholders.
{
std::vector<int64_t> values = {min_int64, max_int64};
static const char* expected = R"expected([
1677-09-21 00:12:43.145224192,
2262-04-11 23:47:16.854775807
])expected";
CheckPrimitive<TimestampType, int64_t>(timestamp(TimeUnit::NANO), {0, 10},
{true, true}, values, expected);
}
#endif
}

TEST_F(TestPrettyPrint, StructTypeBasic) {
auto simple_1 = field("one", int32());
auto simple_2 = field("two", int32());
Expand Down