diff --git a/velox/vector/VectorPrinter.cpp b/velox/vector/VectorPrinter.cpp index eab9e92149194..f0eff3ca0648e 100644 --- a/velox/vector/VectorPrinter.cpp +++ b/velox/vector/VectorPrinter.cpp @@ -39,11 +39,11 @@ std::string printFixedWidth( return base->toString(baseIndex); } -class VectorPrinter { +class VectorPrinterBase { public: - explicit VectorPrinter(const BaseVector& vector) : decoded_{vector} {} + explicit VectorPrinterBase(const BaseVector& vector) : decoded_{vector} {} - virtual ~VectorPrinter() = default; + virtual ~VectorPrinterBase() = default; std::string summarize(vector_size_t index) const { if (decoded_.isNullAt(index)) { @@ -75,15 +75,16 @@ class VectorPrinter { virtual std::string summarizeNonNull(vector_size_t index) const = 0; DecodedVector decoded_; - std::vector> children_; + std::vector> children_; }; -std::unique_ptr createVectorPrinter(const BaseVector& vector); +std::unique_ptr createVectorPrinter( + const BaseVector& vector); -class PrimitiveVectorPrinter : public VectorPrinter { +class PrimitiveVectorPrinter : public VectorPrinterBase { public: explicit PrimitiveVectorPrinter(const BaseVector& vector) - : VectorPrinter(vector) {} + : VectorPrinterBase(vector) {} protected: std::string printNonNull(vector_size_t index, const std::string& indent) @@ -109,10 +110,10 @@ class PrimitiveVectorPrinter : public VectorPrinter { } }; -class ArrayVectorPrinter : public VectorPrinter { +class ArrayVectorPrinter : public VectorPrinterBase { public: explicit ArrayVectorPrinter(const BaseVector& vector) - : VectorPrinter(vector) { + : VectorPrinterBase(vector) { auto* arrayVector = decoded_.base()->as(); children_.emplace_back(createVectorPrinter(*arrayVector->elements())); } @@ -157,9 +158,10 @@ class ArrayVectorPrinter : public VectorPrinter { } }; -class MapVectorPrinter : public VectorPrinter { +class MapVectorPrinter : public VectorPrinterBase { public: - explicit MapVectorPrinter(const BaseVector& vector) : VectorPrinter(vector) { + explicit 
MapVectorPrinter(const BaseVector& vector) + : VectorPrinterBase(vector) { auto* mapVector = decoded_.base()->as(); children_.emplace_back(createVectorPrinter(*mapVector->mapKeys())); children_.emplace_back(createVectorPrinter(*mapVector->mapValues())); @@ -217,9 +219,10 @@ class MapVectorPrinter : public VectorPrinter { } }; -class RowVectorPrinter : public VectorPrinter { +class RowVectorPrinter : public VectorPrinterBase { public: - explicit RowVectorPrinter(const BaseVector& vector) : VectorPrinter(vector) { + explicit RowVectorPrinter(const BaseVector& vector) + : VectorPrinterBase(vector) { auto* rowVector = decoded_.base()->as(); for (const auto& child : rowVector->children()) { children_.emplace_back(createVectorPrinter(*child)); @@ -249,7 +252,8 @@ class RowVectorPrinter : public VectorPrinter { } }; -std::unique_ptr createVectorPrinter(const BaseVector& vector) { +std::unique_ptr createVectorPrinter( + const BaseVector& vector) { switch (vector.typeKind()) { case TypeKind::ARRAY: return std::make_unique(vector); @@ -395,4 +399,244 @@ std::string printVector( return out.str(); } +namespace { +class VectorVisitor { + public: + struct Context { + VectorPrinter::Options options; + + std::stringstream text; + + int32_t indent{0}; + + bool skipTopSummary{false}; + + // Vector name if a child of a RowVector. + std::optional name; + + // Node ID in the format A.B.C.D, where each component is an index of the + // node in the corresponding layer of the hierarchy. + std::string parentNodeId; + + size_t nodeId{0}; + }; + + void visit(const BaseVector& vector, Context& ctx) { + const auto parentNodeId = ctx.parentNodeId; + const auto nodeId = ctx.nodeId; + const auto name = ctx.name; + + ctx.parentNodeId = ctx.parentNodeId.empty() + ? 
std::to_string(ctx.nodeId) + : fmt::format("{}.{}", parentNodeId, ctx.nodeId); + + if (ctx.skipTopSummary) { + ctx.skipTopSummary = false; + } else { + ctx.text << toIndentation(ctx.indent); + if (ctx.options.includeNodeIds) { + ctx.text << ctx.parentNodeId << " "; + } + ctx.text << toSummaryString(vector, ctx) << std::endl; + } + + ctx.nodeId = 0; + ctx.name.reset(); + ctx.indent++; + + SCOPE_EXIT { + ctx.parentNodeId = parentNodeId; + ctx.nodeId = nodeId; + ctx.name = name; + ctx.indent--; + }; + + switch (vector.encoding()) { + case VectorEncoding::Simple::FLAT: + break; + case VectorEncoding::Simple::ARRAY: + visitArrayVector(*vector.as(), ctx); + break; + case VectorEncoding::Simple::MAP: + visitMapVector(*vector.as(), ctx); + break; + case VectorEncoding::Simple::ROW: + visitRowVector(*vector.as(), ctx); + break; + case VectorEncoding::Simple::DICTIONARY: + visitDictionaryVector(vector, ctx); + break; + case VectorEncoding::Simple::CONSTANT: + visitConstantVector(vector, ctx); + break; + default: + VELOX_NYI(); + } + } + + private: + static std::string toIndentation(int32_t indent) { + static constexpr auto kIndentSize = 3; + + return std::string(indent * kIndentSize, ' '); + } + + static std::string truncate(const std::string& str, size_t maxLen = 50) { + return str.substr(0, maxLen); + } + + static std::string toSummaryString(const BaseVector& vector, Context& ctx) { + std::stringstream summary; + summary << vector.type()->toSummaryString(ctx.options.types); + summary << " " << vector.size() << " rows"; + + summary << " " << VectorEncoding::mapSimpleToName(vector.encoding()); + summary << " " << succinctBytes(vector.retainedSize()); + + if (ctx.name.has_value()) { + summary << " " << truncate(ctx.name.value()); + } + return summary.str(); + } + + // Computes basic statistics about integers: min, max, avg. 
+ class IntegerStats { + public: + void add(int64_t value) { + min_ = std::min(min_, value); + max_ = std::max(max_, value); + sum_ += value; + ++cnt_; + } + + int64_t min() const { + return min_; + } + + int64_t max() const { + return max_; + } + + int64_t count() const { + return cnt_; + } + + double avg() const { + return cnt_ > 0 ? (sum_ / cnt_) : 0; + } + + private: + int64_t min_{std::numeric_limits::max()}; + int64_t max_{std::numeric_limits::min()}; + size_t cnt_{0}; + double sum_{0.0}; + }; + + static void appendArrayStats(const ArrayVectorBase& base, Context& ctx) { + size_t numNulls = 0; + size_t numEmpty = 0; + IntegerStats sizeStats; + + for (auto i = 0; i < base.size(); ++i) { + if (base.isNullAt(i)) { + ++numNulls; + } else if (base.sizeAt(i) == 0) { + ++numEmpty; + } else { + sizeStats.add(base.sizeAt(i)); + } + } + + const auto indent = toIndentation(ctx.indent + 1); + ctx.text << indent << "Stats: " << numNulls << " nulls, " << numEmpty + << " empty"; + + if (sizeStats.count() > 0) { + if (sizeStats.min() == sizeStats.max()) { + ctx.text << ", sizes: " << sizeStats.min(); + } else { + ctx.text << ", sizes: [" << sizeStats.min() << "..." 
<< sizeStats.max() + << ", avg " << (int)sizeStats.avg() << "]"; + } + } + + ctx.text << std::endl; + } + + void visitArrayVector(const ArrayVector& vector, Context& ctx) { + appendArrayStats(vector, ctx); + + visit(*vector.elements(), ctx); + } + + void visitMapVector(const MapVector& vector, Context& ctx) { + appendArrayStats(vector, ctx); + + visit(*vector.mapKeys(), ctx); + + ctx.nodeId++; + visit(*vector.mapValues(), ctx); + } + + void visitRowVector(const RowVector& vector, Context& ctx) { + const auto& rowType = vector.type()->asRow(); + const auto cnt = + std::min(ctx.options.maxChildren, vector.childrenSize()); + for (size_t i = 0; i < cnt; ++i) { + if (ctx.options.includeChildNames) { + ctx.name = rowType.nameOf(i); + } + + visit(*vector.childAt(i), ctx); + ctx.nodeId++; + } + ctx.name.reset(); + + if (vector.childrenSize() > cnt) { + ctx.text << toIndentation(ctx.indent) << "..." + << (vector.childrenSize() - cnt) << " more" << std::endl; + } + } + + void visitDictionaryVector(const BaseVector& vector, Context& ctx) { + size_t numNulls = 0; + std::unordered_set uniqueIndices; + + const auto* rawIndices = vector.wrapInfo()->as(); + for (auto i = 0; i < vector.size(); ++i) { + if (vector.isNullAt(i)) { + ++numNulls; + } else { + uniqueIndices.insert(rawIndices[i]); + } + } + + ctx.text << toIndentation(ctx.indent + 1) << "Stats: " << numNulls + << " nulls, " << uniqueIndices.size() << " unique" << std::endl; + + visit(*vector.valueVector(), ctx); + } + + void visitConstantVector(const BaseVector& vector, Context& ctx) { + if (vector.valueVector() != nullptr) { + visit(*vector.valueVector(), ctx); + } + } +}; +} // namespace + +// static +std::string VectorPrinter::summarizeToText( + const BaseVector& vector, + const Options& options) { + VectorVisitor::Context ctx; + ctx.options = options; + ctx.skipTopSummary = options.skipTopSummary; + ctx.indent = options.indent; + + VectorVisitor visitor; + visitor.visit(vector, ctx); + return ctx.text.str(); +} + } // 
namespace facebook::velox diff --git a/velox/vector/VectorPrinter.h b/velox/vector/VectorPrinter.h index 83aeffce714b3..74b7a251330c4 100644 --- a/velox/vector/VectorPrinter.h +++ b/velox/vector/VectorPrinter.h @@ -40,4 +40,73 @@ std::string printVector( const BaseVector& vector, const SelectivityVector& rows); +class VectorPrinter { + public: + struct Options { + // Options that control summarization of types. + velox::Type::TypeSummaryOptions types; + + // Maximum number of child vectors to include in the summary. + size_t maxChildren{5}; + + // Whether to include the names of the RowVector child vectors. + bool includeChildNames{false}; + + // Whether to include unique IDs for each node in the vector hierarchy. + bool includeNodeIds{false}; + + // Optional indent to add to all lines. Useful when embedding the output of + // the printer into some other text. Each indentation is 3 spaces: 0 means + // no indentation, 1 means 3 spaces, 2 means 6 spaces. + int32_t indent{0}; + + // Whether to skip printing the summary of the top level vector. Similar to + // 'indent', useful when embedding the output of the printer into some other + // text. + bool skipTopSummary{false}; + + // Workaround for compiler error: default member initializer for + // 'maxChildren' needed within definition of enclosing class 'VectorPrinter' + // outside of member functions + // + // static std::string summarizeToText( + // const BaseVector& vector, + // Options options = {}); + static Options defaultOptions() { + return {}; + } + }; + + // Returns a summary of the vector in human-readable text format. + // + // Prints a hierarchy of vectors with up to options.maxChildren children at + // each level. For each vector, prints a header that includes the data type, + // number of rows, encoding, and retained size. 
+ // + // For example, + // + // A flat vector of integers: + // + // INTEGER 8 rows FLAT 32B + // + // A flat array of integers with 3 null arrays, 1 empty array and the rest + // of arrays of average size 3: + // + // ARRAY 8 rows ARRAY 288B + // Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3] + // BIGINT 12 rows FLAT 128B + // + // A dictionary over map with 4 unique maps: + // + // MAP 8 rows DICTIONARY 192B + // Stats: 0 nulls, 4 unique + // MAP 4 rows MAP 160B + // Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2] + // INTEGER 8 rows FLAT 32B + // REAL 8 rows FLAT 32B + static std::string summarizeToText( + const BaseVector& vector, + const Options& options = Options::defaultOptions()); +}; + } // namespace facebook::velox diff --git a/velox/vector/tests/CMakeLists.txt b/velox/vector/tests/CMakeLists.txt index 0895297644770..c47ccec4d360b 100644 --- a/velox/vector/tests/CMakeLists.txt +++ b/velox/vector/tests/CMakeLists.txt @@ -60,6 +60,7 @@ target_link_libraries( Boost::system GTest::gtest GTest::gtest_main + GTest::gmock Folly::folly gflags::gflags glog::glog diff --git a/velox/vector/tests/VectorPrinterTest.cpp b/velox/vector/tests/VectorPrinterTest.cpp index bbc5436f15ec3..72f53c75cc069 100644 --- a/velox/vector/tests/VectorPrinterTest.cpp +++ b/velox/vector/tests/VectorPrinterTest.cpp @@ -17,6 +17,8 @@ #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorTestBase.h" +#include + namespace facebook::velox::test { class VectorPrinterTest : public testing::Test, public VectorTestBase { @@ -24,10 +26,21 @@ class VectorPrinterTest : public testing::Test, public VectorTestBase { static void SetUpTestCase() { memory::MemoryManager::testingSetInstance({}); } + + static std::vector summarizeToLines( + const BaseVector& vector, + const VectorPrinter::Options& options = {}) { + std::vector lines; + folly::split('\n', VectorPrinter::summarizeToText(vector, options), lines); + if (lines.back().empty()) { + lines.pop_back(); + } + return 
lines; + } }; // Sanity check that printVector doesn't fail or crash. -TEST_F(VectorPrinterTest, basic) { +TEST_F(VectorPrinterTest, printVectorFuzz) { VectorFuzzer::Options options; options.vectorSize = 100; options.nullRatio = 0.1; @@ -50,11 +63,94 @@ TEST_F(VectorPrinterTest, basic) { } } -TEST_F(VectorPrinterTest, map) { +TEST_F(VectorPrinterTest, printVectorMap) { auto data = makeMapVector({ {}, {{1, 10}}, }); ASSERT_NO_THROW(printVector(*data)); } + +TEST_F(VectorPrinterTest, summarizeToText) { + auto data = makeRowVector( + {"a", "b", "c", "d"}, + { + makeFlatVector({1, 2, 3, 4, 5, 6, 7, 8}), + makeArrayVectorFromJson({ + "[1, 2, 3]", + "[4, 5]", + "null", + "[6, 7, 8, 9]", + "null", + "null", + "[10, 11, 12]", + "[]", + }), + wrapInDictionary( + makeIndices({0, 0, 0, 1, 2, 2, 3, 3}), + makeMapVectorFromJson({ + "{1: 1.0}", + "{3: 3.0, 4: 4.0, 5: 5.0}", + "{}", + "{6: 6.0, 7: 7.0, 8: 8.0, 9: 9.0}", + })), + makeConstant("hello", 8), + }); + + EXPECT_THAT( + summarizeToLines(*data), + ::testing::ElementsAre( + ::testing::MatchesRegex("ROW\\(4\\) 8 rows ROW [0-9]+.*"), + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" ARRAY 8 rows ARRAY [0-9]+.*"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" BIGINT 12 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" MAP 8 rows DICTIONARY [0-9]+.*"), + " Stats: 0 nulls, 4 unique", + ::testing::MatchesRegex(" MAP 4 rows MAP [0-9]+.*"), + " Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2]", + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" REAL 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" VARCHAR 8 rows CONSTANT [0-9]+.*"))); + + EXPECT_THAT( + summarizeToLines( + *data, {.includeChildNames = true, .includeNodeIds = true}), + ::testing::ElementsAre( + ::testing::MatchesRegex("0 ROW\\(4\\) 8 rows ROW [0-9]+.*"), + ::testing::MatchesRegex(" 0.0 INTEGER 8 rows FLAT [0-9]+.* a"), + ::testing::MatchesRegex(" 0.1 ARRAY 
8 rows ARRAY [0-9]+.* b"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" 0.1.0 BIGINT 12 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" 0.2 MAP 8 rows DICTIONARY [0-9]+.* c"), + " Stats: 0 nulls, 4 unique", + ::testing::MatchesRegex(" 0.2.0 MAP 4 rows MAP [0-9]+.*"), + " Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2]", + ::testing::MatchesRegex( + " 0.2.0.0 INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" 0.2.0.1 REAL 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex( + " 0.3 VARCHAR 8 rows CONSTANT [0-9]+.* d"))); + + EXPECT_THAT( + summarizeToLines(*data, {.maxChildren = 2}), + ::testing::ElementsAre( + ::testing::MatchesRegex("ROW\\(4\\) 8 rows ROW [0-9]+.*"), + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" ARRAY 8 rows ARRAY [0-9]+.*"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" BIGINT 12 rows FLAT [0-9]+.*"), + " ...2 more")); + + EXPECT_THAT( + summarizeToLines( + *data, {.maxChildren = 2, .indent = 2, .skipTopSummary = true}), + ::testing::ElementsAre( + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" ARRAY 8 rows ARRAY [0-9]+.*"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" BIGINT 12 rows FLAT [0-9]+.*"), + " ...2 more")); +} + } // namespace facebook::velox::test