From 74bf4d6aecb354b09aec6ae13f59ffaef8a01cba Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Mon, 16 Dec 2024 03:57:37 -0800 Subject: [PATCH] feat: Add utility to print summary of a vector (#11859) Summary: Introduce VectorPrinter::summarizeToText helper function to generate human-friendly summary of a vector. The summary shows the overall hierarchy of the vector and annotates each node with data type, number of rows, encoding, and size in memory. Dictionary nodes include number of unique indices. Array and Map nodes specify number of empty arrays / maps as well as min/max/avg sizes of the arrays / maps. ``` ROW(4) 8 rows ROW 528B INTEGER 8 rows FLAT 32B ARRAY 8 rows ARRAY 288B Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3] BIGINT 12 rows FLAT 128B MAP 8 rows DICTIONARY 192B Stats: 0 nulls, 4 unique MAP 4 rows MAP 160B Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2] INTEGER 8 rows FLAT 32B REAL 8 rows FLAT 32B VARCHAR 8 rows CONSTANT 16B ``` The summary optionally includes unique node IDs to allow for easy referencing. 
``` 0 ROW(4) 8 rows ROW 528B 0.0 INTEGER 8 rows FLAT 32B 0.1 ARRAY 8 rows ARRAY 288B Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3] 0.1.0 BIGINT 12 rows FLAT 128B 0.2 MAP 8 rows DICTIONARY 192B Stats: 0 nulls, 4 unique 0.2.0 MAP 4 rows MAP 160B Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2] 0.2.0.0 INTEGER 8 rows FLAT 32B 0.2.0.1 REAL 8 rows FLAT 32B 0.3 VARCHAR 8 rows CONSTANT 16B ``` The number of RowVector children is limited to 5, but can be increased by specifying options.maxChildren: ``` ROW(4) 8 rows ROW 528B INTEGER 8 rows FLAT 32B ARRAY 8 rows ARRAY 288B Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3] BIGINT 12 rows FLAT 128B ...2 more ``` Reviewed By: xiaoxmeng Differential Revision: D67229321 --- velox/vector/VectorPrinter.cpp | 272 +++++++++++++++++++++-- velox/vector/VectorPrinter.h | 69 ++++++ velox/vector/tests/CMakeLists.txt | 1 + velox/vector/tests/VectorPrinterTest.cpp | 102 ++++++++- 4 files changed, 428 insertions(+), 16 deletions(-) diff --git a/velox/vector/VectorPrinter.cpp b/velox/vector/VectorPrinter.cpp index eab9e9214919..f0eff3ca0648 100644 --- a/velox/vector/VectorPrinter.cpp +++ b/velox/vector/VectorPrinter.cpp @@ -39,11 +39,11 @@ std::string printFixedWidth( return base->toString(baseIndex); } -class VectorPrinter { +class VectorPrinterBase { public: - explicit VectorPrinter(const BaseVector& vector) : decoded_{vector} {} + explicit VectorPrinterBase(const BaseVector& vector) : decoded_{vector} {} - virtual ~VectorPrinter() = default; + virtual ~VectorPrinterBase() = default; std::string summarize(vector_size_t index) const { if (decoded_.isNullAt(index)) { @@ -75,15 +75,16 @@ class VectorPrinter { virtual std::string summarizeNonNull(vector_size_t index) const = 0; DecodedVector decoded_; - std::vector> children_; + std::vector> children_; }; -std::unique_ptr createVectorPrinter(const BaseVector& vector); +std::unique_ptr createVectorPrinter( + const BaseVector& vector); -class PrimitiveVectorPrinter : public VectorPrinter { +class 
PrimitiveVectorPrinter : public VectorPrinterBase { public: explicit PrimitiveVectorPrinter(const BaseVector& vector) - : VectorPrinter(vector) {} + : VectorPrinterBase(vector) {} protected: std::string printNonNull(vector_size_t index, const std::string& indent) @@ -109,10 +110,10 @@ class PrimitiveVectorPrinter : public VectorPrinter { } }; -class ArrayVectorPrinter : public VectorPrinter { +class ArrayVectorPrinter : public VectorPrinterBase { public: explicit ArrayVectorPrinter(const BaseVector& vector) - : VectorPrinter(vector) { + : VectorPrinterBase(vector) { auto* arrayVector = decoded_.base()->as(); children_.emplace_back(createVectorPrinter(*arrayVector->elements())); } @@ -157,9 +158,10 @@ class ArrayVectorPrinter : public VectorPrinter { } }; -class MapVectorPrinter : public VectorPrinter { +class MapVectorPrinter : public VectorPrinterBase { public: - explicit MapVectorPrinter(const BaseVector& vector) : VectorPrinter(vector) { + explicit MapVectorPrinter(const BaseVector& vector) + : VectorPrinterBase(vector) { auto* mapVector = decoded_.base()->as(); children_.emplace_back(createVectorPrinter(*mapVector->mapKeys())); children_.emplace_back(createVectorPrinter(*mapVector->mapValues())); @@ -217,9 +219,10 @@ class MapVectorPrinter : public VectorPrinter { } }; -class RowVectorPrinter : public VectorPrinter { +class RowVectorPrinter : public VectorPrinterBase { public: - explicit RowVectorPrinter(const BaseVector& vector) : VectorPrinter(vector) { + explicit RowVectorPrinter(const BaseVector& vector) + : VectorPrinterBase(vector) { auto* rowVector = decoded_.base()->as(); for (const auto& child : rowVector->children()) { children_.emplace_back(createVectorPrinter(*child)); @@ -249,7 +252,8 @@ class RowVectorPrinter : public VectorPrinter { } }; -std::unique_ptr createVectorPrinter(const BaseVector& vector) { +std::unique_ptr createVectorPrinter( + const BaseVector& vector) { switch (vector.typeKind()) { case TypeKind::ARRAY: return 
std::make_unique(vector); @@ -395,4 +399,244 @@ std::string printVector( return out.str(); } +namespace { +class VectorVisitor { + public: + struct Context { + VectorPrinter::Options options; + + std::stringstream text; + + int32_t indent{0}; + + bool skipTopSummary{false}; + + // Vector name if a child or a RowVector. + std::optional name; + + // Node ID in the format A.B.C.D, where each component is an index of the + // node in the corresponding layer of the hierarchy. + std::string parentNodeId; + + size_t nodeId{0}; + }; + + void visit(const BaseVector& vector, Context& ctx) { + const auto parentNodeId = ctx.parentNodeId; + const auto nodeId = ctx.nodeId; + const auto name = ctx.name; + + ctx.parentNodeId = ctx.parentNodeId.empty() + ? std::to_string(ctx.nodeId) + : fmt::format("{}.{}", parentNodeId, ctx.nodeId); + + if (ctx.skipTopSummary) { + ctx.skipTopSummary = false; + } else { + ctx.text << toIndentation(ctx.indent); + if (ctx.options.includeNodeIds) { + ctx.text << ctx.parentNodeId << " "; + } + ctx.text << toSummaryString(vector, ctx) << std::endl; + } + + ctx.nodeId = 0; + ctx.name.reset(); + ctx.indent++; + + SCOPE_EXIT { + ctx.parentNodeId = parentNodeId; + ctx.nodeId = nodeId; + ctx.name = name; + ctx.indent--; + }; + + switch (vector.encoding()) { + case VectorEncoding::Simple::FLAT: + break; + case VectorEncoding::Simple::ARRAY: + visitArrayVector(*vector.as(), ctx); + break; + case VectorEncoding::Simple::MAP: + visitMapVector(*vector.as(), ctx); + break; + case VectorEncoding::Simple::ROW: + visitRowVector(*vector.as(), ctx); + break; + case VectorEncoding::Simple::DICTIONARY: + visitDictionaryVector(vector, ctx); + break; + case VectorEncoding::Simple::CONSTANT: + visitConstantVector(vector, ctx); + break; + default: + VELOX_NYI(); + } + } + + private: + static std::string toIndentation(int32_t indent) { + static constexpr auto kIndentSize = 3; + + return std::string(indent * kIndentSize, ' '); + } + + static std::string truncate(const 
std::string& str, size_t maxLen = 50) { + return str.substr(0, maxLen); + } + + static std::string toSummaryString(const BaseVector& vector, Context& ctx) { + std::stringstream summary; + summary << vector.type()->toSummaryString(ctx.options.types); + summary << " " << vector.size() << " rows"; + + summary << " " << VectorEncoding::mapSimpleToName(vector.encoding()); + summary << " " << succinctBytes(vector.retainedSize()); + + if (ctx.name.has_value()) { + summary << " " << truncate(ctx.name.value()); + } + return summary.str(); + } + + // Computes basic statistics about integers: min, max, avg. + class IntegerStats { + public: + void add(int64_t value) { + min_ = std::min(min_, value); + max_ = std::max(max_, value); + sum_ += value; + ++cnt_; + } + + int64_t min() const { + return min_; + } + + int64_t max() const { + return max_; + } + + int64_t count() const { + return cnt_; + } + + double avg() const { + return cnt_ > 0 ? (sum_ / cnt_) : 0; + } + + private: + int64_t min_{std::numeric_limits::max()}; + int64_t max_{std::numeric_limits::min()}; + size_t cnt_{0}; + double sum_{0.0}; + }; + + static void appendArrayStats(const ArrayVectorBase& base, Context& ctx) { + size_t numNulls = 0; + size_t numEmpty = 0; + IntegerStats sizeStats; + + for (auto i = 0; i < base.size(); ++i) { + if (base.isNullAt(i)) { + ++numNulls; + } else if (base.sizeAt(i) == 0) { + ++numEmpty; + } else { + sizeStats.add(base.sizeAt(i)); + } + } + + const auto indent = toIndentation(ctx.indent + 1); + ctx.text << indent << "Stats: " << numNulls << " nulls, " << numEmpty + << " empty"; + + if (sizeStats.count() > 0) { + if (sizeStats.min() == sizeStats.max()) { + ctx.text << ", sizes: " << sizeStats.min(); + } else { + ctx.text << ", sizes: [" << sizeStats.min() << "..." 
<< sizeStats.max() + << ", avg " << (int)sizeStats.avg() << "]"; + } + } + + ctx.text << std::endl; + } + + void visitArrayVector(const ArrayVector& vector, Context& ctx) { + appendArrayStats(vector, ctx); + + visit(*vector.elements(), ctx); + } + + void visitMapVector(const MapVector& vector, Context& ctx) { + appendArrayStats(vector, ctx); + + visit(*vector.mapKeys(), ctx); + + ctx.nodeId++; + visit(*vector.mapValues(), ctx); + } + + void visitRowVector(const RowVector& vector, Context& ctx) { + const auto& rowType = vector.type()->asRow(); + const auto cnt = + std::min(ctx.options.maxChildren, vector.childrenSize()); + for (size_t i = 0; i < cnt; ++i) { + if (ctx.options.includeChildNames) { + ctx.name = rowType.nameOf(i); + } + + visit(*vector.childAt(i), ctx); + ctx.nodeId++; + } + ctx.name.reset(); + + if (vector.childrenSize() > cnt) { + ctx.text << toIndentation(ctx.indent) << "..." + << (vector.childrenSize() - cnt) << " more" << std::endl; + } + } + + void visitDictionaryVector(const BaseVector& vector, Context& ctx) { + size_t numNulls = 0; + std::unordered_set uniqueIndices; + + const auto* rawIndices = vector.wrapInfo()->as(); + for (auto i = 0; i < vector.size(); ++i) { + if (vector.isNullAt(i)) { + ++numNulls; + } else { + uniqueIndices.insert(rawIndices[i]); + } + } + + ctx.text << toIndentation(ctx.indent + 1) << "Stats: " << numNulls + << " nulls, " << uniqueIndices.size() << " unique" << std::endl; + + visit(*vector.valueVector(), ctx); + } + + void visitConstantVector(const BaseVector& vector, Context& ctx) { + if (vector.valueVector() != nullptr) { + visit(*vector.valueVector(), ctx); + } + } +}; +} // namespace + +// static +std::string VectorPrinter::summarizeToText( + const BaseVector& vector, + const Options& options) { + VectorVisitor::Context ctx; + ctx.options = options; + ctx.skipTopSummary = options.skipTopSummary; + ctx.indent = options.indent; + + VectorVisitor visitor; + visitor.visit(vector, ctx); + return ctx.text.str(); +} + } // 
namespace facebook::velox diff --git a/velox/vector/VectorPrinter.h b/velox/vector/VectorPrinter.h index 83aeffce714b..74b7a251330c 100644 --- a/velox/vector/VectorPrinter.h +++ b/velox/vector/VectorPrinter.h @@ -40,4 +40,73 @@ std::string printVector( const BaseVector& vector, const SelectivityVector& rows); +class VectorPrinter { + public: + struct Options { + // Options that control summarization of types. + velox::Type::TypeSummaryOptions types; + + // Maximum number of child vectors to include in the summary. + size_t maxChildren{5}; + + // Whether to include the names of the RowVector child vectors. + bool includeChildNames{false}; + + // Whether to include unique IDs for each node in the vector hierarchy. + bool includeNodeIds{false}; + + // Optional indent to add to all lines. Useful when embedding the output of + // the printer into some other text. Each indentation is 3 spaces. 0 means + // no indentation. 1 - 3 spaces. 2 - 6 spaces. + int32_t indent{0}; + + // Whether to skip printing the summary of the top level vector. Similar to + // 'indent', useful when embedding the output of the printer into some other + // text. + bool skipTopSummary{false}; + + // Workaround for compiler error: default member initializer for + // 'maxChildren' needed within definition of enclosing class 'VectorPrinter' + // outside of member functions + // + // static std::string summarizeToText( + // const BaseVector& vector, + // Options options = {}); + static Options defaultOptions() { + return {}; + } + }; + + // Returns a summary of the vector in human-readable text format. + // + // Prints a hierarchy of vectors with up to options.maxChildren children at + // each level. For each vector, prints a header that includes the data type, + // number of rows, encoding. 
+ // + // For example, + // + // A flat vector of integers: + // + // INTEGER 8 rows FLAT 32B + // + // A flat array of integers with 3 null arrays, 1 empty array and the rest + // of arrays of average size 3: + // + // ARRAY 8 rows ARRAY 288B + // Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3] + // BIGINT 12 rows FLAT 128B + // + // A dictionary over map with 4 unique maps: + // + // MAP 8 rows DICTIONARY 192B + // Stats: 0 nulls, 4 unique + // MAP 4 rows MAP 160B + // Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2] + // INTEGER 8 rows FLAT 32B + // REAL 8 rows FLAT 32B + static std::string summarizeToText( + const BaseVector& vector, + const Options& options = Options::defaultOptions()); +}; + } // namespace facebook::velox diff --git a/velox/vector/tests/CMakeLists.txt b/velox/vector/tests/CMakeLists.txt index 089529764477..c47ccec4d360 100644 --- a/velox/vector/tests/CMakeLists.txt +++ b/velox/vector/tests/CMakeLists.txt @@ -60,6 +60,7 @@ target_link_libraries( Boost::system GTest::gtest GTest::gtest_main + GTest::gmock Folly::folly gflags::gflags glog::glog diff --git a/velox/vector/tests/VectorPrinterTest.cpp b/velox/vector/tests/VectorPrinterTest.cpp index bbc5436f15ec..18543aef9272 100644 --- a/velox/vector/tests/VectorPrinterTest.cpp +++ b/velox/vector/tests/VectorPrinterTest.cpp @@ -17,6 +17,8 @@ #include "velox/vector/fuzzer/VectorFuzzer.h" #include "velox/vector/tests/utils/VectorTestBase.h" +#include + namespace facebook::velox::test { class VectorPrinterTest : public testing::Test, public VectorTestBase { @@ -24,10 +26,21 @@ class VectorPrinterTest : public testing::Test, public VectorTestBase { static void SetUpTestCase() { memory::MemoryManager::testingSetInstance({}); } + + static std::vector summarizeToLines( + const BaseVector& vector, + const VectorPrinter::Options& options = {}) { + std::vector lines; + folly::split('\n', VectorPrinter::summarizeToText(vector, options), lines); + if (lines.back().empty()) { + lines.pop_back(); + } + return 
lines; + } }; // Sanity check that printVector doesn't fail or crash. -TEST_F(VectorPrinterTest, basic) { +TEST_F(VectorPrinterTest, printVectorFuzz) { VectorFuzzer::Options options; options.vectorSize = 100; options.nullRatio = 0.1; @@ -50,11 +63,96 @@ TEST_F(VectorPrinterTest, basic) { } } -TEST_F(VectorPrinterTest, map) { +TEST_F(VectorPrinterTest, printVectorMap) { auto data = makeMapVector({ {}, {{1, 10}}, }); ASSERT_NO_THROW(printVector(*data)); } + +TEST_F(VectorPrinterTest, summarizeToText) { + auto data = makeRowVector( + {"a", "b", "c", "d"}, + { + makeFlatVector({1, 2, 3, 4, 5, 6, 7, 8}), + makeArrayVectorFromJson({ + "[1, 2, 3]", + "[4, 5]", + "null", + "[6, 7, 8, 9]", + "null", + "null", + "[10, 11, 12]", + "[]", + }), + wrapInDictionary( + makeIndices({0, 0, 0, 1, 2, 2, 3, 3}), + makeMapVectorFromJson({ + "{1: 1.0}", + "{3: 3.0, 4: 4.0, 5: 5.0}", + "{}", + "{6: 6.0, 7: 7.0, 8: 8.0, 9: 9.0}", + })), + makeConstant("hello", 8), + }); + + EXPECT_THAT( + summarizeToLines(*data), + ::testing::ElementsAre( + ::testing::MatchesRegex("ROW\\(4\\) 8 rows ROW [0-9]+.*"), + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" ARRAY 8 rows ARRAY [0-9]+.*"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" BIGINT 12 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" MAP 8 rows DICTIONARY [0-9]+.*"), + " Stats: 0 nulls, 4 unique", + ::testing::MatchesRegex(" MAP 4 rows MAP [0-9]+.*"), + " Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2]", + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" REAL 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" VARCHAR 8 rows CONSTANT [0-9]+.*"))); + + EXPECT_THAT( + summarizeToLines( + *data, + {.types = {}, .includeChildNames = true, .includeNodeIds = true}), + ::testing::ElementsAre( + ::testing::MatchesRegex("0 ROW\\(4\\) 8 rows ROW [0-9]+.*"), + ::testing::MatchesRegex(" 0.0 INTEGER 8 rows FLAT [0-9]+.* a"), + 
::testing::MatchesRegex(" 0.1 ARRAY 8 rows ARRAY [0-9]+.* b"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" 0.1.0 BIGINT 12 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" 0.2 MAP 8 rows DICTIONARY [0-9]+.* c"), + " Stats: 0 nulls, 4 unique", + ::testing::MatchesRegex(" 0.2.0 MAP 4 rows MAP [0-9]+.*"), + " Stats: 0 nulls, 1 empty, sizes: [1...4, avg 2]", + ::testing::MatchesRegex( + " 0.2.0.0 INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" 0.2.0.1 REAL 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex( + " 0.3 VARCHAR 8 rows CONSTANT [0-9]+.* d"))); + + EXPECT_THAT( + summarizeToLines(*data, {.types = {}, .maxChildren = 2}), + ::testing::ElementsAre( + ::testing::MatchesRegex("ROW\\(4\\) 8 rows ROW [0-9]+.*"), + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" ARRAY 8 rows ARRAY [0-9]+.*"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" BIGINT 12 rows FLAT [0-9]+.*"), + " ...2 more")); + + EXPECT_THAT( + summarizeToLines( + *data, + {.types = {}, .maxChildren = 2, .indent = 2, .skipTopSummary = true}), + ::testing::ElementsAre( + ::testing::MatchesRegex(" INTEGER 8 rows FLAT [0-9]+.*"), + ::testing::MatchesRegex(" ARRAY 8 rows ARRAY [0-9]+.*"), + " Stats: 3 nulls, 1 empty, sizes: [2...4, avg 3]", + ::testing::MatchesRegex(" BIGINT 12 rows FLAT [0-9]+.*"), + " ...2 more")); +} + } // namespace facebook::velox::test