Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Group histogram values by encoding type and sort by storage size #108

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 93 additions & 50 deletions dwio/nimble/tools/NimbleDumpLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
* limitations under the License.
*/
#include <algorithm>
#include <fstream>
#include <locale>
#include <numeric>
#include <ostream>
Expand All @@ -23,7 +22,6 @@

#include "common/strings/Zstd.h"
#include "dwio/common/filesystem/FileSystem.h"
#include "dwio/nimble/common/EncodingPrimitives.h"
#include "dwio/nimble/common/FixedBitArray.h"
#include "dwio/nimble/common/Types.h"
#include "dwio/nimble/encodings/EncodingFactory.h"
Expand Down Expand Up @@ -56,15 +54,20 @@ struct GroupingKey {
std::optional<CompressionType> compressinType;
};

struct GroupingKeyCompare {
size_t operator()(const GroupingKey& lhs, const GroupingKey& rhs) const {
if (lhs.encodingType != rhs.encodingType) {
return lhs.encodingType < rhs.encodingType;
} else if (lhs.dataType != rhs.dataType) {
return lhs.dataType < rhs.dataType;
} else {
return lhs.compressinType < rhs.compressinType;
}
struct GroupingKeyHash {
size_t operator()(const GroupingKey& key) const {
size_t h1 = std::hash<EncodingType>()(key.encodingType);
size_t h2 = std::hash<DataType>()(key.dataType);
size_t h3 = std::hash<std::optional<CompressionType>>()(key.compressinType);
return h1 ^ (h2 << 1) ^ (h3 << 2);
}
};

// Equality functor paired with GroupingKeyHash: two keys are equal only when
// the encoding type, the data type and the (optional) compression type all
// match.
struct GroupingKeyEqual {
  bool operator()(const GroupingKey& lhs, const GroupingKey& rhs) const {
    if (lhs.encodingType != rhs.encodingType) {
      return false;
    }
    if (lhs.dataType != rhs.dataType) {
      return false;
    }
    return lhs.compressinType == rhs.compressinType;
  }
};

Expand All @@ -73,21 +76,46 @@ struct EncodingHistogramValue {
size_t bytes;
};

// Ordering predicate for histogram rows that are held as iterators (or any
// pointer-like handle) to key/value pairs, where the key exposes
// `encodingType`, `dataType` and `compressinType` and the value exposes
// `bytes`.
//
// Ordering:
//   1. encoding type, ascending, so rows of one encoding are adjacent;
//   2. byte count, descending, so the largest rows of an encoding come first;
//   3. data type, then compression type, ascending. These only break ties so
//      that the result does not depend on the iteration order of the
//      underlying hash map.
//
// The call operator is a template so it accepts the iterator type of the
// histogram map regardless of which hasher/equality functors that map was
// instantiated with, and it returns bool as comparison predicates are
// expected to.
struct HistogramRowCompare {
  template <typename RowIterator>
  bool operator()(const RowIterator& lhs, const RowIterator& rhs) const {
    const auto& lhsKey = lhs->first;
    const auto& rhsKey = rhs->first;
    if (lhsKey.encodingType != rhsKey.encodingType) {
      return lhsKey.encodingType < rhsKey.encodingType;
    }

    const auto lhsSize = lhs->second.bytes;
    const auto rhsSize = rhs->second.bytes;
    if (lhsSize != rhsSize) {
      // Larger entries first within the same encoding type.
      return lhsSize > rhsSize;
    }

    // Deterministic tie-breakers for rows with identical encoding and size.
    if (lhsKey.dataType != rhsKey.dataType) {
      return lhsKey.dataType < rhsKey.dataType;
    }
    return lhsKey.compressinType < rhsKey.compressinType;
  }
};

// Horizontal alignment of a table column's text within its fixed width.
// Left pads on the right of the text; Right pads on the left.
enum class Alignment { Left, Right };

class TableFormatter {
public:
TableFormatter(
std::ostream& ostream,
std::vector<std::tuple<
std::string /* Title */,
uint8_t /* Width */
uint8_t /* Width */,
Alignment /* Horizontal Alignment */
>> fields,
bool noHeader = false)
: ostream_{ostream}, fields_{std::move(fields)} {
if (!noHeader) {
ostream << YELLOW;
for (const auto& field : fields_) {
ostream << std::left << std::setw(std::get<1>(field) + 2)
<< std::get<0>(field);
ostream << (std::get<2>(field) == Alignment::Right ? std::right
: std::left)
<< std::setw(std::get<1>(field) + 2) << std::get<0>(field);
}
ostream << RESET_COLOR << std::endl;
}
Expand All @@ -96,8 +124,9 @@ class TableFormatter {
void writeRow(const std::vector<std::string>& values) {
assert(values.size() == fields_.size());
for (auto i = 0; i < values.size(); ++i) {
ostream_ << std::left << std::setw(std::get<1>(fields_[i]) + 2)
<< values[i];
ostream_ << (std::get<2>(fields_[i]) == Alignment::Right ? std::right
: std::left)
<< std::setw(std::get<1>(fields_[i]) + 2) << values[i];
}
ostream_ << std::endl;
}
Expand All @@ -106,7 +135,8 @@ class TableFormatter {
std::ostream& ostream_;
std::vector<std::tuple<
std::string /* Title */,
uint8_t /* Width */
uint8_t /* Width */,
Alignment /* Horizontal Alignment */
>>
fields_;
};
Expand Down Expand Up @@ -362,20 +392,20 @@ void NimbleDumpLib::emitStripes(bool noHeader) {
TabletReader tabletReader{*pool_, file_.get()};
TableFormatter formatter(
ostream_,
{{"Stripe Id", 11},
{"Stripe Offset", 15},
{"Stripe Size", 15},
{"Row Count", 15}},
{{"Stripe Id", 7, Alignment::Left},
{"Stripe Offset", 15, Alignment::Right},
{"Stripe Size", 15, Alignment::Right},
{"Row Count", 10, Alignment::Right}},
noHeader);
traverseTablet(*pool_, tabletReader, std::nullopt, [&](uint32_t stripeIndex) {
auto stripeIdentifier = tabletReader.getStripeIdentifier(stripeIndex);
auto sizes = tabletReader.streamSizes(stripeIdentifier);
auto stripeSize = std::accumulate(sizes.begin(), sizes.end(), 0UL);
formatter.writeRow({
folly::to<std::string>(stripeIndex),
folly::to<std::string>(tabletReader.stripeOffset(stripeIndex)),
folly::to<std::string>(stripeSize),
folly::to<std::string>(tabletReader.stripeRowCount(stripeIndex)),
commaSeparated(tabletReader.stripeOffset(stripeIndex)),
commaSeparated(stripeSize),
commaSeparated(tabletReader.stripeRowCount(stripeIndex)),
});
});
}
Expand All @@ -387,19 +417,19 @@ void NimbleDumpLib::emitStreams(
std::optional<uint32_t> stripeId) {
auto tabletReader = std::make_shared<TabletReader>(*pool_, file_.get());

std::vector<std::tuple<std::string, uint8_t>> fields;
fields.push_back({"Stripe Id", 11});
fields.push_back({"Stream Id", 11});
fields.push_back({"Stream Offset", 13});
fields.push_back({"Stream Size", 13});
std::vector<std::tuple<std::string, uint8_t, Alignment>> fields;
fields.push_back({"Stripe Id", 11, Alignment::Left});
fields.push_back({"Stream Id", 11, Alignment::Left});
fields.push_back({"Stream Offset", 13, Alignment::Left});
fields.push_back({"Stream Size", 13, Alignment::Left});
if (showStreamRawSize) {
fields.push_back({"Raw Stream Size", 16});
fields.push_back({"Raw Stream Size", 16, Alignment::Left});
}
fields.push_back({"Item Count", 13});
fields.push_back({"Item Count", 13, Alignment::Left});
if (showStreamLabels) {
fields.push_back({"Stream Label", 16});
fields.push_back({"Stream Label", 16, Alignment::Left});
}
fields.push_back({"Type", 30});
fields.push_back({"Type", 30, Alignment::Left});

TableFormatter formatter(ostream_, fields, noHeader);

Expand Down Expand Up @@ -452,7 +482,11 @@ void NimbleDumpLib::emitHistogram(
bool noHeader,
std::optional<uint32_t> stripeId) {
TabletReader tabletReader{*pool_, file_.get()};
std::map<GroupingKey, EncodingHistogramValue, GroupingKeyCompare>
std::unordered_map<
GroupingKey,
EncodingHistogramValue,
GroupingKeyHash,
GroupingKeyEqual>
encodingHistogram;
const std::unordered_map<std::string, CompressionType> compressionMap{
{toString(CompressionType::Uncompressed), CompressionType::Uncompressed},
Expand Down Expand Up @@ -499,20 +533,29 @@ void NimbleDumpLib::emitHistogram(

TableFormatter formatter(
ostream_,
{{"Encoding Type", 17},
{"Data Type", 13},
{"Compression", 15},
{"Instance Count", 15},
{"Storage Bytes", 15}},
{{"Encoding Type", 17, Alignment::Left},
{"Data Type", 13, Alignment::Left},
{"Compression", 15, Alignment::Left},
{"Instance Count", 15, Alignment::Right},
{"Storage Bytes", 15, Alignment::Right}},
noHeader);

for (auto& [key, value] : encodingHistogram) {
std::vector<
std::unordered_map<GroupingKey, EncodingHistogramValue>::const_iterator>
rows;
for (auto it = encodingHistogram.begin(); it != encodingHistogram.end();
++it) {
rows.push_back(it);
}
std::sort(rows.begin(), rows.end(), HistogramRowCompare{});

for (const auto& it : rows) {
formatter.writeRow({
toString(key.encodingType),
toString(key.dataType),
key.compressinType ? toString(*key.compressinType) : "",
folly::to<std::string>(value.count),
folly::to<std::string>(value.bytes),
toString(it->first.encodingType),
toString(it->first.dataType),
it->first.compressinType ? toString(*it->first.compressinType) : "",
commaSeparated(it->second.count),
commaSeparated(it->second.bytes),
});
}
}
Expand Down Expand Up @@ -722,11 +765,11 @@ void NimbleDumpLib::emitLayout(bool noHeader, bool compressed) {
TableFormatter formatter(
ostream_,
{
{"Node Id", 11},
{"Parent Id", 11},
{"Node Type", 15},
{"Node Name", 17},
{"Encoding Layout", 20},
{"Node Id", 11, Alignment::Left},
{"Parent Id", 11, Alignment::Left},
{"Node Type", 15, Alignment::Left},
{"Node Name", 17, Alignment::Left},
{"Encoding Layout", 20, Alignment::Left},
},
noHeader);

Expand Down
Loading