Skip to content

Commit

Permalink
Combine multiple layers of dictionaries when writing flatmap column (#…
Browse files Browse the repository at this point in the history
…114)

Summary:
Pull Request resolved: #114

The current writer cannot handle dictionary around flat map column.  Fix this case by push the dictionary to flat map values, so they can be potentially encoded with `ArrayWithOffsets`.

Reviewed By: xiaoxmeng, HuamengJiang

Differential Revision: D66992657
  • Loading branch information
Yuhta authored and facebook-github-bot committed Dec 12, 2024
1 parent cc6724a commit e692c24
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 3 deletions.
8 changes: 6 additions & 2 deletions dwio/nimble/velox/FieldWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -851,8 +851,12 @@ class FlatMapFieldWriter : public FieldWriter {
folly::Executor* executor = nullptr) override {
// Check if the vector received is already flattened
const auto isFlatMap = vector->type()->kind() == velox::TypeKind::ROW;
isFlatMap ? ingestFlattenedMap(vector, ranges)
: ingestMap(vector, ranges, executor);
if (isFlatMap) {
ingestFlattenedMap(
velox::RowVector::pushDictionaryToRowVectorLeaves(vector), ranges);
} else {
ingestMap(vector, ranges, executor);
}
}

FlatMapPassthroughValueFieldWriter& createPassthroughValueFieldWriter(
Expand Down
58 changes: 58 additions & 0 deletions dwio/nimble/velox/tests/VeloxWriterTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,64 @@ TEST_F(VeloxWriterTests, EncodingLayoutSchemaEvolutionExpandingRow) {
// that no captured encoding was used.
}

TEST_F(VeloxWriterTests, CombineMultipleLayersOfDictionaries) {
using namespace facebook::velox;
test::VectorMaker vectorMaker{leafPool_.get()};
auto wrapInDictionary = [&](const std::vector<vector_size_t>& indices,
const VectorPtr& values) {
auto buf =
AlignedBuffer::allocate<vector_size_t>(indices.size(), leafPool_.get());
memcpy(
buf->asMutable<vector_size_t>(),
indices.data(),
sizeof(vector_size_t) * indices.size());
return BaseVector::wrapInDictionary(nullptr, buf, indices.size(), values);
};
auto vector = vectorMaker.rowVector({
wrapInDictionary(
{0, 0, 1, 1},
vectorMaker.rowVector({
wrapInDictionary(
{0, 0}, vectorMaker.arrayVector<int64_t>({{1, 2, 3}})),
})),
});
nimble::VeloxWriterOptions options;
options.flatMapColumns = {"c0"};
options.dictionaryArrayColumns = {"c0"};
std::string file;
auto writeFile = std::make_unique<InMemoryWriteFile>(&file);
nimble::VeloxWriter writer(
*rootPool_,
ROW({"c0"}, {MAP(VARCHAR(), ARRAY(BIGINT()))}),
std::move(writeFile),
std::move(options));
writer.write(vector);
writer.close();
InMemoryReadFile readFile(file);
nimble::VeloxReadParams params;
params.readFlatMapFieldAsStruct = {"c0"};
params.flatMapFeatureSelector["c0"].features = {"c0"};
nimble::VeloxReader reader(*leafPool_, &readFile, nullptr, std::move(params));
VectorPtr result;
ASSERT_TRUE(reader.next(4, result));
ASSERT_EQ(result->size(), 4);
auto* c0 = result->asChecked<RowVector>()->childAt(0)->asChecked<RowVector>();
auto& dict = c0->childAt(0);
ASSERT_EQ(dict->encoding(), VectorEncoding::Simple::DICTIONARY);
ASSERT_EQ(dict->size(), 4);
auto* indices = dict->wrapInfo()->as<vector_size_t>();
for (int i = 0; i < 4; ++i) {
ASSERT_EQ(indices[i], 0);
}
auto* values = dict->valueVector()->asChecked<ArrayVector>();
ASSERT_EQ(values->size(), 1);
auto* elements = values->elements()->asChecked<SimpleVector<int64_t>>();
ASSERT_EQ(values->sizeAt(0), 3);
for (int i = 0; i < 3; ++i) {
ASSERT_EQ(elements->valueAt(i + values->offsetAt(0)), 1 + i);
}
}

#define ASSERT_CHUNK_COUNT(count, chunked) \
for (auto __i = 0; __i < count; ++__i) { \
ASSERT_TRUE(chunked.hasNext()); \
Expand Down
2 changes: 1 addition & 1 deletion velox
Submodule velox updated 1626 files

0 comments on commit e692c24

Please sign in to comment.