Skip to content

Commit

Permalink
Back out "Pre-load lazy vectors that are referenced by multiple sub e…
Browse files Browse the repository at this point in the history
…xpressions" (#2295)

Summary:
Pull Request resolved: #2295

Original commit changeset: d88dc5ec5cb0

Original Phabricator Diff: D38583570 (0062f14)

Reviewed By: bikramSingh91

Differential Revision: D38710882

fbshipit-source-id: ef0df3aa6a421ec0b07c858198ca67cb0a7b66cb
  • Loading branch information
tanjialiang authored and facebook-github-bot committed Aug 15, 2022
1 parent ccbc865 commit 5e2c7e3
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 133 deletions.
13 changes: 13 additions & 0 deletions velox/exec/FilterProject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,19 @@ RowVectorPtr FilterProject::getOutput() {
}

void FilterProject::project(const SelectivityVector& rows, EvalCtx* evalCtx) {
// Make sure LazyVectors are loaded for all the "rows".
//
// Consider projection with 2 expressions: f(a) AND g(b), h(b)
// If b is a LazyVector and f(a) AND g(b) expression is evaluated first, it
// will load b only for rows where f(a) is true. However, h(b) projection
// needs all rows for "b".
//
// This works, but may load more rows than necessary. E.g. if we only have
// f(a) AND g(b) expression and b is not used anywhere else, it is sufficient
// to load b for a subset of rows where f(a) is true.
*evalCtx->mutableIsFinalSelection() = false;
*evalCtx->mutableFinalSelection() = &rows;

exprs_->eval(
hasFilter_ ? 1 : 0, numExprs_, !hasFilter_, rows, evalCtx, &results_);
}
Expand Down
22 changes: 0 additions & 22 deletions velox/exec/tests/TableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1830,28 +1830,6 @@ TEST_F(TableScanTest, structLazy) {
assertQuery(op, {filePath}, "select c0 % 3 from tmp");
}

TEST_F(TableScanTest, lazyVectorAccessTwiceWithDifferentRows) {
auto data = makeRowVector({
makeNullableFlatVector<int64_t>({1, 1, 1, std::nullopt}),
makeNullableFlatVector<int64_t>({0, 1, 2, 3}),
});

auto filePath = TempFilePath::create();
writeToFile(filePath->path, {data});
createDuckDbTable({data});

auto plan =
PlanBuilder()
.tableScan(asRowType(data->type()))
.filter(
"element_at(array_constructor(c0 + c1, if(c1 >= 0, c1, 0)), 1) > 0")
.planNode();
assertQuery(
plan,
{filePath},
"SELECT c0, c1 from tmp where ([c0 + c1, if(c1 >= 0, c1, 0)])[1] > 0");
}

TEST_F(TableScanTest, structInArrayOrMap) {
vector_size_t size = 1'000;

Expand Down
40 changes: 4 additions & 36 deletions velox/expression/Expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,11 @@ bool isMember(
}

void mergeFields(
std::vector<FieldReference*>& distinctFields,
std::unordered_set<FieldReference*>& multiplyReferencedFields_,
std::vector<FieldReference*>& fields,
const std::vector<FieldReference*>& moreFields) {
for (auto* newField : moreFields) {
if (isMember(distinctFields, *newField)) {
multiplyReferencedFields_.insert(newField);
} else {
distinctFields.emplace_back(newField);
if (!isMember(fields, *newField)) {
fields.emplace_back(newField);
}
}
}
Expand Down Expand Up @@ -187,13 +184,11 @@ void Expr::computeMetadata() {
propagatesNulls_ = vectorFunction_->isDefaultNullBehavior();
deterministic_ = vectorFunction_->isDeterministic();
}

for (auto& input : inputs_) {
input->computeMetadata();
deterministic_ &= input->deterministic_;
propagatesNulls_ &= input->propagatesNulls_;
mergeFields(
distinctFields_, multiplyReferencedFields_, input->distinctFields_);
mergeFields(distinctFields_, input->distinctFields_);
}
if (isSpecialForm()) {
propagatesNulls_ = propagatesNulls();
Expand Down Expand Up @@ -373,9 +368,6 @@ void Expr::eval(
// all the time. Therefore, we should delay loading lazy vectors until we
// know the minimum subset of rows needed to be loaded.
//
// Load fields multiply referenced by inputs unconditionally. It's hard to
// know the superset of rows the multiple inputs need to load.
//
// If there is only one field, load it unconditionally. The very first IF,
// AND or OR will have to load it anyway. Pre-loading enables peeling of
// encodings at a higher level in the expression tree and avoids repeated
Expand All @@ -387,12 +379,6 @@ void Expr::eval(
for (const auto& field : distinctFields_) {
context.ensureFieldLoaded(field->index(context), rows);
}
} else if (!propagatesNulls_) {
// Load multiply-referenced fields at common parent expr with "rows".
// Delay loading fields that are not in multiplyReferencedFields_.
for (const auto& field : multiplyReferencedFields_) {
context.ensureFieldLoaded(field->index(context), rows);
}
}

if (inputs_.empty()) {
Expand Down Expand Up @@ -1298,11 +1284,6 @@ ExprSet::ExprSet(
: execCtx_(execCtx) {
exprs_ = compileExpressions(
std::move(sources), execCtx, this, enableConstantFolding);
std::vector<FieldReference*> allDistinctFields;
for (auto& expr : exprs_) {
mergeFields(
allDistinctFields, multiplyReferencedFields_, expr->distinctFields());
}
}

namespace {
Expand Down Expand Up @@ -1377,18 +1358,6 @@ void ExprSet::eval(
if (initialize) {
clearSharedSubexprs();
}

// Make sure LazyVectors, referenced by multiple expressions, are loaded
// for all the "rows".
//
// Consider projection with 2 expressions: f(a) AND g(b), h(b)
// If b is a LazyVector and f(a) AND g(b) expression is evaluated first, it
// will load b only for rows where f(a) is true. However, h(b) projection
// needs all rows for "b".
for (const auto& field : multiplyReferencedFields_) {
context->ensureFieldLoaded(field->index(*context), rows);
}

for (int32_t i = begin; i < end; ++i) {
exprs_[i]->eval(rows, *context, (*result)[i]);
}
Expand All @@ -1405,7 +1374,6 @@ void ExprSet::clear() {
for (auto* memo : memoizingExprs_) {
memo->clearMemo();
}
multiplyReferencedFields_.clear();
}

void ExprSetSimplified::eval(
Expand Down
7 changes: 0 additions & 7 deletions velox/expression/Expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,6 @@ class Expr {
// parent Expr.
std::vector<FieldReference * FOLLY_NONNULL> distinctFields_;

// Fields referenced by multiple inputs, which is subset of distinctFields_.
// Used to determine pre-loading of lazy vectors at current expr.
std::unordered_set<FieldReference * FOLLY_NONNULL> multiplyReferencedFields_;

// True if a null in any of 'distinctFields_' causes 'this' to be
// null for the row.
bool propagatesNulls_ = false;
Expand Down Expand Up @@ -446,9 +442,6 @@ class ExprSet {

std::vector<std::shared_ptr<Expr>> exprs_;

// Fields referenced by multiple expressions in ExprSet.
std::unordered_set<FieldReference * FOLLY_NONNULL> multiplyReferencedFields_;

// Distinct Exprs reachable from 'exprs_' for which reset() needs to
// be called at the start of eval().
std::vector<std::shared_ptr<Expr>> toReset_;
Expand Down
68 changes: 0 additions & 68 deletions velox/expression/tests/ExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -940,74 +940,6 @@ TEST_F(ExprTest, selectiveLazyLoadingOr) {
assertEqualVectors(expected, result);
}

TEST_F(ExprTest, lazyVectorAccessTwiceWithDifferentRows) {
const vector_size_t size = 4;

auto c0 = makeNullableFlatVector<int64_t>({1, 1, 1, std::nullopt});
// [0, 1, 2, 3] if fully loaded
std::vector<vector_size_t> loadedRows;
auto valueAt = [](auto row) { return row; };
VectorPtr c1 = std::make_shared<LazyVector>(
pool_.get(),
BIGINT(),
size,
std::make_unique<test::SimpleVectorLoader>([&](auto rows) {
for (auto row : rows) {
loadedRows.push_back(row);
}
return makeFlatVector<int64_t>(rows.back() + 1, valueAt);
}));

auto result = evaluate(
"row_constructor(c0 + c1, if (c1 >= 0, c1, 0))", makeRowVector({c0, c1}));

auto expected = makeRowVector(
{makeNullableFlatVector<int64_t>({1, 2, 3, std::nullopt}),
makeNullableFlatVector<int64_t>({0, 1, 2, 3})});

assertEqualVectors(expected, result);
}

TEST_F(ExprTest, lazyVectorAccessTwiceInDifferentExpressions) {
const vector_size_t size = 1'000;

// Fields referenced by multiple expressions will load lazy vector
// immediately in ExprSet::eval().
auto isNullAtColA = [](auto row) { return row % 4 == 0; };
auto isNullAtColC = [](auto row) { return row % 2 == 0; };

auto a = makeLazyFlatVector<int64_t>(
size,
[](auto row) { return row; },
isNullAtColA,
size,
[](auto row) { return row; });
auto b = makeLazyFlatVector<int64_t>(
size,
[](auto row) { return row * 2; },
nullptr,
size,
[](auto row) { return row; });
auto c = makeLazyFlatVector<int64_t>(
size,
[](auto row) { return row; },
isNullAtColC,
size,
[](auto row) { return row; });

auto result = evaluateMultiple(
{"if(c0 is not null, c0, c1)", "if (c2 is not null, c2, c1)"},
makeRowVector({a, b, c}));

auto expected = makeFlatVector<int64_t>(
size, [](auto row) { return row % 4 == 0 ? row * 2 : row; });
assertEqualVectors(expected, result[0]);

expected = makeFlatVector<int64_t>(
size, [](auto row) { return row % 2 == 0 ? row * 2 : row; });
assertEqualVectors(expected, result[1]);
}

TEST_F(ExprTest, selectiveLazyLoadingIf) {
const vector_size_t size = 1'000;

Expand Down

0 comments on commit 5e2c7e3

Please sign in to comment.