Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Back out "Pre-load lazy vectors that are referenced by multiple sub expressions" #2295

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions velox/exec/FilterProject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,19 @@ RowVectorPtr FilterProject::getOutput() {
}

void FilterProject::project(const SelectivityVector& rows, EvalCtx* evalCtx) {
// Make sure LazyVectors are loaded for all the "rows".
//
// Consider projection with 2 expressions: f(a) AND g(b), h(b)
// If b is a LazyVector and f(a) AND g(b) expression is evaluated first, it
// will load b only for rows where f(a) is true. However, h(b) projection
// needs all rows for "b".
//
// This works, but may load more rows than necessary. E.g. if we only have
// f(a) AND g(b) expression and b is not used anywhere else, it is sufficient
// to load b for a subset of rows where f(a) is true.
*evalCtx->mutableIsFinalSelection() = false;
*evalCtx->mutableFinalSelection() = &rows;

exprs_->eval(
hasFilter_ ? 1 : 0, numExprs_, !hasFilter_, rows, evalCtx, &results_);
}
Expand Down
22 changes: 0 additions & 22 deletions velox/exec/tests/TableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1830,28 +1830,6 @@ TEST_F(TableScanTest, structLazy) {
assertQuery(op, {filePath}, "select c0 % 3 from tmp");
}

TEST_F(TableScanTest, lazyVectorAccessTwiceWithDifferentRows) {
auto data = makeRowVector({
makeNullableFlatVector<int64_t>({1, 1, 1, std::nullopt}),
makeNullableFlatVector<int64_t>({0, 1, 2, 3}),
});

auto filePath = TempFilePath::create();
writeToFile(filePath->path, {data});
createDuckDbTable({data});

auto plan =
PlanBuilder()
.tableScan(asRowType(data->type()))
.filter(
"element_at(array_constructor(c0 + c1, if(c1 >= 0, c1, 0)), 1) > 0")
.planNode();
assertQuery(
plan,
{filePath},
"SELECT c0, c1 from tmp where ([c0 + c1, if(c1 >= 0, c1, 0)])[1] > 0");
}

TEST_F(TableScanTest, structInArrayOrMap) {
vector_size_t size = 1'000;

Expand Down
40 changes: 4 additions & 36 deletions velox/expression/Expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,11 @@ bool isMember(
}

void mergeFields(
std::vector<FieldReference*>& distinctFields,
std::unordered_set<FieldReference*>& multiplyReferencedFields_,
std::vector<FieldReference*>& fields,
const std::vector<FieldReference*>& moreFields) {
for (auto* newField : moreFields) {
if (isMember(distinctFields, *newField)) {
multiplyReferencedFields_.insert(newField);
} else {
distinctFields.emplace_back(newField);
if (!isMember(fields, *newField)) {
fields.emplace_back(newField);
}
}
}
Expand Down Expand Up @@ -187,13 +184,11 @@ void Expr::computeMetadata() {
propagatesNulls_ = vectorFunction_->isDefaultNullBehavior();
deterministic_ = vectorFunction_->isDeterministic();
}

for (auto& input : inputs_) {
input->computeMetadata();
deterministic_ &= input->deterministic_;
propagatesNulls_ &= input->propagatesNulls_;
mergeFields(
distinctFields_, multiplyReferencedFields_, input->distinctFields_);
mergeFields(distinctFields_, input->distinctFields_);
}
if (isSpecialForm()) {
propagatesNulls_ = propagatesNulls();
Expand Down Expand Up @@ -373,9 +368,6 @@ void Expr::eval(
// all the time. Therefore, we should delay loading lazy vectors until we
// know the minimum subset of rows needed to be loaded.
//
// Load fields multiply referenced by inputs unconditionally. It's hard to
// know the superset of rows the multiple inputs need to load.
//
// If there is only one field, load it unconditionally. The very first IF,
// AND or OR will have to load it anyway. Pre-loading enables peeling of
// encodings at a higher level in the expression tree and avoids repeated
Expand All @@ -387,12 +379,6 @@ void Expr::eval(
for (const auto& field : distinctFields_) {
context.ensureFieldLoaded(field->index(context), rows);
}
} else if (!propagatesNulls_) {
// Load multiply-referenced fields at common parent expr with "rows".
// Delay loading fields that are not in multiplyReferencedFields_.
for (const auto& field : multiplyReferencedFields_) {
context.ensureFieldLoaded(field->index(context), rows);
}
}

if (inputs_.empty()) {
Expand Down Expand Up @@ -1298,11 +1284,6 @@ ExprSet::ExprSet(
: execCtx_(execCtx) {
exprs_ = compileExpressions(
std::move(sources), execCtx, this, enableConstantFolding);
std::vector<FieldReference*> allDistinctFields;
for (auto& expr : exprs_) {
mergeFields(
allDistinctFields, multiplyReferencedFields_, expr->distinctFields());
}
}

namespace {
Expand Down Expand Up @@ -1377,18 +1358,6 @@ void ExprSet::eval(
if (initialize) {
clearSharedSubexprs();
}

// Make sure LazyVectors, referenced by multiple expressions, are loaded
// for all the "rows".
//
// Consider projection with 2 expressions: f(a) AND g(b), h(b)
// If b is a LazyVector and f(a) AND g(b) expression is evaluated first, it
// will load b only for rows where f(a) is true. However, h(b) projection
// needs all rows for "b".
for (const auto& field : multiplyReferencedFields_) {
context->ensureFieldLoaded(field->index(*context), rows);
}

for (int32_t i = begin; i < end; ++i) {
exprs_[i]->eval(rows, *context, (*result)[i]);
}
Expand All @@ -1405,7 +1374,6 @@ void ExprSet::clear() {
for (auto* memo : memoizingExprs_) {
memo->clearMemo();
}
multiplyReferencedFields_.clear();
}

void ExprSetSimplified::eval(
Expand Down
7 changes: 0 additions & 7 deletions velox/expression/Expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,10 +335,6 @@ class Expr {
// parent Expr.
std::vector<FieldReference * FOLLY_NONNULL> distinctFields_;

// Fields referenced by multiple inputs, which is subset of distinctFields_.
// Used to determine pre-loading of lazy vectors at current expr.
std::unordered_set<FieldReference * FOLLY_NONNULL> multiplyReferencedFields_;

// True if a null in any of 'distinctFields_' causes 'this' to be
// null for the row.
bool propagatesNulls_ = false;
Expand Down Expand Up @@ -446,9 +442,6 @@ class ExprSet {

std::vector<std::shared_ptr<Expr>> exprs_;

// Fields referenced by multiple expressions in ExprSet.
std::unordered_set<FieldReference * FOLLY_NONNULL> multiplyReferencedFields_;

// Distinct Exprs reachable from 'exprs_' for which reset() needs to
// be called at the start of eval().
std::vector<std::shared_ptr<Expr>> toReset_;
Expand Down
68 changes: 0 additions & 68 deletions velox/expression/tests/ExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -940,74 +940,6 @@ TEST_F(ExprTest, selectiveLazyLoadingOr) {
assertEqualVectors(expected, result);
}

TEST_F(ExprTest, lazyVectorAccessTwiceWithDifferentRows) {
const vector_size_t size = 4;

auto c0 = makeNullableFlatVector<int64_t>({1, 1, 1, std::nullopt});
// [0, 1, 2, 3] if fully loaded
std::vector<vector_size_t> loadedRows;
auto valueAt = [](auto row) { return row; };
VectorPtr c1 = std::make_shared<LazyVector>(
pool_.get(),
BIGINT(),
size,
std::make_unique<test::SimpleVectorLoader>([&](auto rows) {
for (auto row : rows) {
loadedRows.push_back(row);
}
return makeFlatVector<int64_t>(rows.back() + 1, valueAt);
}));

auto result = evaluate(
"row_constructor(c0 + c1, if (c1 >= 0, c1, 0))", makeRowVector({c0, c1}));

auto expected = makeRowVector(
{makeNullableFlatVector<int64_t>({1, 2, 3, std::nullopt}),
makeNullableFlatVector<int64_t>({0, 1, 2, 3})});

assertEqualVectors(expected, result);
}

TEST_F(ExprTest, lazyVectorAccessTwiceInDifferentExpressions) {
const vector_size_t size = 1'000;

// Fields referenced by multiple expressions will load lazy vector
// immediately in ExprSet::eval().
auto isNullAtColA = [](auto row) { return row % 4 == 0; };
auto isNullAtColC = [](auto row) { return row % 2 == 0; };

auto a = makeLazyFlatVector<int64_t>(
size,
[](auto row) { return row; },
isNullAtColA,
size,
[](auto row) { return row; });
auto b = makeLazyFlatVector<int64_t>(
size,
[](auto row) { return row * 2; },
nullptr,
size,
[](auto row) { return row; });
auto c = makeLazyFlatVector<int64_t>(
size,
[](auto row) { return row; },
isNullAtColC,
size,
[](auto row) { return row; });

auto result = evaluateMultiple(
{"if(c0 is not null, c0, c1)", "if (c2 is not null, c2, c1)"},
makeRowVector({a, b, c}));

auto expected = makeFlatVector<int64_t>(
size, [](auto row) { return row % 4 == 0 ? row * 2 : row; });
assertEqualVectors(expected, result[0]);

expected = makeFlatVector<int64_t>(
size, [](auto row) { return row % 2 == 0 ? row * 2 : row; });
assertEqualVectors(expected, result[1]);
}

TEST_F(ExprTest, selectiveLazyLoadingIf) {
const vector_size_t size = 1'000;

Expand Down