diff --git a/velox/exec/FilterProject.cpp b/velox/exec/FilterProject.cpp index 106a000f5ee4..b876a9638519 100644 --- a/velox/exec/FilterProject.cpp +++ b/velox/exec/FilterProject.cpp @@ -159,6 +159,19 @@ RowVectorPtr FilterProject::getOutput() { } void FilterProject::project(const SelectivityVector& rows, EvalCtx* evalCtx) { + // Make sure LazyVectors are loaded for all the "rows". + // + // Consider projection with 2 expressions: f(a) AND g(b), h(b) + // If b is a LazyVector and f(a) AND g(b) expression is evaluated first, it + // will load b only for rows where f(a) is true. However, h(b) projection + // needs all rows for "b". + // + // This works, but may load more rows than necessary. E.g. if we only have + // f(a) AND g(b) expression and b is not used anywhere else, it is sufficient + // to load b for a subset of rows where f(a) is true. + *evalCtx->mutableIsFinalSelection() = false; + *evalCtx->mutableFinalSelection() = &rows; + exprs_->eval( hasFilter_ ? 1 : 0, numExprs_, !hasFilter_, rows, evalCtx, &results_); } diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 8637249da053..2e340ce502ca 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -1830,28 +1830,6 @@ TEST_F(TableScanTest, structLazy) { assertQuery(op, {filePath}, "select c0 % 3 from tmp"); } -TEST_F(TableScanTest, lazyVectorAccessTwiceWithDifferentRows) { - auto data = makeRowVector({ - makeNullableFlatVector({1, 1, 1, std::nullopt}), - makeNullableFlatVector({0, 1, 2, 3}), - }); - - auto filePath = TempFilePath::create(); - writeToFile(filePath->path, {data}); - createDuckDbTable({data}); - - auto plan = - PlanBuilder() - .tableScan(asRowType(data->type())) - .filter( - "element_at(array_constructor(c0 + c1, if(c1 >= 0, c1, 0)), 1) > 0") - .planNode(); - assertQuery( - plan, - {filePath}, - "SELECT c0, c1 from tmp where ([c0 + c1, if(c1 >= 0, c1, 0)])[1] > 0"); -} - TEST_F(TableScanTest, structInArrayOrMap) { vector_size_t size = 1'000; diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index c316a5a78ad8..859fdf528b68 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -78,14 +78,11 @@ bool isMember( } void mergeFields( - std::vector& distinctFields, - std::unordered_set& multiplyReferencedFields_, + std::vector& fields, const std::vector& moreFields) { for (auto* newField : moreFields) { - if (isMember(distinctFields, *newField)) { - multiplyReferencedFields_.insert(newField); - } else { - distinctFields.emplace_back(newField); + if (!isMember(fields, *newField)) { + fields.emplace_back(newField); } } } @@ -187,13 +184,11 @@ void Expr::computeMetadata() { propagatesNulls_ = vectorFunction_->isDefaultNullBehavior(); deterministic_ = vectorFunction_->isDeterministic(); } - for (auto& input : inputs_) { input->computeMetadata(); deterministic_ &= input->deterministic_; propagatesNulls_ &= input->propagatesNulls_; - mergeFields( - distinctFields_, multiplyReferencedFields_, input->distinctFields_); + mergeFields(distinctFields_, input->distinctFields_); } if (isSpecialForm()) { propagatesNulls_ = propagatesNulls(); @@ -373,9 +368,6 @@ void Expr::eval( // all the time. Therefore, we should delay loading lazy vectors until we // know the minimum subset of rows needed to be loaded. // - // Load fields multiply referenced by inputs unconditionally. It's hard to - // know the superset of rows the multiple inputs need to load. - // // If there is only one field, load it unconditionally. The very first IF, // AND or OR will have to load it anyway. Pre-loading enables peeling of // encodings at a higher level in the expression tree and avoids repeated @@ -387,12 +379,6 @@ void Expr::eval( for (const auto& field : distinctFields_) { context.ensureFieldLoaded(field->index(context), rows); } - } else if (!propagatesNulls_) { - // Load multiply-referenced fields at common parent expr with "rows". - // Delay loading fields that are not in multiplyReferencedFields_. - for (const auto& field : multiplyReferencedFields_) { - context.ensureFieldLoaded(field->index(context), rows); - } } if (inputs_.empty()) { @@ -1298,11 +1284,6 @@ ExprSet::ExprSet( : execCtx_(execCtx) { exprs_ = compileExpressions( std::move(sources), execCtx, this, enableConstantFolding); - std::vector allDistinctFields; - for (auto& expr : exprs_) { - mergeFields( - allDistinctFields, multiplyReferencedFields_, expr->distinctFields()); - } } namespace { @@ -1377,18 +1358,6 @@ void ExprSet::eval( if (initialize) { clearSharedSubexprs(); } - - // Make sure LazyVectors, referenced by multiple expressions, are loaded - // for all the "rows". - // - // Consider projection with 2 expressions: f(a) AND g(b), h(b) - // If b is a LazyVector and f(a) AND g(b) expression is evaluated first, it - // will load b only for rows where f(a) is true. However, h(b) projection - // needs all rows for "b". - for (const auto& field : multiplyReferencedFields_) { - context->ensureFieldLoaded(field->index(*context), rows); - } - for (int32_t i = begin; i < end; ++i) { exprs_[i]->eval(rows, *context, (*result)[i]); } @@ -1405,7 +1374,6 @@ void ExprSet::clear() { for (auto* memo : memoizingExprs_) { memo->clearMemo(); } - multiplyReferencedFields_.clear(); } void ExprSetSimplified::eval( diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index 3560decaa506..b8dcad735a0a 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -335,10 +335,6 @@ class Expr { // parent Expr. std::vector distinctFields_; - // Fields referenced by multiple inputs, which is subset of distinctFields_. - // Used to determine pre-loading of lazy vectors at current expr. - std::unordered_set multiplyReferencedFields_; - // True if a null in any of 'distinctFields_' causes 'this' to be // null for the row. bool propagatesNulls_ = false; @@ -446,9 +442,6 @@ class ExprSet { std::vector> exprs_; - // Fields referenced by multiple expressions in ExprSet. - std::unordered_set multiplyReferencedFields_; - // Distinct Exprs reachable from 'exprs_' for which reset() needs to // be called at the start of eval(). std::vector> toReset_; diff --git a/velox/expression/tests/ExprTest.cpp b/velox/expression/tests/ExprTest.cpp index 1ba15d7b3828..5a04439be958 100644 --- a/velox/expression/tests/ExprTest.cpp +++ b/velox/expression/tests/ExprTest.cpp @@ -940,74 +940,6 @@ TEST_F(ExprTest, selectiveLazyLoadingOr) { assertEqualVectors(expected, result); } -TEST_F(ExprTest, lazyVectorAccessTwiceWithDifferentRows) { - const vector_size_t size = 4; - - auto c0 = makeNullableFlatVector({1, 1, 1, std::nullopt}); - // [0, 1, 2, 3] if fully loaded - std::vector loadedRows; - auto valueAt = [](auto row) { return row; }; - VectorPtr c1 = std::make_shared( - pool_.get(), - BIGINT(), - size, - std::make_unique([&](auto rows) { - for (auto row : rows) { - loadedRows.push_back(row); - } - return makeFlatVector(rows.back() + 1, valueAt); - })); - - auto result = evaluate( - "row_constructor(c0 + c1, if (c1 >= 0, c1, 0))", makeRowVector({c0, c1})); - - auto expected = makeRowVector( - {makeNullableFlatVector({1, 2, 3, std::nullopt}), - makeNullableFlatVector({0, 1, 2, 3})}); - - assertEqualVectors(expected, result); -} - -TEST_F(ExprTest, lazyVectorAccessTwiceInDifferentExpressions) { - const vector_size_t size = 1'000; - - // Fields referenced by multiple expressions will load lazy vector - // immediately in ExprSet::eval(). - auto isNullAtColA = [](auto row) { return row % 4 == 0; }; - auto isNullAtColC = [](auto row) { return row % 2 == 0; }; - - auto a = makeLazyFlatVector( - size, - [](auto row) { return row; }, - isNullAtColA, - size, - [](auto row) { return row; }); - auto b = makeLazyFlatVector( - size, - [](auto row) { return row * 2; }, - nullptr, - size, - [](auto row) { return row; }); - auto c = makeLazyFlatVector( - size, - [](auto row) { return row; }, - isNullAtColC, - size, - [](auto row) { return row; }); - - auto result = evaluateMultiple( - {"if(c0 is not null, c0, c1)", "if (c2 is not null, c2, c1)"}, - makeRowVector({a, b, c})); - - auto expected = makeFlatVector( - size, [](auto row) { return row % 4 == 0 ? row * 2 : row; }); - assertEqualVectors(expected, result[0]); - - expected = makeFlatVector( - size, [](auto row) { return row % 2 == 0 ? row * 2 : row; }); - assertEqualVectors(expected, result[1]); -} - TEST_F(ExprTest, selectiveLazyLoadingIf) { const vector_size_t size = 1'000;