Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement lead and lag built-in window function #429

Merged
merged 3 commits into from
Jul 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ballista/rust/core/src/serde/scheduler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ pub mod to_proto;

/// Action that can be sent to an executor
#[derive(Debug, Clone)]

pub enum Action {
/// Execute a query and store the results in memory
ExecutePartition(ExecutePartition),
Expand Down
181 changes: 181 additions & 0 deletions datafusion/src/physical_plan/expressions/lead_lag.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines the physical expression for `lead` and `lag` that can be evaluated
//! at runtime during query execution

use crate::error::{DataFusionError, Result};
use crate::physical_plan::window_functions::PartitionEvaluator;
use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr};
use arrow::array::ArrayRef;
use arrow::compute::kernels::window::shift;
use arrow::datatypes::{DataType, Field};
use arrow::record_batch::RecordBatch;
use std::any::Any;
use std::ops::Range;
use std::sync::Arc;

/// Window shift expression: the shared physical expression behind the `lead`
/// and `lag` built-in window functions. The two differ only in the sign of
/// `shift_offset`.
#[derive(Debug)]
pub struct WindowShift {
// Name of the output column; returned by `name()` and used by `field()`.
name: String,
// Data type reported for the output field.
data_type: DataType,
// Offset handed to arrow's `shift` kernel: `lead` sets -1, `lag` sets 1.
shift_offset: i64,
// Input expression, evaluated against the batch to obtain the values to shift.
expr: Arc<dyn PhysicalExpr>,
}

/// Builds the `lead()` window function expression.
///
/// The result is a [`WindowShift`] with an offset of `-1`, so each row
/// receives the value of the row that follows it within its partition; the
/// last row of a partition has no successor and becomes null.
pub fn lead(
    name: String,
    data_type: DataType,
    expr: Arc<dyn PhysicalExpr>,
) -> WindowShift {
    // Negative offset: values move one position towards the partition start.
    let shift_offset: i64 = -1;
    WindowShift {
        name,
        data_type,
        shift_offset,
        expr,
    }
}

/// Builds the `lag()` window function expression.
///
/// The result is a [`WindowShift`] with an offset of `1`, so each row
/// receives the value of the row that precedes it within its partition; the
/// first row of a partition has no predecessor and becomes null.
pub fn lag(
    name: String,
    data_type: DataType,
    expr: Arc<dyn PhysicalExpr>,
) -> WindowShift {
    // Positive offset: values move one position towards the partition end.
    let shift_offset: i64 = 1;
    WindowShift {
        name,
        data_type,
        shift_offset,
        expr,
    }
}

impl BuiltInWindowFunctionExpr for WindowShift {
    /// Exposes `self` as `&dyn Any` so callers can downcast to `WindowShift`
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Describes the output column: it carries this expression's name and
    /// data type and is always declared nullable.
    fn field(&self) -> Result<Field> {
        Ok(Field::new(&self.name, self.data_type.clone(), true))
    }

    /// The single input expression whose values get shifted
    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
        vec![Arc::clone(&self.expr)]
    }

    /// Name of the output column
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Evaluates the input expression(s) over the whole `batch` and wraps the
    /// resulting arrays, together with the shift offset, in a
    /// `WindowShiftEvaluator`. The first evaluation error is returned as-is.
    fn create_evaluator(
        &self,
        batch: &RecordBatch,
    ) -> Result<Box<dyn PartitionEvaluator>> {
        let num_rows = batch.num_rows();
        let mut values = Vec::new();
        for input in self.expressions().iter() {
            // Scalars are expanded to full-length arrays of `num_rows` rows.
            let evaluated = input.evaluate(batch)?;
            values.push(evaluated.into_array(num_rows));
        }
        Ok(Box::new(WindowShiftEvaluator {
            shift_offset: self.shift_offset,
            values,
        }))
    }
}

/// Partition evaluator created by `WindowShift::create_evaluator`; it shifts
/// the evaluated input values within each partition range it is given.
pub(crate) struct WindowShiftEvaluator {
// Offset passed to arrow's `shift` kernel for every partition.
shift_offset: i64,
// Input expressions evaluated over the whole batch; `evaluate_partition`
// reads only the first array.
values: Vec<ArrayRef>,
}

impl PartitionEvaluator for WindowShiftEvaluator {
fn evaluate_partition(&self, partition: Range<usize>) -> Result<ArrayRef> {
let value = &self.values[0];
let value = value.slice(partition.start, partition.end - partition.start);
shift(value.as_ref(), self.shift_offset).map_err(DataFusionError::ArrowError)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you need to restrict the window to the partition bounds? If the input array had 10 rows in 2 partitions, wouldn't this code produce 2 output partitions of 10 rows each (rather than 2 output partitions of 5 rows each)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alamb good catch, this is fixed, and integration tests have been added.

}
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::Result;
    use crate::physical_plan::expressions::Column;
    use arrow::record_batch::RecordBatch;
    use arrow::{array::*, datatypes::*};

    /// Runs `expr` over a single-partition, eight-row Int32 batch whose only
    /// column is named "arr", and asserts that the one resulting array equals
    /// `expected`.
    fn test_i32_result(expr: WindowShift, expected: Int32Array) -> Result<()> {
        let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, -2, 3, -4, 5, -6, 7, 8]));
        let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]);
        let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?;
        // One partition covering all eight rows.
        let result = expr.create_evaluator(&batch)?.evaluate(vec![0..8])?;
        assert_eq!(1, result.len());
        let result = result[0].as_any().downcast_ref::<Int32Array>().unwrap();
        assert_eq!(expected, *result);
        Ok(())
    }

    #[test]
    fn lead_lag_window_shift() -> Result<()> {
        // lead: every row sees its successor; the last row has none.
        test_i32_result(
            lead(
                "lead".to_owned(),
                DataType::Int32,
                Arc::new(Column::new("arr", 0)),
            ),
            vec![
                Some(-2),
                Some(3),
                Some(-4),
                Some(5),
                Some(-6),
                Some(7),
                Some(8),
                None,
            ]
            .iter()
            .collect::<Int32Array>(),
        )?;

        // lag: every row sees its predecessor; the first row has none.
        test_i32_result(
            lag(
                "lag".to_owned(),
                DataType::Int32,
                Arc::new(Column::new("arr", 0)),
            ),
            vec![
                None,
                Some(1),
                Some(-2),
                Some(3),
                Some(-4),
                Some(5),
                Some(-6),
                Some(7),
            ]
            .iter()
            .collect::<Int32Array>(),
        )?;
        Ok(())
    }
}
2 changes: 2 additions & 0 deletions datafusion/src/physical_plan/expressions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ mod count;
mod in_list;
mod is_not_null;
mod is_null;
mod lead_lag;
mod literal;
mod min_max;
mod negative;
Expand All @@ -58,6 +59,7 @@ pub use count::Count;
pub use in_list::{in_list, InListExpr};
pub use is_not_null::{is_not_null, IsNotNullExpr};
pub use is_null::{is_null, IsNullExpr};
pub use lead_lag::{lag, lead};
pub use literal::{lit, Literal};
pub use min_max::{Max, Min};
pub use negative::{negative, NegativeExpr};
Expand Down
3 changes: 2 additions & 1 deletion datafusion/src/physical_plan/expressions/nth_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.

//! Defines physical expressions that can evaluated at runtime during query execution
//! Defines physical expressions for `first_value`, `last_value`, and `nth_value`
//! that can be evaluated at runtime during query execution

use crate::error::{DataFusionError, Result};
use crate::physical_plan::window_functions::PartitionEvaluator;
Expand Down
42 changes: 29 additions & 13 deletions datafusion/src/physical_plan/windows.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ use crate::error::{DataFusionError, Result};
use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits};
use crate::physical_plan::{
aggregates, common,
expressions::{dense_rank, rank, Literal, NthValue, PhysicalSortExpr, RowNumber},
expressions::{
dense_rank, lag, lead, rank, Literal, NthValue, PhysicalSortExpr, RowNumber,
},
type_coercion::coerce,
window_functions::{
signature_for_built_in, BuiltInWindowFunction, BuiltInWindowFunctionExpr,
Expand Down Expand Up @@ -100,10 +102,22 @@ fn create_built_in_window_expr(
input_schema: &Schema,
name: String,
) -> Result<Arc<dyn BuiltInWindowFunctionExpr>> {
match fun {
BuiltInWindowFunction::RowNumber => Ok(Arc::new(RowNumber::new(name))),
BuiltInWindowFunction::Rank => Ok(Arc::new(rank(name))),
BuiltInWindowFunction::DenseRank => Ok(Arc::new(dense_rank(name))),
Ok(match fun {
BuiltInWindowFunction::RowNumber => Arc::new(RowNumber::new(name)),
BuiltInWindowFunction::Rank => Arc::new(rank(name)),
BuiltInWindowFunction::DenseRank => Arc::new(dense_rank(name)),
BuiltInWindowFunction::Lag => {
let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?;
let arg = coerced_args[0].clone();
let data_type = args[0].data_type(input_schema)?;
Arc::new(lag(name, data_type, arg))
}
BuiltInWindowFunction::Lead => {
let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?;
let arg = coerced_args[0].clone();
let data_type = args[0].data_type(input_schema)?;
Arc::new(lead(name, data_type, arg))
}
BuiltInWindowFunction::NthValue => {
let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?;
let arg = coerced_args[0].clone();
Expand All @@ -118,25 +132,27 @@ fn create_built_in_window_expr(
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?;
let n: u32 = n as u32;
let data_type = args[0].data_type(input_schema)?;
Ok(Arc::new(NthValue::nth_value(name, arg, data_type, n)?))
Arc::new(NthValue::nth_value(name, arg, data_type, n)?)
}
BuiltInWindowFunction::FirstValue => {
let arg =
coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone();
let data_type = args[0].data_type(input_schema)?;
Ok(Arc::new(NthValue::first_value(name, arg, data_type)))
Arc::new(NthValue::first_value(name, arg, data_type))
}
BuiltInWindowFunction::LastValue => {
let arg =
coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone();
let data_type = args[0].data_type(input_schema)?;
Ok(Arc::new(NthValue::last_value(name, arg, data_type)))
Arc::new(NthValue::last_value(name, arg, data_type))
}
_ => Err(DataFusionError::NotImplemented(format!(
"Window function with {:?} not yet implemented",
fun
))),
}
_ => {
return Err(DataFusionError::NotImplemented(format!(
"Window function with {:?} not yet implemented",
fun
)))
}
})
}

/// A window expr that takes the form of a built in window function
Expand Down
29 changes: 29 additions & 0 deletions integration-tests/sqls/partitioned_window_built_in_functions.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at

-- http://www.apache.org/licenses/LICENSE-2.0

-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

-- Exercises the built-in window functions (row_number, lead, lag, first_value,
-- last_value, nth_value) over windows partitioned by c2 and ordered by c9,
-- ascending or descending as the column aliases indicate.
-- The outer ORDER BY c9 fixes the order of the result rows; presumably this
-- is so the output can be compared row-by-row against PostgreSQL (confirm
-- against the parity test harness).
SELECT
c9,
row_number() OVER (PARTITION BY c2 ORDER BY c9) row_num,
lead(c9) OVER (PARTITION BY c2 ORDER BY c9) lead_c9,
lag(c9) OVER (PARTITION BY c2 ORDER BY c9) lag_c9,
first_value(c9) OVER (PARTITION BY c2 ORDER BY c9) first_c9,
first_value(c9) OVER (PARTITION BY c2 ORDER BY c9 DESC) first_c9_desc,
last_value(c9) OVER (PARTITION BY c2 ORDER BY c9) last_c9,
last_value(c9) OVER (PARTITION BY c2 ORDER BY c9 DESC) last_c9_desc,
nth_value(c9, 2) OVER (PARTITION BY c2 ORDER BY c9) second_c9,
nth_value(c9, 2) OVER (PARTITION BY c2 ORDER BY c9 DESC) second_c9_desc
FROM test
ORDER BY c9;
2 changes: 2 additions & 0 deletions integration-tests/sqls/simple_window_built_in_functions.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
SELECT
c9,
row_number() OVER (ORDER BY c9) row_num,
lead(c9) OVER (ORDER BY c9) lead_c9,
lag(c9) OVER (ORDER BY c9) lag_c9,
first_value(c9) OVER (ORDER BY c9) first_c9,
first_value(c9) OVER (ORDER BY c9 DESC) first_c9_desc,
last_value(c9) OVER (ORDER BY c9) last_c9,
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/test_psql_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase):
def test_parity(self):
root = Path(os.path.dirname(__file__)) / "sqls"
files = set(root.glob("*.sql"))
self.assertEqual(len(files), 11, msg="tests are missed")
self.assertEqual(len(files), 12, msg="tests are missed")
for fname in files:
with self.subTest(fname=fname):
datafusion_output = pd.read_csv(
Expand Down