forked from Eventual-Inc/Daft
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Adds list constructor to Expression and SQL APIs (Eventual-Inc#…
…3737) This PR adds the ability to "merge" multiple series into a series of lists which enables construction of lists via _daft expressions_ rather than _python expressions_ via lit.
- Loading branch information
Showing
20 changed files
with
498 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from __future__ import annotations | ||
|
||
from .expressions import Expression, ExpressionsProjection, col, lit, interval, coalesce | ||
from .expressions import Expression, ExpressionsProjection, col, list_, lit, interval, coalesce | ||
|
||
__all__ = ["Expression", "ExpressionsProjection", "coalesce", "col", "interval", "lit"] | ||
__all__ = ["Expression", "ExpressionsProjection", "coalesce", "col", "interval", "list_", "lit"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ Constructors | |
|
||
col | ||
lit | ||
list_ | ||
|
||
Generic | ||
####### | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
use std::cmp::{max, min}; | ||
|
||
use arrow2::offset::Offsets; | ||
use common_error::{DaftError, DaftResult}; | ||
use daft_schema::{dtype::DataType, field::Field}; | ||
|
||
use crate::{ | ||
array::{growable::make_growable, ListArray}, | ||
series::{IntoSeries, Series}, | ||
}; | ||
|
||
impl Series { | ||
/// Zips series into a single series of lists. | ||
/// ex: | ||
/// ```text | ||
/// A: Series := ( a_0, a_1, .. , a_n ) | ||
/// B: Series := ( b_0, b_1, .. , b_n ) | ||
/// C: Series := Zip(A, B) <-> ( [a_0, b_0], [a_1, b_1], [a_2, b_2] ) | ||
/// ``` | ||
pub fn zip(field: Field, series: &[&Self]) -> DaftResult<Self> { | ||
// err if no series to zip | ||
if series.is_empty() { | ||
return Err(DaftError::ValueError( | ||
"Need at least 1 series to perform zip".to_string(), | ||
)); | ||
} | ||
|
||
// homogeneity checks naturally happen in make_growable's downcast. | ||
let dtype = match &field.dtype { | ||
DataType::List(dtype) => dtype.as_ref(), | ||
DataType::FixedSizeList(..) => { | ||
return Err(DaftError::ValueError( | ||
"Fixed size list constructor is currently not supported".to_string(), | ||
)); | ||
} | ||
_ => { | ||
return Err(DaftError::ValueError( | ||
"Cannot zip field with non-list type".to_string(), | ||
)); | ||
} | ||
}; | ||
|
||
// 0 -> index of child in 'arrays' vector | ||
// 1 -> last index of child | ||
type Child = (usize, usize); | ||
|
||
// build a null series mask so we can skip making full_nulls and avoid downcast "Null to T" errors. | ||
let mut mask: Vec<Option<Child>> = vec![]; | ||
let mut rows = 0; | ||
let mut capacity = 0; | ||
let mut arrays = vec![]; | ||
|
||
for s in series { | ||
let len = s.len(); | ||
if is_null(s) { | ||
mask.push(None); | ||
} else { | ||
mask.push(Some((arrays.len(), len - 1))); | ||
arrays.push(*s); | ||
} | ||
rows = max(rows, len); | ||
capacity += len; | ||
} | ||
|
||
// initialize a growable child | ||
let mut offsets = Offsets::<i64>::with_capacity(capacity); | ||
let mut child = make_growable("list", dtype, arrays, true, capacity); | ||
let sublist_len = series.len() as i64; | ||
|
||
// merge each series based upon the mask | ||
for row in 0..rows { | ||
for i in &mask { | ||
if let Some((i, end)) = *i { | ||
child.extend(i, min(row, end), 1); | ||
} else { | ||
child.extend_nulls(1); | ||
} | ||
} | ||
offsets.try_push(sublist_len)?; | ||
} | ||
|
||
// create the outer array with offsets | ||
Ok(ListArray::new(field, child.build()?, offsets.into(), None).into_series()) | ||
} | ||
} | ||
|
||
/// Same null check logic as in Series::concat, but may need an audit since there are other is_null impls. | ||
fn is_null(series: &&Series) -> bool { | ||
series.data_type() == &DataType::Null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
use std::fmt::Write; | ||
|
||
use itertools::Itertools; | ||
|
||
use super::{Expr, ExprRef, Operator}; | ||
|
||
/// Display for Expr::BinaryOp | ||
pub fn expr_binary_op_display_without_formatter( | ||
op: &Operator, | ||
left: &ExprRef, | ||
right: &ExprRef, | ||
) -> std::result::Result<String, std::fmt::Error> { | ||
let mut f = String::default(); | ||
let write_out_expr = |f: &mut String, input: &Expr| match input { | ||
Expr::Alias(e, _) => write!(f, "{e}"), | ||
Expr::BinaryOp { .. } => write!(f, "[{input}]"), | ||
_ => write!(f, "{input}"), | ||
}; | ||
write_out_expr(&mut f, left)?; | ||
write!(&mut f, " {op} ")?; | ||
write_out_expr(&mut f, right)?; | ||
Ok(f) | ||
} | ||
|
||
/// Display for Expr::IsIn | ||
pub fn expr_is_in_display_without_formatter( | ||
expr: &ExprRef, | ||
inputs: &[ExprRef], | ||
) -> std::result::Result<String, std::fmt::Error> { | ||
let mut f = String::default(); | ||
write!(&mut f, "{expr} IN (")?; | ||
for (i, input) in inputs.iter().enumerate() { | ||
if i != 0 { | ||
write!(&mut f, ", ")?; | ||
} | ||
write!(&mut f, "{input}")?; | ||
} | ||
write!(&mut f, ")")?; | ||
Ok(f) | ||
} | ||
|
||
/// Display for Expr::List | ||
pub fn expr_list_display_without_formatter( | ||
items: &[ExprRef], | ||
) -> std::result::Result<String, std::fmt::Error> { | ||
let mut f = String::default(); | ||
write!( | ||
&mut f, | ||
"list({})", | ||
items.iter().map(|x| x.to_string()).join(", ") | ||
)?; | ||
Ok(f) | ||
} |
Oops, something went wrong.