Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Polars to v0.43.1 #985

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 119 additions & 107 deletions native/explorer/Cargo.lock

Large diffs are not rendered by default.

10 changes: 2 additions & 8 deletions native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ object_store = { version = "0.10", default-features = false, optional = true }
mimalloc = { version = "*", default-features = false }

[dependencies.polars]
version = "0.42"
version = "0.43"
default-features = false
features = [
"abs",
Expand Down Expand Up @@ -81,15 +81,9 @@ features = [
]

[dependencies.polars-ops]
version = "0.42"
version = "0.43"
features = ["abs", "ewma", "cum_agg", "cov"]

# This dep is only needed to activate "timezones" feature
# for the polars-json crate. We should remove when Polars fixes it.
[dependencies.polars-json]
version = "*"
features = ["timezones", "chrono-tz"]

[features]
default = ["ndjson", "cloud", "nif_version_2_15"]

Expand Down
20 changes: 12 additions & 8 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ pub fn df_transpose(

#[rustler::nif]
pub fn df_names(df: ExDataFrame) -> Result<Vec<String>, ExplorerError> {
let names = to_string_names(df.get_column_names());
let names = df
.get_column_names()
.iter()
.map(|name| name.to_string())
.collect();
Ok(names)
}

Expand Down Expand Up @@ -73,10 +77,10 @@ pub fn df_concat_columns(dfs: Vec<ExDataFrame>) -> Result<ExDataFrame, ExplorerE
.iter()
.map(|col| {
let name = col.name();
if previous_names.contains(name) {
if previous_names.contains(&name.clone().to_string()) {
let new_name = format!("{name}_{idx}");
previous_names.insert(new_name.clone());
col.clone().rename(&new_name).to_owned()
col.clone().rename(new_name.into()).to_owned()
} else {
previous_names.insert(name.to_string());
col.clone().to_owned()
Expand Down Expand Up @@ -125,7 +129,7 @@ pub fn df_slice_by_indices(
indices: Vec<u32>,
groups: Vec<&str>,
) -> Result<ExDataFrame, ExplorerError> {
let idx = UInt32Chunked::from_vec("idx", indices);
let idx = UInt32Chunked::from_vec("idx".into(), indices);
let new_df = if groups.is_empty() {
df.take(&idx)?
} else {
Expand Down Expand Up @@ -167,7 +171,7 @@ pub fn df_sample_n(
seed: Option<u64>,
groups: Vec<String>,
) -> Result<ExDataFrame, ExplorerError> {
let n_s = Series::new("n", &[n]);
let n_s = Series::new("n".into(), &[n]);
let new_df = if groups.is_empty() {
df.sample_n(&n_s, replace, shuffle, seed)?
} else {
Expand All @@ -187,7 +191,7 @@ pub fn df_sample_frac(
seed: Option<u64>,
groups: Vec<String>,
) -> Result<ExDataFrame, ExplorerError> {
let frac_s = Series::new("frac", &[frac]);
let frac_s = Series::new("frac".into(), &[frac]);
let new_df = if groups.is_empty() {
df.sample_frac(&frac_s, replace, shuffle, seed)?
} else {
Expand Down Expand Up @@ -396,7 +400,7 @@ pub fn df_pivot_wider(
.collect();

for (id_name, new_name) in id_columns.iter().zip(&temp_id_names) {
df.rename(id_name, new_name)?;
df.rename(id_name, new_name.into())?;
}

let mut new_df = pivot_stable(
Expand Down Expand Up @@ -465,7 +469,7 @@ pub fn df_lazy(df: ExDataFrame) -> Result<ExLazyFrame, ExplorerError> {

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_re_dtype(pattern: &str) -> Result<ExSeriesDtype, ExplorerError> {
let s = Series::new("dummy", [""])
let s = Series::new("dummy".into(), [""])
.into_frame()
.lazy()
.with_column(col("dummy").str().extract_groups(pattern)?.alias("dummy"))
Expand Down
24 changes: 19 additions & 5 deletions native/explorer/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,22 @@ pub fn df_from_csv(
.with_skip_rows_after_header(skip_rows_after_header)
.with_projection(projection.map(Arc::new))
.with_rechunk(do_rechunk)
.with_columns(column_names.map(Arc::from))
.with_columns(column_names.map(|names| {
names
.iter()
.map(|name| PlSmallStr::from_string(name.clone()))
.collect()
}))
.with_parse_options(
CsvParseOptions::default()
.with_encoding(encoding)
.with_truncate_ragged_lines(true)
.with_try_parse_dates(parse_dates)
.with_separator(delimiter_as_byte)
.with_eol_char(eol_delimiter.unwrap_or(b'\n'))
.with_null_values(Some(NullValues::AllColumns(null_vals))),
.with_null_values(Some(NullValues::AllColumns(
null_vals.iter().map(|val| val.into()).collect(),
))),
)
.try_into_reader_with_file_path(Some(filename.into()))?
.finish();
Expand All @@ -79,7 +86,7 @@ pub fn schema_from_dtypes_pairs(
return Ok(None);
}

let mut schema = Schema::new();
let mut schema = Schema::with_capacity(dtypes.len());
for (name, ex_dtype) in dtypes {
let dtype = DataType::try_from(&ex_dtype)?;
schema.with_column(name.into(), dtype);
Expand Down Expand Up @@ -174,7 +181,12 @@ pub fn df_load_csv(
.with_has_header(has_header)
.with_infer_schema_length(infer_schema_length)
.with_n_rows(stop_after_n_rows)
.with_columns(column_names.map(Arc::from))
.with_columns(column_names.map(|names| {
names
.iter()
.map(|name| PlSmallStr::from_string(name.clone()))
.collect()
}))
.with_skip_rows(skip_rows)
.with_skip_rows_after_header(skip_rows_after_header)
.with_projection(projection.map(Arc::new))
Expand All @@ -183,7 +195,9 @@ pub fn df_load_csv(
CsvParseOptions::default()
.with_separator(delimiter_as_byte)
.with_encoding(encoding)
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_null_values(Some(NullValues::AllColumns(
null_vals.iter().map(|x| x.into()).collect(),
)))
.with_try_parse_dates(parse_dates)
.with_eol_char(eol_delimiter.unwrap_or(b'\n')),
)
Expand Down
2 changes: 1 addition & 1 deletion native/explorer/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ impl<'tz> Literal for ExDateTime<'tz> {
Expr::Literal(LiteralValue::DateTime(
ndt.and_utc().timestamp_micros(),
TimeUnit::Microseconds,
Some(time_zone),
Some(time_zone.into()),
))
}
}
Expand Down
7 changes: 3 additions & 4 deletions native/explorer/src/datatypes/ex_dtypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,7 @@ impl TryFrom<&DataType> for ExSeriesDtype {
let mut struct_fields = Vec::new();

for field in fields {
struct_fields
.push((field.name().to_string(), Self::try_from(field.data_type())?));
struct_fields.push((field.name().to_string(), Self::try_from(field.dtype())?));
}

Ok(ExSeriesDtype::Struct(struct_fields))
Expand Down Expand Up @@ -160,7 +159,7 @@ impl TryFrom<&ExSeriesDtype> for DataType {
}
ExSeriesDtype::Datetime(ex_timeunit, tz_option) => Ok(DataType::Datetime(
ex_timeunit.try_into()?,
Some(tz_option.clone()),
Some(tz_option.into()),
)),
ExSeriesDtype::Duration(ex_timeunit) => Ok(DataType::Duration(ex_timeunit.try_into()?)),
ExSeriesDtype::List(inner) => {
Expand All @@ -169,7 +168,7 @@ impl TryFrom<&ExSeriesDtype> for DataType {
ExSeriesDtype::Struct(fields) => Ok(DataType::Struct(
fields
.iter()
.map(|(k, v)| Ok(Field::new(k.as_str(), v.try_into()?)))
.map(|(k, v)| Ok(Field::new(k.into(), v.try_into()?)))
.collect::<Result<Vec<Field>, Self::Error>>()?,
)),
}
Expand Down
2 changes: 1 addition & 1 deletion native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,7 @@ pub fn expr_atan(expr: ExExpr) -> ExExpr {
#[rustler::nif]
pub fn expr_strptime(expr: ExExpr, format_string: &str) -> ExExpr {
let options = StrptimeOptions {
format: Some(format_string.to_string()),
format: Some(format_string.into()),
strict: false,
exact: true,
cache: true,
Expand Down
17 changes: 8 additions & 9 deletions native/explorer/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ pub fn lf_tail(
pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
let mut lf = data.clone_inner();
let names = lf
.schema()?
.collect_schema()?
.iter_names()
.map(|smart_string| smart_string.to_string())
.collect();
Expand All @@ -76,9 +76,9 @@ pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
#[rustler::nif]
pub fn lf_dtypes(data: ExLazyFrame) -> Result<Vec<ExSeriesDtype>, ExplorerError> {
let mut dtypes: Vec<ExSeriesDtype> = vec![];
let schema = data.clone_inner().schema()?;
let schema = data.clone_inner().collect_schema()?;

for dtype in schema.iter_dtypes() {
for (_name, dtype) in schema.iter_names_and_dtypes() {
dtypes.push(ExSeriesDtype::try_from(dtype)?)
}

Expand Down Expand Up @@ -108,7 +108,7 @@ pub fn lf_slice(
let result_lf = if groups.is_empty() {
lf.slice(offset, length)
} else {
let groups_exprs: Vec<Expr> = groups.iter().map(|group| col(group)).collect();
let groups_exprs: Vec<Expr> = groups.iter().map(col).collect();
lf.group_by_stable(groups_exprs)
.agg([col("*").slice(offset, length)])
.explode([col("*").exclude(groups)])
Expand Down Expand Up @@ -181,6 +181,7 @@ pub fn lf_distinct(
columns_to_keep: Option<Vec<ExExpr>>,
) -> Result<ExLazyFrame, ExplorerError> {
let df = data.clone_inner();
let subset = subset.iter().map(|x| x.into()).collect::<Vec<PlSmallStr>>();
let new_df = df.unique_stable(Some(subset), UniqueKeepStrategy::First);

match columns_to_keep {
Expand Down Expand Up @@ -219,11 +220,9 @@ pub fn lf_summarise_with(
// We do add a "shadow" column to be able to group by it.
// This is going to force some aggregations like "mode" to be always inside
// a "list".
let s = Series::new_null("__explorer_null_for_group__", 1);
ldf.with_column(s.lit())
.group_by_stable(["__explorer_null_for_group__"])
ldf.group_by_stable([1.lit().alias("__explorer_literal_for_group__")])
.agg(aggs)
.select(&[col("*").exclude(["__explorer_null_for_group__"])])
.select(&[col("*").exclude(["__explorer_literal_for_group__"])])
} else {
ldf.group_by_stable(groups).agg(aggs)
};
Expand Down Expand Up @@ -344,7 +343,7 @@ pub fn lf_concat_columns(ldfs: Vec<ExLazyFrame>) -> Result<ExLazyFrame, Explorer
.map(|(idx, ex_ldf)| {
let mut ldf = ex_ldf.clone_inner();
let names: Vec<String> = ldf
.schema()
.collect_schema()
.expect("should be able to get schema")
.iter_names()
.map(|smart_string| smart_string.to_string())
Expand Down
8 changes: 5 additions & 3 deletions native/explorer/src/lazyframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub fn lf_from_parquet(
};

let cols: Vec<Expr> = if let Some(cols) = columns {
cols.iter().map(|column| col(column)).collect()
cols.iter().map(col).collect()
} else {
vec![all()]
};
Expand All @@ -43,7 +43,7 @@ pub fn lf_from_parquet_cloud(
..Default::default()
};
let cols: Vec<Expr> = if let Some(cols) = columns {
cols.iter().map(|column| col(column)).collect()
cols.iter().map(col).collect()
} else {
vec![all()]
};
Expand Down Expand Up @@ -247,7 +247,9 @@ pub fn lf_from_csv(
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(schema_from_dtypes_pairs(dtypes)?)
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_null_values(Some(NullValues::AllColumns(
null_vals.iter().map(|x| x.into()).collect(),
)))
.with_eol_char(eol_delimiter.unwrap_or(b'\n'))
.finish()?;

Expand Down
Loading
Loading