Skip to content

Commit

Permalink
DataFrame hash_rows function with deterministic seed
Browse files Browse the repository at this point in the history
* python polars 0.8.12

* DataFrame hash_rows function with deterministic seed

* Fix docstring

* Switch to using black to fix formatting

Co-authored-by: Ritchie Vink <ritchie46@gmail.com>
Co-authored-by: thanhtm1 <thanhtm1@msb.com.vn>
  • Loading branch information
3 people committed Jul 19, 2021
1 parent 091e8b4 commit 173e11f
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 17 deletions.
4 changes: 2 additions & 2 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1644,9 +1644,9 @@ impl DataFrame {

/// Hash and combine the row values
#[cfg(feature = "row_hash")]
pub fn hash_rows(&self) -> Result<UInt64Chunked> {
pub fn hash_rows(&self, hasher_builder: Option<RandomState>) -> Result<UInt64Chunked> {
let dfs = split_df(self, POOL.current_num_threads())?;
let (cas, _) = df_rows_to_hashes_threaded(&dfs, None);
let (cas, _) = df_rows_to_hashes_threaded(&dfs, hasher_builder);

let mut iter = cas.into_iter();
let mut acc_ca = iter.next().unwrap();
Expand Down
9 changes: 1 addition & 8 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion py-polars/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-polars"
version = "0.8.11-beta.1"
version = "0.8.12"
authors = ["ritchie46 <ritchie46@gmail.com>"]
edition = "2018"
readme = "README.md"
Expand Down
19 changes: 16 additions & 3 deletions py-polars/polars/eager/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,13 +2035,26 @@ def shrink_to_fit(self, in_place: bool = False) -> Optional["DataFrame"]:
df._df.shrink_to_fit()
return df

def hash_rows(self) -> "pl.Series":
def hash_rows(
self, k0: int = 0, k1: int = 1, k2: int = 2, k3: int = 3
) -> "pl.Series":
"""
Hash and combine the rows in this DataFrame.
Hash value is UInt64
"""
return pl.wrap_s(self._df.hash_rows())
Parameters
----------
k0
seed parameter
k1
seed parameter
k2
seed parameter
k3
seed parameter
"""
return pl.wrap_s(self._df.hash_rows(k0, k1, k2, k3))


class GroupBy:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1558,7 +1558,7 @@ def hash(self, k0: int = 0, k1: int = 1, k2: int = 2, k3: int = 3) -> "pl.Series
"""
Hash the Series.
The hash value is of type `Date64`
The hash value is of type `UInt64`
Parameters
----------
Expand Down
5 changes: 3 additions & 2 deletions py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -886,8 +886,9 @@ impl PyDataFrame {
self.df.shrink_to_fit();
}

pub fn hash_rows(&self) -> PyResult<PySeries> {
let hash = self.df.hash_rows().map_err(PyPolarsEr::from)?;
pub fn hash_rows(&self, k0: u64, k1: u64, k2: u64, k3: u64) -> PyResult<PySeries> {
let hb = ahash::RandomState::with_seeds(k0, k1, k2, k3);
let hash = self.df.hash_rows(Some(hb)).map_err(PyPolarsEr::from)?;
Ok(hash.into_series().into())
}
}
Expand Down

0 comments on commit 173e11f

Please sign in to comment.