Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CHORE]: move utf8 functions from daft-dsl to daft-functions #3101

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 34 additions & 28 deletions daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1095,34 +1095,6 @@ class PyExpr:
def __repr__(self) -> str: ...
def __hash__(self) -> int: ...
def __reduce__(self) -> tuple: ...
def utf8_endswith(self, pattern: PyExpr) -> PyExpr: ...
def utf8_startswith(self, pattern: PyExpr) -> PyExpr: ...
def utf8_contains(self, pattern: PyExpr) -> PyExpr: ...
def utf8_match(self, pattern: PyExpr) -> PyExpr: ...
def utf8_split(self, pattern: PyExpr, regex: bool) -> PyExpr: ...
def utf8_extract(self, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_extract_all(self, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_replace(self, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
def utf8_length(self) -> PyExpr: ...
def utf8_length_bytes(self) -> PyExpr: ...
def utf8_lower(self) -> PyExpr: ...
def utf8_upper(self) -> PyExpr: ...
def utf8_lstrip(self) -> PyExpr: ...
def utf8_rstrip(self) -> PyExpr: ...
def utf8_reverse(self) -> PyExpr: ...
def utf8_capitalize(self) -> PyExpr: ...
def utf8_left(self, nchars: PyExpr) -> PyExpr: ...
def utf8_right(self, nchars: PyExpr) -> PyExpr: ...
def utf8_find(self, substr: PyExpr) -> PyExpr: ...
def utf8_rpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_lpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_repeat(self, n: PyExpr) -> PyExpr: ...
def utf8_like(self, pattern: PyExpr) -> PyExpr: ...
def utf8_ilike(self, pattern: PyExpr) -> PyExpr: ...
def utf8_substr(self, start: PyExpr, length: PyExpr) -> PyExpr: ...
def utf8_to_date(self, format: str) -> PyExpr: ...
def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PyExpr: ...
def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PyExpr: ...
def struct_get(self, name: str) -> PyExpr: ...
def map_get(self, key: PyExpr) -> PyExpr: ...
def partitioning_days(self) -> PyExpr: ...
Expand Down Expand Up @@ -1320,6 +1292,40 @@ def list_max(expr: PyExpr) -> PyExpr: ...
def list_slice(expr: PyExpr, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ...
def list_chunk(expr: PyExpr, size: int) -> PyExpr: ...

# ---
# expr.utf8 namespace
# ---
def utf8_endswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_startswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_contains(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_match(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_split(expr: PyExpr, pattern: PyExpr, regex: bool) -> PyExpr: ...
def utf8_extract(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_extract_all(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_replace(expr: PyExpr, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
def utf8_length(expr: PyExpr) -> PyExpr: ...
def utf8_length_bytes(expr: PyExpr) -> PyExpr: ...
def utf8_lower(expr: PyExpr) -> PyExpr: ...
def utf8_upper(expr: PyExpr) -> PyExpr: ...
def utf8_lstrip(expr: PyExpr) -> PyExpr: ...
def utf8_rstrip(expr: PyExpr) -> PyExpr: ...
def utf8_reverse(expr: PyExpr) -> PyExpr: ...
def utf8_capitalize(expr: PyExpr) -> PyExpr: ...
def utf8_left(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
def utf8_right(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
def utf8_find(expr: PyExpr, substr: PyExpr) -> PyExpr: ...
def utf8_rpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_lpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_repeat(expr: PyExpr, n: PyExpr) -> PyExpr: ...
def utf8_like(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_ilike(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_substr(expr: PyExpr, start: PyExpr, length: PyExpr) -> PyExpr: ...
def utf8_to_date(expr: PyExpr, format: str) -> PyExpr: ...
def utf8_to_datetime(expr: PyExpr, format: str, timezone: str | None = None) -> PyExpr: ...
def utf8_normalize(
expr: PyExpr, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool
) -> PyExpr: ...

class PyCatalog:
@staticmethod
def new() -> PyCatalog: ...
Expand Down
2 changes: 1 addition & 1 deletion daft/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,7 +1601,7 @@ def limit(self, num: int) -> "DataFrame":
│ --- │
│ Int64 │
╞═══════╡
│ 1 │ f
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A small fix: the stray `f` at the end of this docstring line breaks the doc tests.

│ 1 │
├╌╌╌╌╌╌╌┤
│ 2 │
├╌╌╌╌╌╌╌┤
Expand Down
60 changes: 32 additions & 28 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1887,7 +1887,7 @@ def contains(self, substr: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value contains the provided pattern
"""
substr_expr = Expression._to_expression(substr)
return Expression._from_pyexpr(self._expr.utf8_contains(substr_expr._expr))
return Expression._from_pyexpr(native.utf8_contains(self._expr, substr_expr._expr))

def match(self, pattern: str | Expression) -> Expression:
"""Checks whether each string matches the given regular expression pattern in a string column
Expand Down Expand Up @@ -1917,7 +1917,7 @@ def match(self, pattern: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value matches the provided pattern
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_match(pattern_expr._expr))
return Expression._from_pyexpr(native.utf8_match(self._expr, pattern_expr._expr))

def endswith(self, suffix: str | Expression) -> Expression:
"""Checks whether each string ends with the given pattern in a string column
Expand Down Expand Up @@ -1947,7 +1947,7 @@ def endswith(self, suffix: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value ends with the provided pattern
"""
suffix_expr = Expression._to_expression(suffix)
return Expression._from_pyexpr(self._expr.utf8_endswith(suffix_expr._expr))
return Expression._from_pyexpr(native.utf8_endswith(self._expr, suffix_expr._expr))

def startswith(self, prefix: str | Expression) -> Expression:
"""Checks whether each string starts with the given pattern in a string column
Expand Down Expand Up @@ -1977,7 +1977,7 @@ def startswith(self, prefix: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value starts with the provided pattern
"""
prefix_expr = Expression._to_expression(prefix)
return Expression._from_pyexpr(self._expr.utf8_startswith(prefix_expr._expr))
return Expression._from_pyexpr(native.utf8_startswith(self._expr, prefix_expr._expr))

def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
r"""Splits each string on the given literal or regex pattern, into a list of strings.
Expand Down Expand Up @@ -2028,7 +2028,7 @@ def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
Expression: A List[Utf8] expression containing the string splits for each string in the column.
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_split(pattern_expr._expr, regex))
return Expression._from_pyexpr(native.utf8_split(self._expr, pattern_expr._expr, regex))

def concat(self, other: str | Expression) -> Expression:
"""Concatenates two string expressions together
Expand Down Expand Up @@ -2119,7 +2119,7 @@ def extract(self, pattern: str | Expression, index: int = 0) -> Expression:
`extract_all`
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_extract(pattern_expr._expr, index))
return Expression._from_pyexpr(native.utf8_extract(self._expr, pattern_expr._expr, index))

def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
r"""Extracts the specified match group from all regex matches in each string in a string column.
Expand Down Expand Up @@ -2175,7 +2175,7 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
`extract`
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_extract_all(pattern_expr._expr, index))
return Expression._from_pyexpr(native.utf8_extract_all(self._expr, pattern_expr._expr, index))

def replace(
self,
Expand Down Expand Up @@ -2232,7 +2232,9 @@ def replace(
"""
pattern_expr = Expression._to_expression(pattern)
replacement_expr = Expression._to_expression(replacement)
return Expression._from_pyexpr(self._expr.utf8_replace(pattern_expr._expr, replacement_expr._expr, regex))
return Expression._from_pyexpr(
native.utf8_replace(self._expr, pattern_expr._expr, replacement_expr._expr, regex)
)

def length(self) -> Expression:
"""Retrieves the length for a UTF-8 string column
Expand All @@ -2259,7 +2261,7 @@ def length(self) -> Expression:
Returns:
Expression: an UInt64 expression with the length of each string
"""
return Expression._from_pyexpr(self._expr.utf8_length())
return Expression._from_pyexpr(native.utf8_length(self._expr))

def length_bytes(self) -> Expression:
"""Retrieves the length for a UTF-8 string column in bytes.
Expand All @@ -2286,7 +2288,7 @@ def length_bytes(self) -> Expression:
Returns:
Expression: an UInt64 expression with the length of each string
"""
return Expression._from_pyexpr(self._expr.utf8_length_bytes())
return Expression._from_pyexpr(native.utf8_length_bytes(self._expr))

def lower(self) -> Expression:
"""Convert UTF-8 string to all lowercase
Expand All @@ -2313,7 +2315,7 @@ def lower(self) -> Expression:
Returns:
Expression: a String expression which is `self` lowercased
"""
return Expression._from_pyexpr(self._expr.utf8_lower())
return Expression._from_pyexpr(native.utf8_lower(self._expr))

def upper(self) -> Expression:
"""Convert UTF-8 string to all upper
Expand All @@ -2340,7 +2342,7 @@ def upper(self) -> Expression:
Returns:
Expression: a String expression which is `self` uppercased
"""
return Expression._from_pyexpr(self._expr.utf8_upper())
return Expression._from_pyexpr(native.utf8_upper(self._expr))

def lstrip(self) -> Expression:
"""Strip whitespace from the left side of a UTF-8 string
Expand All @@ -2367,7 +2369,7 @@ def lstrip(self) -> Expression:
Returns:
Expression: a String expression which is `self` with leading whitespace stripped
"""
return Expression._from_pyexpr(self._expr.utf8_lstrip())
return Expression._from_pyexpr(native.utf8_lstrip(self._expr))

def rstrip(self) -> Expression:
"""Strip whitespace from the right side of a UTF-8 string
Expand All @@ -2394,7 +2396,7 @@ def rstrip(self) -> Expression:
Returns:
Expression: a String expression which is `self` with trailing whitespace stripped
"""
return Expression._from_pyexpr(self._expr.utf8_rstrip())
return Expression._from_pyexpr(native.utf8_rstrip(self._expr))

def reverse(self) -> Expression:
"""Reverse a UTF-8 string
Expand All @@ -2421,7 +2423,7 @@ def reverse(self) -> Expression:
Returns:
Expression: a String expression which is `self` reversed
"""
return Expression._from_pyexpr(self._expr.utf8_reverse())
return Expression._from_pyexpr(native.utf8_reverse(self._expr))

def capitalize(self) -> Expression:
"""Capitalize a UTF-8 string
Expand All @@ -2448,7 +2450,7 @@ def capitalize(self) -> Expression:
Returns:
Expression: a String expression which is `self` with the first character uppercased and the rest lowercased
"""
return Expression._from_pyexpr(self._expr.utf8_capitalize())
return Expression._from_pyexpr(native.utf8_capitalize(self._expr))

def left(self, nchars: int | Expression) -> Expression:
"""Gets the n (from nchars) left-most characters of each string
Expand Down Expand Up @@ -2476,7 +2478,7 @@ def left(self, nchars: int | Expression) -> Expression:
Expression: a String expression which is the `n` left-most characters of `self`
"""
nchars_expr = Expression._to_expression(nchars)
return Expression._from_pyexpr(self._expr.utf8_left(nchars_expr._expr))
return Expression._from_pyexpr(native.utf8_left(self._expr, nchars_expr._expr))

def right(self, nchars: int | Expression) -> Expression:
"""Gets the n (from nchars) right-most characters of each string
Expand Down Expand Up @@ -2504,7 +2506,7 @@ def right(self, nchars: int | Expression) -> Expression:
Expression: a String expression which is the `n` right-most characters of `self`
"""
nchars_expr = Expression._to_expression(nchars)
return Expression._from_pyexpr(self._expr.utf8_right(nchars_expr._expr))
return Expression._from_pyexpr(native.utf8_right(self._expr, nchars_expr._expr))

def find(self, substr: str | Expression) -> Expression:
"""Returns the index of the first occurrence of the substring in each string
Expand Down Expand Up @@ -2536,7 +2538,7 @@ def find(self, substr: str | Expression) -> Expression:
Expression: an Int64 expression with the index of the first occurrence of the substring in each string
"""
substr_expr = Expression._to_expression(substr)
return Expression._from_pyexpr(self._expr.utf8_find(substr_expr._expr))
return Expression._from_pyexpr(native.utf8_find(self._expr, substr_expr._expr))

def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""Right-pads each string by truncating or padding with the character
Expand Down Expand Up @@ -2569,7 +2571,7 @@ def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""
length_expr = Expression._to_expression(length)
pad_expr = Expression._to_expression(pad)
return Expression._from_pyexpr(self._expr.utf8_rpad(length_expr._expr, pad_expr._expr))
return Expression._from_pyexpr(native.utf8_rpad(self._expr, length_expr._expr, pad_expr._expr))

def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""Left-pads each string by truncating on the right or padding with the character
Expand Down Expand Up @@ -2602,7 +2604,7 @@ def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""
length_expr = Expression._to_expression(length)
pad_expr = Expression._to_expression(pad)
return Expression._from_pyexpr(self._expr.utf8_lpad(length_expr._expr, pad_expr._expr))
return Expression._from_pyexpr(native.utf8_lpad(self._expr, length_expr._expr, pad_expr._expr))

def repeat(self, n: int | Expression) -> Expression:
"""Repeats each string n times
Expand Down Expand Up @@ -2630,7 +2632,7 @@ def repeat(self, n: int | Expression) -> Expression:
Expression: a String expression which is `self` repeated `n` times
"""
n_expr = Expression._to_expression(n)
return Expression._from_pyexpr(self._expr.utf8_repeat(n_expr._expr))
return Expression._from_pyexpr(native.utf8_repeat(self._expr, n_expr._expr))

def like(self, pattern: str | Expression) -> Expression:
"""Checks whether each string matches the given SQL LIKE pattern, case sensitive
Expand Down Expand Up @@ -2661,7 +2663,7 @@ def like(self, pattern: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value matches the provided pattern
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_like(pattern_expr._expr))
return Expression._from_pyexpr(native.utf8_like(self._expr, pattern_expr._expr))

def ilike(self, pattern: str | Expression) -> Expression:
"""Checks whether each string matches the given SQL LIKE pattern, case insensitive
Expand Down Expand Up @@ -2692,7 +2694,7 @@ def ilike(self, pattern: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value matches the provided pattern
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_ilike(pattern_expr._expr))
return Expression._from_pyexpr(native.utf8_ilike(self._expr, pattern_expr._expr))

def substr(self, start: int | Expression, length: int | Expression | None = None) -> Expression:
"""Extract a substring from a string, starting at a specified index and extending for a given length.
Expand Down Expand Up @@ -2724,7 +2726,7 @@ def substr(self, start: int | Expression, length: int | Expression | None = None
"""
start_expr = Expression._to_expression(start)
length_expr = Expression._to_expression(length)
return Expression._from_pyexpr(self._expr.utf8_substr(start_expr._expr, length_expr._expr))
return Expression._from_pyexpr(native.utf8_substr(self._expr, start_expr._expr, length_expr._expr))

def to_date(self, format: str) -> Expression:
"""Converts a string to a date using the specified format
Expand Down Expand Up @@ -2755,7 +2757,7 @@ def to_date(self, format: str) -> Expression:
Returns:
Expression: a Date expression which is parsed by given format
"""
return Expression._from_pyexpr(self._expr.utf8_to_date(format))
return Expression._from_pyexpr(native.utf8_to_date(self._expr, format))

def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
"""Converts a string to a datetime using the specified format and timezone
Expand Down Expand Up @@ -2805,7 +2807,7 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
Returns:
Expression: a DateTime expression which is parsed by given format and timezone
"""
return Expression._from_pyexpr(self._expr.utf8_to_datetime(format, timezone))
return Expression._from_pyexpr(native.utf8_to_datetime(self._expr, format, timezone))

def normalize(
self,
Expand Down Expand Up @@ -2849,7 +2851,9 @@ def normalize(
Returns:
Expression: a String expression which is normalized.
"""
return Expression._from_pyexpr(self._expr.utf8_normalize(remove_punct, lowercase, nfd_unicode, white_space))
return Expression._from_pyexpr(
native.utf8_normalize(self._expr, remove_punct, lowercase, nfd_unicode, white_space)
)

def tokenize_encode(
self,
Expand Down
8 changes: 1 addition & 7 deletions src/daft-dsl/src/functions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ pub mod python;
pub mod scalar;
pub mod sketch;
pub mod struct_;
pub mod utf8;

use std::{
fmt::{Display, Formatter, Result, Write},
Expand All @@ -18,15 +17,11 @@ use python::PythonUDF;
pub use scalar::*;
use serde::{Deserialize, Serialize};

use self::{
map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr,
utf8::Utf8Expr,
};
use self::{map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr};
use crate::{Expr, ExprRef, Operator};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum FunctionExpr {
Utf8(Utf8Expr),
Map(MapExpr),
Sketch(SketchExpr),
Struct(StructExpr),
Expand All @@ -49,7 +44,6 @@ impl FunctionExpr {
#[inline]
fn get_evaluator(&self) -> &dyn FunctionEvaluator {
match self {
Self::Utf8(expr) => expr.get_evaluator(),
Self::Map(expr) => expr.get_evaluator(),
Self::Sketch(expr) => expr.get_evaluator(),
Self::Struct(expr) => expr.get_evaluator(),
Expand Down
Loading
Loading