Eventual-Inc · ConeyLiu · Oct 22, 2024 · Oct 22, 2024 · Oct 22, 2024 · Oct 22, 2024
diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi
@@ -1095,34 +1095,6 @@ class PyExpr:
     def __repr__(self) -> str: ...
     def __hash__(self) -> int: ...
     def __reduce__(self) -> tuple: ...
-    def utf8_endswith(self, pattern: PyExpr) -> PyExpr: ...
-    def utf8_startswith(self, pattern: PyExpr) -> PyExpr: ...
-    def utf8_contains(self, pattern: PyExpr) -> PyExpr: ...
-    def utf8_match(self, pattern: PyExpr) -> PyExpr: ...
-    def utf8_split(self, pattern: PyExpr, regex: bool) -> PyExpr: ...
-    def utf8_extract(self, pattern: PyExpr, index: int) -> PyExpr: ...
-    def utf8_extract_all(self, pattern: PyExpr, index: int) -> PyExpr: ...
-    def utf8_replace(self, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
-    def utf8_length(self) -> PyExpr: ...
-    def utf8_length_bytes(self) -> PyExpr: ...
-    def utf8_lower(self) -> PyExpr: ...
-    def utf8_upper(self) -> PyExpr: ...
-    def utf8_lstrip(self) -> PyExpr: ...
-    def utf8_rstrip(self) -> PyExpr: ...
-    def utf8_reverse(self) -> PyExpr: ...
-    def utf8_capitalize(self) -> PyExpr: ...
-    def utf8_left(self, nchars: PyExpr) -> PyExpr: ...
-    def utf8_right(self, nchars: PyExpr) -> PyExpr: ...
-    def utf8_find(self, substr: PyExpr) -> PyExpr: ...
-    def utf8_rpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
-    def utf8_lpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
-    def utf8_repeat(self, n: PyExpr) -> PyExpr: ...
-    def utf8_like(self, pattern: PyExpr) -> PyExpr: ...
-    def utf8_ilike(self, pattern: PyExpr) -> PyExpr: ...
-    def utf8_substr(self, start: PyExpr, length: PyExpr) -> PyExpr: ...
-    def utf8_to_date(self, format: str) -> PyExpr: ...
-    def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PyExpr: ...
-    def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PyExpr: ...
     def struct_get(self, name: str) -> PyExpr: ...
     def map_get(self, key: PyExpr) -> PyExpr: ...
     def partitioning_days(self) -> PyExpr: ...
@@ -1320,6 +1292,40 @@ def list_max(expr: PyExpr) -> PyExpr: ...
 def list_slice(expr: PyExpr, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ...
 def list_chunk(expr: PyExpr, size: int) -> PyExpr: ...
 
+# ---
+# expr.utf8 namespace (free functions: the target PyExpr is passed explicitly as the first argument `expr`)
+# ---
+def utf8_endswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
+def utf8_startswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
+def utf8_contains(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
+def utf8_match(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
+def utf8_split(expr: PyExpr, pattern: PyExpr, regex: bool) -> PyExpr: ...
+def utf8_extract(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
+def utf8_extract_all(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
+def utf8_replace(expr: PyExpr, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
+def utf8_length(expr: PyExpr) -> PyExpr: ...
+def utf8_length_bytes(expr: PyExpr) -> PyExpr: ...
+def utf8_lower(expr: PyExpr) -> PyExpr: ...
+def utf8_upper(expr: PyExpr) -> PyExpr: ...
+def utf8_lstrip(expr: PyExpr) -> PyExpr: ...
+def utf8_rstrip(expr: PyExpr) -> PyExpr: ...
+def utf8_reverse(expr: PyExpr) -> PyExpr: ...
+def utf8_capitalize(expr: PyExpr) -> PyExpr: ...
+def utf8_left(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
+def utf8_right(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
+def utf8_find(expr: PyExpr, substr: PyExpr) -> PyExpr: ...
+def utf8_rpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
+def utf8_lpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
+def utf8_repeat(expr: PyExpr, n: PyExpr) -> PyExpr: ...
+def utf8_like(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
+def utf8_ilike(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
+def utf8_substr(expr: PyExpr, start: PyExpr, length: PyExpr) -> PyExpr: ...
+def utf8_to_date(expr: PyExpr, format: str) -> PyExpr: ...
+def utf8_to_datetime(expr: PyExpr, format: str, timezone: str | None = None) -> PyExpr: ...
+def utf8_normalize(
+    expr: PyExpr, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool
+) -> PyExpr: ...
+
 class PyCatalog:
     @staticmethod
     def new() -> PyCatalog: ...

diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py
@@ -1601,7 +1601,7 @@ def limit(self, num: int) -> "DataFrame":
             │ ---   │
             │ Int64 │
             ╞═══════╡
-            │ 1     │ f
+            │ 1     │
             ├╌╌╌╌╌╌╌┤
             │ 2     │
             ├╌╌╌╌╌╌╌┤

diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
@@ -1887,7 +1887,7 @@ def contains(self, substr: str | Expression) -> Expression:
             Expression: a Boolean expression indicating whether each value contains the provided pattern
         """
         substr_expr = Expression._to_expression(substr)
-        return Expression._from_pyexpr(self._expr.utf8_contains(substr_expr._expr))
+        return Expression._from_pyexpr(native.utf8_contains(self._expr, substr_expr._expr))
 
     def match(self, pattern: str | Expression) -> Expression:
         """Checks whether each string matches the given regular expression pattern in a string column
@@ -1917,7 +1917,7 @@ def match(self, pattern: str | Expression) -> Expression:
             Expression: a Boolean expression indicating whether each value matches the provided pattern
         """
         pattern_expr = Expression._to_expression(pattern)
-        return Expression._from_pyexpr(self._expr.utf8_match(pattern_expr._expr))
+        return Expression._from_pyexpr(native.utf8_match(self._expr, pattern_expr._expr))
 
     def endswith(self, suffix: str | Expression) -> Expression:
         """Checks whether each string ends with the given pattern in a string column
@@ -1947,7 +1947,7 @@ def endswith(self, suffix: str | Expression) -> Expression:
             Expression: a Boolean expression indicating whether each value ends with the provided pattern
         """
         suffix_expr = Expression._to_expression(suffix)
-        return Expression._from_pyexpr(self._expr.utf8_endswith(suffix_expr._expr))
+        return Expression._from_pyexpr(native.utf8_endswith(self._expr, suffix_expr._expr))
 
     def startswith(self, prefix: str | Expression) -> Expression:
         """Checks whether each string starts with the given pattern in a string column
@@ -1977,7 +1977,7 @@ def startswith(self, prefix: str | Expression) -> Expression:
             Expression: a Boolean expression indicating whether each value starts with the provided pattern
         """
         prefix_expr = Expression._to_expression(prefix)
-        return Expression._from_pyexpr(self._expr.utf8_startswith(prefix_expr._expr))
+        return Expression._from_pyexpr(native.utf8_startswith(self._expr, prefix_expr._expr))
 
     def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
         r"""Splits each string on the given literal or regex pattern, into a list of strings.
@@ -2028,7 +2028,7 @@ def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
             Expression: A List[Utf8] expression containing the string splits for each string in the column.
         """
         pattern_expr = Expression._to_expression(pattern)
-        return Expression._from_pyexpr(self._expr.utf8_split(pattern_expr._expr, regex))
+        return Expression._from_pyexpr(native.utf8_split(self._expr, pattern_expr._expr, regex))
 
     def concat(self, other: str | Expression) -> Expression:
         """Concatenates two string expressions together
@@ -2119,7 +2119,7 @@ def extract(self, pattern: str | Expression, index: int = 0) -> Expression:
             `extract_all`
         """
         pattern_expr = Expression._to_expression(pattern)
-        return Expression._from_pyexpr(self._expr.utf8_extract(pattern_expr._expr, index))
+        return Expression._from_pyexpr(native.utf8_extract(self._expr, pattern_expr._expr, index))
 
     def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
         r"""Extracts the specified match group from all regex matches in each string in a string column.
@@ -2175,7 +2175,7 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
             `extract`
         """
         pattern_expr = Expression._to_expression(pattern)
-        return Expression._from_pyexpr(self._expr.utf8_extract_all(pattern_expr._expr, index))
+        return Expression._from_pyexpr(native.utf8_extract_all(self._expr, pattern_expr._expr, index))
 
     def replace(
         self,
@@ -2232,7 +2232,9 @@ def replace(
         """
         pattern_expr = Expression._to_expression(pattern)
         replacement_expr = Expression._to_expression(replacement)
-        return Expression._from_pyexpr(self._expr.utf8_replace(pattern_expr._expr, replacement_expr._expr, regex))
+        return Expression._from_pyexpr(
+            native.utf8_replace(self._expr, pattern_expr._expr, replacement_expr._expr, regex)
+        )
 
     def length(self) -> Expression:
         """Retrieves the length for a UTF-8 string column
@@ -2259,7 +2261,7 @@ def length(self) -> Expression:
         Returns:
             Expression: an UInt64 expression with the length of each string
         """
-        return Expression._from_pyexpr(self._expr.utf8_length())
+        return Expression._from_pyexpr(native.utf8_length(self._expr))
 
     def length_bytes(self) -> Expression:
         """Retrieves the length for a UTF-8 string column in bytes.
@@ -2286,7 +2288,7 @@ def length_bytes(self) -> Expression:
         Returns:
             Expression: an UInt64 expression with the length of each string
         """
-        return Expression._from_pyexpr(self._expr.utf8_length_bytes())
+        return Expression._from_pyexpr(native.utf8_length_bytes(self._expr))
 
     def lower(self) -> Expression:
         """Convert UTF-8 string to all lowercase
@@ -2313,7 +2315,7 @@ def lower(self) -> Expression:
         Returns:
             Expression: a String expression which is `self` lowercased
         """
-        return Expression._from_pyexpr(self._expr.utf8_lower())
+        return Expression._from_pyexpr(native.utf8_lower(self._expr))
 
     def upper(self) -> Expression:
         """Convert UTF-8 string to all upper
@@ -2340,7 +2342,7 @@ def upper(self) -> Expression:
         Returns:
             Expression: a String expression which is `self` uppercased
         """
-        return Expression._from_pyexpr(self._expr.utf8_upper())
+        return Expression._from_pyexpr(native.utf8_upper(self._expr))
 
     def lstrip(self) -> Expression:
         """Strip whitespace from the left side of a UTF-8 string
@@ -2367,7 +2369,7 @@ def lstrip(self) -> Expression:
         Returns:
             Expression: a String expression which is `self` with leading whitespace stripped
         """
-        return Expression._from_pyexpr(self._expr.utf8_lstrip())
+        return Expression._from_pyexpr(native.utf8_lstrip(self._expr))
 
     def rstrip(self) -> Expression:
         """Strip whitespace from the right side of a UTF-8 string
@@ -2394,7 +2396,7 @@ def rstrip(self) -> Expression:
         Returns:
             Expression: a String expression which is `self` with trailing whitespace stripped
         """
-        return Expression._from_pyexpr(self._expr.utf8_rstrip())
+        return Expression._from_pyexpr(native.utf8_rstrip(self._expr))
 
     def reverse(self) -> Expression:
         """Reverse a UTF-8 string
@@ -2421,7 +2423,7 @@ def reverse(self) -> Expression:
         Returns:
             Expression: a String expression which is `self` reversed
         """
-        return Expression._from_pyexpr(self._expr.utf8_reverse())
+        return Expression._from_pyexpr(native.utf8_reverse(self._expr))
 
     def capitalize(self) -> Expression:
         """Capitalize a UTF-8 string
@@ -2448,7 +2450,7 @@ def capitalize(self) -> Expression:
         Returns:
             Expression: a String expression which is `self` uppercased with the first character and lowercased the rest
         """
-        return Expression._from_pyexpr(self._expr.utf8_capitalize())
+        return Expression._from_pyexpr(native.utf8_capitalize(self._expr))
 
     def left(self, nchars: int | Expression) -> Expression:
         """Gets the n (from nchars) left-most characters of each string
@@ -2476,7 +2478,7 @@ def left(self, nchars: int | Expression) -> Expression:
             Expression: a String expression which is the `n` left-most characters of `self`
         """
         nchars_expr = Expression._to_expression(nchars)
-        return Expression._from_pyexpr(self._expr.utf8_left(nchars_expr._expr))
+        return Expression._from_pyexpr(native.utf8_left(self._expr, nchars_expr._expr))
 
     def right(self, nchars: int | Expression) -> Expression:
         """Gets the n (from nchars) right-most characters of each string
@@ -2504,7 +2506,7 @@ def right(self, nchars: int | Expression) -> Expression:
             Expression: a String expression which is the `n` right-most characters of `self`
         """
         nchars_expr = Expression._to_expression(nchars)
-        return Expression._from_pyexpr(self._expr.utf8_right(nchars_expr._expr))
+        return Expression._from_pyexpr(native.utf8_right(self._expr, nchars_expr._expr))
 
     def find(self, substr: str | Expression) -> Expression:
         """Returns the index of the first occurrence of the substring in each string
@@ -2536,7 +2538,7 @@ def find(self, substr: str | Expression) -> Expression:
             Expression: an Int64 expression with the index of the first occurrence of the substring in each string
         """
         substr_expr = Expression._to_expression(substr)
-        return Expression._from_pyexpr(self._expr.utf8_find(substr_expr._expr))
+        return Expression._from_pyexpr(native.utf8_find(self._expr, substr_expr._expr))
 
     def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
         """Right-pads each string by truncating or padding with the character
@@ -2569,7 +2571,7 @@ def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
         """
         length_expr = Expression._to_expression(length)
         pad_expr = Expression._to_expression(pad)
-        return Expression._from_pyexpr(self._expr.utf8_rpad(length_expr._expr, pad_expr._expr))
+        return Expression._from_pyexpr(native.utf8_rpad(self._expr, length_expr._expr, pad_expr._expr))
 
     def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
         """Left-pads each string by truncating on the right or padding with the character
@@ -2602,7 +2604,7 @@ def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
         """
         length_expr = Expression._to_expression(length)
         pad_expr = Expression._to_expression(pad)
-        return Expression._from_pyexpr(self._expr.utf8_lpad(length_expr._expr, pad_expr._expr))
+        return Expression._from_pyexpr(native.utf8_lpad(self._expr, length_expr._expr, pad_expr._expr))
 
     def repeat(self, n: int | Expression) -> Expression:
         """Repeats each string n times
@@ -2630,7 +2632,7 @@ def repeat(self, n: int | Expression) -> Expression:
             Expression: a String expression which is `self` repeated `n` times
         """
         n_expr = Expression._to_expression(n)
-        return Expression._from_pyexpr(self._expr.utf8_repeat(n_expr._expr))
+        return Expression._from_pyexpr(native.utf8_repeat(self._expr, n_expr._expr))
 
     def like(self, pattern: str | Expression) -> Expression:
         """Checks whether each string matches the given SQL LIKE pattern, case sensitive
@@ -2661,7 +2663,7 @@ def like(self, pattern: str | Expression) -> Expression:
             Expression: a Boolean expression indicating whether each value matches the provided pattern
         """
         pattern_expr = Expression._to_expression(pattern)
-        return Expression._from_pyexpr(self._expr.utf8_like(pattern_expr._expr))
+        return Expression._from_pyexpr(native.utf8_like(self._expr, pattern_expr._expr))
 
     def ilike(self, pattern: str | Expression) -> Expression:
         """Checks whether each string matches the given SQL LIKE pattern, case insensitive
@@ -2692,7 +2694,7 @@ def ilike(self, pattern: str | Expression) -> Expression:
             Expression: a Boolean expression indicating whether each value matches the provided pattern
         """
         pattern_expr = Expression._to_expression(pattern)
-        return Expression._from_pyexpr(self._expr.utf8_ilike(pattern_expr._expr))
+        return Expression._from_pyexpr(native.utf8_ilike(self._expr, pattern_expr._expr))
 
     def substr(self, start: int | Expression, length: int | Expression | None = None) -> Expression:
         """Extract a substring from a string, starting at a specified index and extending for a given length.
@@ -2724,7 +2726,7 @@ def substr(self, start: int | Expression, length: int | Expression | None = None
         """
         start_expr = Expression._to_expression(start)
         length_expr = Expression._to_expression(length)
-        return Expression._from_pyexpr(self._expr.utf8_substr(start_expr._expr, length_expr._expr))
+        return Expression._from_pyexpr(native.utf8_substr(self._expr, start_expr._expr, length_expr._expr))
 
     def to_date(self, format: str) -> Expression:
         """Converts a string to a date using the specified format
@@ -2755,7 +2757,7 @@ def to_date(self, format: str) -> Expression:
         Returns:
             Expression: a Date expression which is parsed by given format
         """
-        return Expression._from_pyexpr(self._expr.utf8_to_date(format))
+        return Expression._from_pyexpr(native.utf8_to_date(self._expr, format))
 
     def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
         """Converts a string to a datetime using the specified format and timezone
@@ -2805,7 +2807,7 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
         Returns:
             Expression: a DateTime expression which is parsed by given format and timezone
         """
-        return Expression._from_pyexpr(self._expr.utf8_to_datetime(format, timezone))
+        return Expression._from_pyexpr(native.utf8_to_datetime(self._expr, format, timezone))
 
     def normalize(
         self,
@@ -2849,7 +2851,9 @@ def normalize(
         Returns:
             Expression: a String expression which is normalized.
         """
-        return Expression._from_pyexpr(self._expr.utf8_normalize(remove_punct, lowercase, nfd_unicode, white_space))
+        return Expression._from_pyexpr(
+            native.utf8_normalize(self._expr, remove_punct, lowercase, nfd_unicode, white_space)
+        )
 
     def tokenize_encode(
         self,

diff --git a/src/daft-dsl/src/functions/mod.rs b/src/daft-dsl/src/functions/mod.rs
@@ -5,7 +5,6 @@ pub mod python;
 pub mod scalar;
 pub mod sketch;
 pub mod struct_;
-pub mod utf8;
 
 use std::{
     fmt::{Display, Formatter, Result, Write},
@@ -18,15 +17,11 @@ use python::PythonUDF;
 pub use scalar::*;
 use serde::{Deserialize, Serialize};
 
-use self::{
-    map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr,
-    utf8::Utf8Expr,
-};
+use self::{map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr};
 use crate::{Expr, ExprRef, Operator};
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
 pub enum FunctionExpr {
-    Utf8(Utf8Expr),
     Map(MapExpr),
     Sketch(SketchExpr),
     Struct(StructExpr),
@@ -49,7 +44,6 @@ impl FunctionExpr {
     #[inline]
     fn get_evaluator(&self) -> &dyn FunctionEvaluator {
         match self {
-            Self::Utf8(expr) => expr.get_evaluator(),
             Self::Map(expr) => expr.get_evaluator(),
             Self::Sketch(expr) => expr.get_evaluator(),
             Self::Struct(expr) => expr.get_evaluator(),