assign: Include type promotion in assign
aditya-nambiar committed Sep 6, 2024
1 parent e10f4fa commit 1d22322
Showing 12 changed files with 712 additions and 419 deletions.
3 changes: 3 additions & 0 deletions fennel/CHANGELOG.md
@@ -1,5 +1,8 @@
# Changelog

## [1.5.18] - 2024-09-05
- Struct initializer + arrow fixes + type promotion in assign

## [1.5.17] - 2024-09-04
- Add support for several more expressions

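The headline change here, type promotion in `assign`, lets an expression whose evaluated type is narrower than the declared `astype` target pass schema validation. A minimal sketch of the newly accepted expressions, mirroring the `OrdersOptional` pipeline added to `fennel/client_tests/test_dataset.py` later in this commit:

    from typing import Optional

    from fennel.expr.expr import col

    # Sketch only: assumes a dataset whose `uid` column is an `int`, as in
    # the Orders dataset below. Previously the evaluated type had to match
    # the astype target exactly; now an int promotes to wider types.
    uid_opt = col("uid").astype(Optional[int])    # int -> Optional[int]
    uid_float = col("uid").astype(float)          # int -> float
    uid_twice = (col("uid") * 2.0).astype(float)  # int * float -> float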
151 changes: 150 additions & 1 deletion fennel/client_tests/test_complex_struct.py
@@ -8,6 +8,7 @@
from fennel.connectors import Webhook, source
from fennel.datasets import dataset, Dataset, field, pipeline, LastK
from fennel.dtypes import struct, Continuous
from fennel.expr.expr import col, make_struct
from fennel.featuresets import featureset, feature as F, extractor
from fennel.lib import inputs, outputs
from fennel.testing import mock
@@ -87,6 +88,118 @@ def movie_info(cls, movie: Dataset):
)


@dataset(index=True)
class MovieInfoExpr:
director_id: int = field(key=True)
movie_id: int = field(key=True)
role_list: List[Role]
timestamp: datetime = field(timestamp=True)

@pipeline
@inputs(MovieDS)
def movie_info(cls, movie: Dataset):
return (
movie.assign(
role=make_struct(
{
"role_id": col("role_id"),
"name": col("name"),
"cost": col("cost"),
},
Role,
).astype(Role)
)
.drop(columns=["role_id", "name", "cost"])
.groupby("director_id", "movie_id")
.aggregate(
LastK(
into_field="role_list",
of="role",
window=Continuous("forever"),
limit=3,
dedup=False,
),
)
)


@dataset(index=True)
class MovieInfoExpr2:
director_id: int = field(key=True)
movie_id: int = field(key=True)
role_list: List[Role]
timestamp: datetime = field(timestamp=True)

@pipeline
@inputs(MovieDS)
def movie_info(cls, movie: Dataset):
return (
movie.assign(
role=Role.expr( # type: ignore
role_id=col("role_id"), name=col("name"), cost=col("cost")
).astype(Role)
)
.drop(columns=["role_id", "name", "cost"])
.groupby("director_id", "movie_id")
.aggregate(
LastK(
into_field="role_list",
of="role",
window=Continuous("forever"),
limit=3,
dedup=False,
),
)
)


@struct
class FullName:
first_name: str
last_name: str


@struct
class RoleExtended:
role_id: int
name: FullName
cost: int


@dataset(index=True)
class MovieInfoExprNested:
director_id: int = field(key=True)
movie_id: int = field(key=True)
role_list: List[RoleExtended]
timestamp: datetime = field(timestamp=True)

@pipeline
@inputs(MovieDS)
def movie_info(cls, movie: Dataset):
return (
movie.assign(
role=RoleExtended.expr( # type: ignore
role_id=col("role_id"),
name=FullName.expr( # type: ignore
first_name=col("name"), last_name="rando"
),
cost=col("cost"),
).astype(RoleExtended)
)
.drop(columns=["role_id", "name", "cost"])
.groupby("director_id", "movie_id")
.aggregate(
LastK(
into_field="role_list",
of="role",
window=Continuous("forever"),
limit=3,
dedup=False,
),
)
)


@featureset
class Request:
director_id: int
@@ -218,7 +331,13 @@ def test_complex_struct(client):

client.commit(
message="msg",
datasets=[MovieDS, MovieInfo],
datasets=[
MovieDS,
MovieInfo,
MovieInfoExpr,
MovieInfoExpr2,
MovieInfoExprNested,
],
featuresets=[Request, MovieFeatures],
)

@@ -255,6 +374,36 @@ def test_complex_struct(client):
input_dataframe=input_df,
)

res1, found1 = client.lookup(
"MovieInfo",
keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}),
)
res2, found2 = client.lookup(
"MovieInfoExpr",
keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}),
)
res3, found3 = client.lookup(
"MovieInfoExpr2",
keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}),
)
assert res1.shape == res2.shape
assert res1.shape == res3.shape
for c in res1.columns:
assert res1[c].equals(res2[c])
assert res1[c].equals(res3[c])
assert list(found1) == list(found2)
assert list(found1) == list(found3)

res4, found4 = client.lookup(
"MovieInfoExprNested",
keys=pd.DataFrame({"director_id": [1, 2], "movie_id": [1, 3]}),
)
assert res1.shape == res4.shape
assert list(found1) == list(found4)
for r in res4["role_list"]:
for role in r:
assert role.name.last_name == "rando"

assert df.shape[0] == 4
assert len(df["MovieFeatures.role_list_py"].tolist()[0]) == 3
assert df["MovieFeatures.role_list_py"].tolist()[0][0].as_json() == {
127 changes: 52 additions & 75 deletions fennel/client_tests/test_dataset.py
@@ -606,81 +606,6 @@ class UserInfoDataset:
)


# On demand datasets are not supported for now.

# class TestDocumentDataset(unittest.TestCase):
# @mock_client
# def test_log_to_document_dataset(self, client):
# """Log some data to the dataset and check if it is logged correctly."""
#
# @meta(owner="aditya@fennel.ai")
# @dataset
# class DocumentContentDataset:
# doc_id: int = field(key=True)
# bert_embedding: Embedding[4]
# fast_text_embedding: Embedding[3]
# num_words: int
# timestamp: datetime = field(timestamp=True)
#
# @on_demand(expires_after="3d")
# @inputs(datetime, int)
# def get_embedding(cls, ts: pd.Series, doc_ids: pd.Series):
# data = []
# doc_ids = doc_ids.tolist()
# for i in range(len(ts)):
# data.append(
# [
# doc_ids[i],
# [0.1, 0.2, 0.3, 0.4],
# [1.1, 1.2, 1.3],
# 10 * i,
# ts[i],
# ]
# )
# columns = [
# str(cls.doc_id),
# str(cls.bert_embedding),
# str(cls.fast_text_embedding),
# str(cls.num_words),
# str(cls.timestamp),
# ]
# return pd.DataFrame(data, columns=columns), pd.Series(
# [True] * len(ts)
# )
#
# # Sync the dataset
# client.commit(datasets=[DocumentContentDataset])
# now = datetime.now(timezone.utc)
# data = [
# [18232, np.array([1, 2, 3, 4]), np.array([1, 2, 3]), 10, now],
# [
# 18234,
# np.array([1, 2.2, 0.213, 0.343]),
# np.array([0.87, 2, 3]),
# 9,
# now,
# ],
# [18934, [1, 2.2, 0.213, 0.343], [0.87, 2, 3], 12, now],
# ]
# columns = [
# "doc_id",
# "bert_embedding",
# "fast_text_embedding",
# "num_words",
# "timestamp",
# ]
# df = pd.DataFrame(data, columns=columns)
# response = client.log("fennel_webhook","DocumentContentDataset", df)
# assert response.status_code == requests.codes.OK, response.json()
#
# # Do some lookups
# doc_ids = pd.Series([18232, 1728, 18234, 18934, 19200, 91012])
# ts = pd.Series([now, now, now, now, now, now])
# df, _ = DocumentContentDataset.lookup(ts, doc_id=doc_ids)
# assert df.shape == (6, 5)
# assert df["num_words"].tolist() == [10.0, 9.0, 12, 0, 10.0, 20.0]


################################################################################
# Dataset & Pipelines Unit Tests
################################################################################
@@ -1012,6 +937,25 @@ class Orders:
timestamp: datetime


@dataset
class OrdersOptional:
uid: Optional[int]
uid_float: float
uid_twice: float
skus: List[int]
prices: List[float]
timestamp: datetime

@pipeline
@inputs(Orders)
def cast(cls, ds: Dataset):
return ds.assign(
uid=col("uid").astype(Optional[int]), # type: ignore
uid_float=col("uid").astype(float), # type: ignore
uid_twice=(col("uid") * 2.0).astype(float), # type: ignore
)


@dataset(index=True)
class Derived:
uid: int = field(key=True)
@@ -1066,6 +1010,39 @@ def test_basic_explode(self, client):
assert df["price"].tolist()[0] == 10.1
assert pd.isna(df["price"].tolist()[1])

@pytest.mark.integration
@mock
def test_basic_cast(self, client):
        # Sync the dataset
client.commit(message="msg", datasets=[Orders, OrdersOptional])
# log some rows to the transaction dataset
df = pd.DataFrame(
[
{
"uid": 1,
"skus": [1, 2],
"prices": [10.1, 20.0],
"timestamp": "2021-01-01T00:00:00",
},
{
"uid": 2,
"skus": [],
"prices": [],
"timestamp": "2021-01-01T00:00:00",
},
]
)
client.log("webhook", "Orders", df)
client.sleep()

        # inspect the OrdersOptional dataset
df = client.inspect("OrdersOptional")
assert df.shape == (2, 6)
assert df["uid"].tolist() == [1, 2]
assert df["uid_float"].tolist() == [1.0, 2.0]
assert df["uid_twice"].tolist() == [2.0, 4.0]
assert df["skus"].tolist() == [[1, 2], []]


class TestBasicAssign(unittest.TestCase):
@pytest.mark.integration
5 changes: 3 additions & 2 deletions fennel/datasets/datasets.py
@@ -2935,15 +2935,16 @@ def visitAssign(self, obj) -> DSSchema:
raise ValueError(
f"invalid assign - {output_schema_name} error in expression for column `{col}`: {str(e)}"
)
if typed_expr.dtype != expr_type:
if not typed_expr.expr.matches_type(
typed_expr.dtype, input_schema.schema()
):
printer = ExprPrinter()
type_errors.append(
f"'{col}' is expected to be of type `{dtype_to_string(typed_expr.dtype)}`, but evaluates to `{dtype_to_string(expr_type)}`. Full expression: `{printer.print(typed_expr.expr.root)}`"
)

if len(type_errors) > 0:
joined_errors = "\n\t".join(type_errors)
print(joined_errors)
raise TypeError(
f"found type errors in assign node of `{self.dsname}.{self.pipeline_name}`:\n\t{joined_errors}"
)
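
For context on the check above: the old comparison `typed_expr.dtype != expr_type` rejected an assign whenever the evaluated type differed from the declared target, even when promotion is safe; the new `matches_type` call accepts such cases. An illustrative, self-contained sketch of promotion-aware matching (a hypothetical helper, not fennel's actual implementation):

    from typing import Optional, Union, get_args, get_origin

    # Hypothetical illustration of the semantics, not fennel's code.
    PROMOTIONS = {int: {float}}  # numeric widening, e.g. int -> float

    def matches_with_promotion(evaluated, declared) -> bool:
        if evaluated == declared:
            return True
        # Optional[T] accepts a plain T
        if get_origin(declared) is Union and type(None) in get_args(declared):
            inner = [a for a in get_args(declared) if a is not type(None)]
            return len(inner) == 1 and matches_with_promotion(
                evaluated, inner[0]
            )
        return declared in PROMOTIONS.get(evaluated, set())

    assert matches_with_promotion(int, int)
    assert matches_with_promotion(int, Optional[int])  # into Optional
    assert matches_with_promotion(int, float)          # int widens to float
    assert not matches_with_promotion(float, int)      # narrowing rejected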
12 changes: 12 additions & 0 deletions fennel/dtypes/dtypes.py
@@ -1,4 +1,5 @@
import dataclasses
from functools import partial
import inspect
import sys
from dataclasses import dataclass
@@ -75,6 +76,16 @@ def get_fennel_struct(annotation) -> Any:
return None


def make_struct_expr(cls, **kwargs):
from fennel.expr.expr import Expr, make_expr, make_struct

fields = {}
for name, value in kwargs.items():
fields[name] = make_expr(value)

return make_struct(fields, cls)


def struct(cls):
for name, member in inspect.getmembers(cls):
if inspect.isfunction(member) and name in cls.__dict__:
@@ -131,6 +142,7 @@ def struct(cls):
setattr(cls, FENNEL_STRUCT_SRC_CODE, "")
setattr(cls, FENNEL_STRUCT_DEPENDENCIES_SRC_CODE, dependency_code)
cls.as_json = as_json
cls.expr = partial(make_struct_expr, cls)
return dataclasses.dataclass(cls)


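The `expr` attribute attached just above is what enables the `Role.expr(...)` and `FullName.expr(...)` struct initializers used in `test_complex_struct.py`. A small sketch of the equivalence, using a hypothetical `Point` struct and assuming fennel at this commit:

    from fennel.dtypes import struct
    from fennel.expr.expr import col, make_struct

    @struct
    class Point:
        x: int
        y: int

    # Point.expr is partial(make_struct_expr, Point): it wraps each keyword
    # argument with make_expr and builds make_struct(fields, Point), so
    # these two expressions are equivalent:
    a = Point.expr(x=col("x"), y=col("y"))
    b = make_struct({"x": col("x"), "y": col("y")}, Point)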
(The remaining 7 changed files are not shown.)
