Skip to content

Commit

Permalink
expr: Read side expressions (#531)
Browse files Browse the repository at this point in the history
* expr: Read side expressions

* expr: Rename F to col

* feature: Bring back F for feature

* Improve error messages
  • Loading branch information
aditya-nambiar authored Aug 20, 2024
1 parent 6fbad75 commit ab24cd5
Show file tree
Hide file tree
Showing 30 changed files with 691 additions and 167 deletions.
4 changes: 2 additions & 2 deletions docs/examples/api-reference/client/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_incremental(client):
# docsnip incremental
from fennel.datasets import dataset, field
from fennel.connectors import source, Webhook
from fennel.featuresets import featureset, feature, extractor
from fennel.featuresets import featureset, feature as F, extractor
from fennel.lib import inputs, outputs

webhook = Webhook(name="some_webhook")
Expand All @@ -82,7 +82,7 @@ class Transaction:
@featureset
class TransactionFeatures:
txid: int
amount: int = feature(Transaction.amount, default=0)
amount: int = F(Transaction.amount, default=0)
amount_is_high: bool

@extractor(env="bronze")
Expand Down
12 changes: 6 additions & 6 deletions docs/examples/featuresets/overview.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

def test_featureset_overview():
# docsnip featureset
from fennel.featuresets import featureset, extractor
from fennel.featuresets import featureset, extractor, feature as F
from fennel.lib import inputs, outputs

@featureset
Expand Down Expand Up @@ -75,7 +75,7 @@ def fn(cls, ts: pd.Series, durations: pd.Series) -> pd.Series:
@mock
def test_featureset_auto_extractors(client):
from fennel.datasets import dataset, field
from fennel.featuresets import featureset, extractor
from fennel.featuresets import featureset, extractor, feature as F
from fennel.lib import inputs, outputs
from fennel.connectors import source, Webhook

Expand All @@ -94,7 +94,7 @@ class Movie:
class MovieFeatures:
id: int
# docsnip-highlight next-line
duration: int = feature(Movie.duration, default=-1)
duration: int = F(Movie.duration, default=-1)
over_2hrs: bool

@extractor
Expand Down Expand Up @@ -148,7 +148,7 @@ def is_over_2hrs(cls, ts: pd.Series, durations: pd.Series) -> pd.Series:

@mock
def test_featureset_alias(client):
from fennel.featuresets import featureset, feature, extractor
from fennel.featuresets import featureset, feature as F, extractor
from fennel.lib import inputs, outputs
from fennel.connectors import source, Webhook

Expand All @@ -168,8 +168,8 @@ class Request:

@featureset
class MovieFeatures:
id: int = feature(Request.movie_id) # docsnip-highlight
duration: int = feature(Movie.duration, default=-1)
id: int = F(Request.movie_id) # docsnip-highlight
duration: int = F(Movie.duration, default=-1)

# /docsnip

Expand Down
2 changes: 1 addition & 1 deletion docs/examples/testing-and-ci-cd/ci_cd/featuresets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


# docsnip gh_action_featureset
from fennel import featureset, feature as F
from fennel.featuresets import featureset, feature as F


@featureset
Expand Down
8 changes: 4 additions & 4 deletions docs/pages/data-quality/metaflags.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ class User:
@meta(owner='feed-team@xyz.ai')
@featureset
class UserFeatures:
uid: int = feature()
zip: str = feature().meta(tags=['PII'])
bmi: float = feature().meta(owner='alan@xyz.ai')
bmr: float = feature().meta(deprecated=True)
uid: int = F()
zip: str = F().meta(tags=['PII'])
bmi: float = F().meta(owner='alan@xyz.ai')
bmr: float = F().meta(deprecated=True)
..

@meta(description='based on algorithm specified here: bit.ly/xyy123')
Expand Down
20 changes: 10 additions & 10 deletions docs/pages/orphan/composite-features.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ In many cases, it's important to write features about a composite of two or more
```python
@featureset
class User:
id: int = feature()
name: str = feature()
id: int = F()
name: str = F()
..

@featureset
class Post:
id: int = feature()
creator_uid: int = feature()
id: int = F()
creator_uid: int = F()
...

@extractor
Expand All @@ -29,9 +29,9 @@ class Post:
@featureset
class UserCreator:
# describes features for (uid, creator_uid) pairs
viewer: int = feature()
creator: int = feature()
affinity: float = feature()
viewer: int = F()
creator: int = F()
affinity: float = F()
...

@extractor
Expand All @@ -43,9 +43,9 @@ class UserCreator:
@featureset
class UserPost:
# describes features for (uid, pid) pairs
uid: int = feature()
pid: int = feature()
viewer_author_affinity = feature()
uid: int = F()
pid: int = F()
viewer_author_affinity = F()
...

@extractor
Expand Down
12 changes: 6 additions & 6 deletions docs/pages/orphan/request-based-features.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@ are very naturally modeled in Fennel. Let's look at one good way of doing this:
```python
@featureset
class SearchRequest:
time: datetime = feature()
ip: str = feature()
device_type: str = feature()
query: str = feature()
time: datetime = F()
ip: str = F()
device_type: str = F()
query: str = F()

@featureset
class UserFeatures:
uid: int = feature()
uid: int = F()
...
ctr_by_device_type: float = feature()
ctr_by_device_type: float = F()
..

@extractor
Expand Down
3 changes: 3 additions & 0 deletions fennel/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## [1.5.6] - 2024-08-20
- Add support for expression based extractors

## [1.5.5] - 2024-08-20
- Enable discrete aggregation with lookback

Expand Down
2 changes: 1 addition & 1 deletion fennel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@
AggregateType,
Stddev,
)
from fennel.featuresets import featureset, feature, extractor
from fennel.featuresets import featureset, feature as F, extractor
from fennel.lib import meta
3 changes: 2 additions & 1 deletion fennel/client_tests/test_complex_autogen_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import pytest
from dateutil.relativedelta import relativedelta # type: ignore

from fennel import meta, Count, featureset, feature as F, extractor
from fennel import meta, Count, featureset, extractor
from fennel.featuresets import feature as F
from fennel.connectors import Webhook
from fennel.connectors import source
from fennel.datasets import dataset, field, pipeline, Dataset
Expand Down
18 changes: 10 additions & 8 deletions fennel/client_tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import fennel._vendor.requests as requests
from fennel.connectors import source, Webhook, ref
from fennel.expr import F
from fennel.expr import col
from fennel.datasets import (
dataset,
field,
Expand Down Expand Up @@ -935,9 +935,11 @@ class MovieRatingAssignExpr:
@inputs(MovieRating)
def pipeline_assign(cls, m: Dataset):
rating_transformed = m.assign(
rating_sq=(F("rating") * F("rating")).astype(float),
rating_cube=(F("rating") * F("rating") * F("rating")).astype(float),
rating_into_5=(F("rating") * 5).astype(float),
rating_sq=(col("rating") * col("rating")).astype(float),
rating_cube=(col("rating") * col("rating") * col("rating")).astype(
float
),
rating_into_5=(col("rating") * 5).astype(float),
)
return rating_transformed.drop("num_ratings", "sum_ratings", "rating")

Expand Down Expand Up @@ -1873,11 +1875,11 @@ class PositiveRatingActivityExpr:
@pipeline
@inputs(RatingActivity)
def filter_positive_ratings(cls, rating: Dataset):
filtered_ds = rating.filter(F("rating") >= 3.5)
filtered_ds = rating.filter(col("rating") >= 3.5)
filter2 = filtered_ds.filter(
(F("movie") == "Jumanji")
| (F("movie") == "Titanic")
| (F("movie") == "RaOne")
(col("movie") == "Jumanji")
| (col("movie") == "Titanic")
| (col("movie") == "RaOne")
)
return filter2.groupby("movie").aggregate(
Count(window=Continuous("forever"), into_field=str(cls.cnt_rating)),
Expand Down
104 changes: 102 additions & 2 deletions fennel/client_tests/test_featureset.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
expectations,
expect_column_values_to_be_between,
)
from fennel.expr import col
from fennel.testing import mock, log

################################################################################
Expand Down Expand Up @@ -94,6 +95,8 @@ class UserInfoMultipleExtractor:
is_name_common: bool
age_reciprocal: float
age_doubled: int
age_reciprocal_expr: float = F(1 / (col("age") / (3600.0 * 24)) + 0.01)
age_double_expr: int = F(col("age") * 2)

@extractor(deps=[UserInfoDataset]) # type: ignore
@inputs("userid")
Expand Down Expand Up @@ -154,7 +157,6 @@ def get_common_names(cls):
class TestSimpleExtractor(unittest.TestCase):
@pytest.mark.integration
def test_get_age_and_name_features(self):
print("Running test_get_age_and_name_features")
age = pd.Series([32, 24])
name = pd.Series(["John", "Rahul"])
assert UserInfoMultipleExtractor.all() == [
Expand All @@ -167,6 +169,8 @@ def test_get_age_and_name_features(self):
"UserInfoMultipleExtractor.is_name_common",
"UserInfoMultipleExtractor.age_reciprocal",
"UserInfoMultipleExtractor.age_doubled",
"UserInfoMultipleExtractor.age_reciprocal_expr",
"UserInfoMultipleExtractor.age_double_expr",
]
ts = pd.Series([datetime(2020, 1, 1), datetime(2020, 1, 1)])
df = UserInfoMultipleExtractor.get_age_and_name_features(
Expand Down Expand Up @@ -235,6 +239,90 @@ def test_simple_extractor(self, client):
res["UserInfoMultipleExtractor.age_doubled"].tolist(), [64, 48]
)

@pytest.mark.integration
@mock
def test_e2e_query(self, client):
client.commit(
message="some commit msg",
datasets=[UserInfoDataset],
featuresets=[UserInfoMultipleExtractor],
)
now = datetime.now(timezone.utc)
data = [
[18232, "John", 32, "USA", now],
[18234, "Monica", 24, "Chile", now],
]
columns = ["user_id", "name", "age", "country", "timestamp"]
input_df = pd.DataFrame(data, columns=columns)
response = client.log("fennel_webhook", "UserInfoDataset", input_df)
assert response.status_code == requests.codes.OK, response.json()
client.sleep()

feature_df = client.query(
outputs=[UserInfoMultipleExtractor],
inputs=[UserInfoMultipleExtractor.userid],
input_dataframe=pd.DataFrame(
{"UserInfoMultipleExtractor.userid": [18232, 18234]}
),
)
assert feature_df.shape == (2, 11)
assert feature_df.columns.tolist() == [
"UserInfoMultipleExtractor.userid",
"UserInfoMultipleExtractor.name",
"UserInfoMultipleExtractor.country_geoid",
"UserInfoMultipleExtractor.age",
"UserInfoMultipleExtractor.age_squared",
"UserInfoMultipleExtractor.age_cubed",
"UserInfoMultipleExtractor.is_name_common",
"UserInfoMultipleExtractor.age_reciprocal",
"UserInfoMultipleExtractor.age_doubled",
"UserInfoMultipleExtractor.age_reciprocal_expr",
"UserInfoMultipleExtractor.age_double_expr",
]
assert feature_df["UserInfoMultipleExtractor.userid"].tolist() == [
18232,
18234,
]

assert feature_df["UserInfoMultipleExtractor.age"].tolist() == [32, 24]
assert feature_df["UserInfoMultipleExtractor.age_squared"].tolist() == [
1024,
576,
]
assert feature_df["UserInfoMultipleExtractor.age_cubed"].tolist() == [
32768,
13824,
]
assert feature_df[
"UserInfoMultipleExtractor.is_name_common"
].tolist() == [
True,
False,
]
expected_age_reciprocal = [2700.01, 3600.01]
assert (
feature_df["UserInfoMultipleExtractor.age_reciprocal"].tolist()
== expected_age_reciprocal
)
assert feature_df["UserInfoMultipleExtractor.age_doubled"].tolist() == [
64,
48,
]
assert feature_df[
"UserInfoMultipleExtractor.country_geoid"
].tolist() == [5, 3]
assert feature_df["UserInfoMultipleExtractor.name"].tolist() == [
"John",
"Monica",
]
assert (
feature_df["UserInfoMultipleExtractor.age_reciprocal_expr"].tolist()
== expected_age_reciprocal
)
assert feature_df[
"UserInfoMultipleExtractor.age_double_expr"
].tolist() == [64, 48]


@struct
class Velocity:
Expand Down Expand Up @@ -420,11 +508,23 @@ def test_dag_resolution2(self, client):
{"UserInfoMultipleExtractor.userid": [18232, 18234]}
),
)
self.assertEqual(feature_df.shape, (2, 9))
self.assertEqual(feature_df.shape, (2, 11))
self.assertEqual(
list(feature_df["UserInfoMultipleExtractor.age_reciprocal"]),
[2700.01, 3600.01],
)
self.assertEqual(
list(feature_df["UserInfoMultipleExtractor.age_doubled"]),
[64, 48],
)
self.assertEqual(
list(feature_df["UserInfoMultipleExtractor.age_reciprocal_expr"]),
[2700.01, 3600.01],
)
self.assertEqual(
list(feature_df["UserInfoMultipleExtractor.age_double_expr"]),
[64, 48],
)


@featureset
Expand Down
Loading

0 comments on commit ab24cd5

Please sign in to comment.