Add FirstK aggregation (#553)
Add FirstK aggregation in codebase along the lines of LastK aggregation.
satrana42 committed Sep 21, 2024
1 parent 3864bc8 commit c7a227b
Showing 36 changed files with 585 additions and 31 deletions.
1 change: 1 addition & 0 deletions .wordlist.txt
@@ -37,6 +37,7 @@ Eval
eval
endswith
FennelDataAccessRole
FirstK
Flink
Flink's
GCP
3 changes: 2 additions & 1 deletion docs/api.yml
@@ -76,6 +76,7 @@ sidebar:
- "api-reference/aggregations/count"
- "api-reference/aggregations/distinct"
- "api-reference/aggregations/lastk"
- "api-reference/aggregations/firstk"
- "api-reference/aggregations/max"
- "api-reference/aggregations/min"
- "api-reference/aggregations/stddev"
@@ -171,4 +172,4 @@ sidebar:
  - slug: "api-reference/expectations"
    title: "Expectations"
    pages:
      - "api-reference/expectations"
150 changes: 150 additions & 0 deletions docs/examples/api-reference/aggregations/firstk.py
@@ -0,0 +1,150 @@
from datetime import datetime
from typing import List

import pandas as pd
import pytest

from fennel.testing import mock

__owner__ = "satwant@fennel.ai"


@mock
def test_basic(client):
    # docsnip basic
    from fennel.datasets import dataset, field, pipeline, Dataset, FirstK
    from fennel.dtypes import Continuous
    from fennel.lib import inputs
    from fennel.connectors import source, Webhook

    webhook = Webhook(name="webhook")

    @source(webhook.endpoint("Transaction"), disorder="14d", cdc="append")
    @dataset
    class Transaction:
        uid: int
        amount: int
        timestamp: datetime

    @dataset(index=True)
    class Aggregated:
        uid: int = field(key=True)
        amounts: List[int]
        timestamp: datetime

        @pipeline
        @inputs(Transaction)
        def firstk_pipeline(cls, ds: Dataset):
            return ds.groupby("uid").aggregate(
                # docsnip-highlight start
                amounts=FirstK(
                    of="amount",
                    limit=10,
                    dedup=False,
                    window=Continuous("1d"),
                ),
                # docsnip-highlight end
            )

    # /docsnip
    client.commit(message="msg", datasets=[Transaction, Aggregated])
    # log some rows to the transaction dataset
    client.log(
        "webhook",
        "Transaction",
        pd.DataFrame(
            [
                {
                    "uid": 1,
                    "vendor": "A",
                    "amount": 10,
                    "timestamp": "2021-01-01T00:00:00",
                },
                {
                    "uid": 1,
                    "vendor": "B",
                    "amount": 20,
                    "timestamp": "2021-01-02T00:00:00",
                },
                {
                    "uid": 2,
                    "vendor": "A",
                    "amount": 30,
                    "timestamp": "2021-01-03T00:00:00",
                },
                {
                    "uid": 2,
                    "vendor": "B",
                    "amount": 40,
                    "timestamp": "2021-01-04T00:00:00",
                },
                {
                    "uid": 3,
                    "vendor": "A",
                    "amount": 50,
                    "timestamp": "2021-01-05T00:00:00",
                },
                {
                    "uid": 3,
                    "vendor": "B",
                    "amount": 60,
                    "timestamp": "2021-01-06T00:00:00",
                },
            ]
        ),
    )

    # do lookup on the Aggregated dataset
    ts = pd.Series(
        [
            datetime(2021, 1, 6, 0, 0, 0),
            datetime(2021, 1, 6, 0, 0, 0),
            datetime(2021, 1, 6, 0, 0, 0),
        ]
    )
    df, found = Aggregated.lookup(ts, uid=pd.Series([1, 2, 3]))
    assert found.tolist() == [True, True, True]
    assert df["uid"].tolist() == [1, 2, 3]
assert df["amounts"].tolist() == [[], [], [50, 60]]


@mock
def test_invalid_type(client):
    with pytest.raises(Exception):
        # docsnip incorrect_type
        from fennel.datasets import dataset, field, pipeline, Dataset, FirstK
        from fennel.dtypes import Continuous
        from fennel.lib import inputs
        from fennel.connectors import source, Webhook

        webhook = Webhook(name="webhook")

        @source(webhook.endpoint("Transaction"), disorder="14d", cdc="append")
        @dataset
        class Transaction:
            uid: int
            amount: int
            timestamp: datetime

        @dataset
        class Aggregated:
            uid: int = field(key=True)
            # docsnip-highlight next-line
            amounts: int  # should be List[int]
            timestamp: datetime

            @pipeline
            @inputs(Transaction)
            def bad_pipeline(cls, ds: Dataset):
                return ds.groupby("uid").aggregate(
                    # docsnip-highlight start
                    amounts=FirstK(
                        of="amount",
                        limit=10,
                        dedup=False,
                        window=Continuous("1d"),
                    ),
                    # docsnip-highlight end
                )

        # /docsnip
63 changes: 63 additions & 0 deletions docs/pages/api-reference/aggregations/firstk.md
@@ -0,0 +1,63 @@
---
title: FirstK
order: 0
status: published
---
### FirstK
Aggregation that computes a rolling list of the earliest values for each group
within a window.
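
To make the semantics concrete, here is a minimal pure-Python sketch of the list
FirstK maintains per group (illustrative only; the `first_k` helper name is
hypothetical, and the real aggregation works incrementally over a streaming
window rather than on a materialized list):

```python
def first_k(values: list, limit: int, dedup: bool) -> list:
    """Earliest values in arrival order, capped at `limit` entries."""
    result = []
    for v in values:
        if dedup and v in result:
            continue  # with dedup, keep only the first occurrence
        result.append(v)
        if len(result) == limit:
            break
    return result
```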

#### Parameters
<Expandable title="of" type="str">
Name of the field in the input dataset over which the aggregation should be
computed.
</Expandable>

<Expandable title="window" type="Window">
The continuous window within which aggregation needs to be computed. Possible
values are `"forever"` or any [time duration](/api-reference/data-types/duration).
</Expandable>

<Expandable title="into_field" type="str">
The name of the field in the output dataset that should store the result of this
aggregation. This field is expected to be of type `List[T]` where `T` is the type
of the field denoted by `of`.
</Expandable>

<Expandable title="limit" type="int">
Since storing all the values for a group can get costly, FirstK expects a
`limit` to be specified which denotes the maximum size of the list that should
be maintained at any point.
</Expandable>

<Expandable title="dedup" type="bool">
If set to True, only distinct values are stored else values stored in the first
can have duplicates too.
</Expandable>
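
For intuition, a small standalone example of how `dedup` changes the result for
the same arrival order (plain Python, not the Fennel API):

```python
amounts = [10, 10, 20, 10, 30]  # values in arrival order

# dedup=False with limit=3: the first three values, duplicates included
assert amounts[:3] == [10, 10, 20]

# dedup=True with limit=3: the first three distinct values
distinct: list = []
for a in amounts:
    if a not in distinct:
        distinct.append(a)
    if len(distinct) == 3:
        break
assert distinct == [10, 20, 30]
```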

<pre snippet="api-reference/aggregations/firstk#basic" status="success"
message="FirstK in window of 1 day">
</pre>

#### Returns
<Expandable type="List[T]">
Stores the result of the aggregation in the appropriate field of the output
dataset.
</Expandable>


#### Errors
<Expandable title="Incorrect output type">
The column denoted by `into_field` in the output dataset must be of type `List[T]`
where T is the type of the column denoted by `of` in the input dataset. Commit error
is raised if this is not the case.
</Expandable>

:::warning
Storing the full set of values and maintaining order between them can get costly,
so use this aggregation only when needed.
:::

<pre snippet="api-reference/aggregations/firstk#incorrect_type" status="error"
message="amounts should be of type List[int], not int">
</pre>
3 changes: 3 additions & 0 deletions fennel/CHANGELOG.md
@@ -1,5 +1,8 @@
# Changelog

## [1.5.28] - 2024-09-21
- Add FirstK aggregation

## [1.5.27] - 2024-09-20
- Fix schema validation for int64 columns during mock client. Earlier, float64
was considered valid for optional[int64] fields to account for typing limitation
1 change: 1 addition & 0 deletions fennel/__init__.py
@@ -9,6 +9,7 @@
    Distinct,
    Sum,
    LastK,
    FirstK,
    Min,
    Max,
    Average,
38 changes: 37 additions & 1 deletion fennel/client_tests/test_social_network.py
@@ -5,7 +5,7 @@
import pytest

import fennel._vendor.requests as requests
from fennel import LastK
from fennel import LastK, FirstK
from fennel.connectors import source, Webhook
from fennel.datasets import dataset, field, Dataset, pipeline, Count
from fennel.dtypes import regex, oneof, Continuous
@@ -166,6 +166,27 @@ def last_viewed_post(cls, view_data: Dataset):
        )


@meta(owner="ml-eng@myspace.com")
@dataset(index=True)
class FirstViewedPostByAgg:
    user_id: str = field(key=True)
    post_id: List[int]
    time_stamp: datetime

    @pipeline
    @inputs(ViewData)
    def first_viewed_post(cls, view_data: Dataset):
        return view_data.groupby("user_id").aggregate(
            FirstK(
                into_field="post_id",
                of="post_id",
                window=Continuous("forever"),
                limit=1,
                dedup=False,
            )
        )


# --- Featuresets ---


@@ -188,6 +209,9 @@ class UserFeatures:
    last_viewed_post2: List[int] = F(
        LastViewedPostByAgg.post_id, default=[-1]  # type: ignore
    )
    first_viewed_post: List[int] = F(
        FirstViewedPostByAgg.post_id, default=[-1]  # type: ignore
    )

    @extractor(deps=[UserViewsDataset])  # type: ignore
    @inputs(Request.user_id)
@@ -234,6 +258,7 @@ def test_social_network(client):
            UserCategoryDataset,
            LastViewedPost,
            LastViewedPostByAgg,
            FirstViewedPostByAgg,
        ],
        featuresets=[Request, UserFeatures],
    )
@@ -307,6 +332,11 @@ def test_social_network(client):
    assert last_post_viewed == [936609766, 735291550]
    assert last_post_viewed2 == last_post_viewed

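    # FirstViewedPostByAgg uses limit=1, so each post_id value is a
    # single-element list; unwrap it with x[0]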
    first_post_viewed = [
        x[0] for x in feature_df["UserFeatures.first_viewed_post"].to_list()
    ]
    assert first_post_viewed == [508698801, 43094523]

    if client.is_integration_client():
        return
    df = client.get_dataset_df("UserCategoryDataset")
@@ -427,6 +457,7 @@ def test_social_network_with_mock_log(client):
            UserCategoryDataset,
            LastViewedPost,
            LastViewedPostByAgg,
            FirstViewedPostByAgg,
        ],
        featuresets=[Request, UserFeatures],
    )
@@ -494,5 +525,10 @@ def test_social_network_with_mock_log(client):
    assert last_post_viewed == [936609766, 735291550]
    assert last_post_viewed2 == last_post_viewed

    first_post_viewed = [
        x[0] for x in feature_df["UserFeatures.first_viewed_post"].to_list()
    ]
    assert first_post_viewed == [508698801, 43094523]

    df = client.get_dataset_df("UserCategoryDataset")
    assert df.shape == (1998, 4)