diff --git a/.github/workflows/packagr_publish.yml b/.github/workflows/packagr_publish.yml
index e3ac53a74..c200e3a9b 100644
--- a/.github/workflows/packagr_publish.yml
+++ b/.github/workflows/packagr_publish.yml
@@ -1,86 +1,87 @@
-name: Packagr Publish
+name: Build, Test, and Publish
 
-# Controls when the workflow will run
 on:
-  # Triggers the workflow on push but only for the main branch
   push:
     branches: [ main ]
-
-  # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
-  # This workflow contains a single job called "build"
-  build:
-    # The type of runner that the job will run on
+  build-and-publish:
     runs-on: ubuntu-latest
+    name: Build, Test, and Publish Package
 
-    # Steps represent a sequence of tasks that will be executed as part of the job
-    name: Run tests and publish latest version if succcessful
     steps:
-      #----------------------------------------------
-      #  check-out repo and set-up python
-      #----------------------------------------------
-      - name: Check out repository
+      # Checkout the Python client repository
+      - name: Check out client repository (Python)
         uses: actions/checkout@v2
+        with:
+          path: 'client'
 
-      - name: Set up python
-        id: setup-python
+      # Checkout the Rust server repository
+      - name: Check out server repository (Rust)
+        uses: actions/checkout@v2
+        with:
+          repository: 'fennel-ai/server'
+          token: ${{ secrets.SERVER_CHECKOUT }}
+          path: 'server'
+          ref: 'main'
+
+      # Set up Python environment
+      - name: Set up Python
         uses: actions/setup-python@v2
         with:
           python-version: 3.12
 
-      #----------------------------------------------
-      #  -----  install & configure poetry  -----
-      #----------------------------------------------
+      # Install Poetry
       - name: Install Poetry
        uses: snok/install-poetry@v1
         with:
           virtualenvs-create: true
           virtualenvs-in-project: true
 
-      #----------------------------------------------
-      #       load cached venv if cache exists
-      #----------------------------------------------
+      # Load cached Python virtual environment if it exists
      - name: Load cached venv
         id: cached-poetry-dependencies
         uses: actions/cache@v2
         with:
           path: .venv
           key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
-
-      #----------------------------------------------
-      # install dependencies if cache does not exist
-      #----------------------------------------------
+
+      # Install Python dependencies if the cache does not exist
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
         run: poetry install --no-interaction --no-root
 
-      #----------------------------------------------
-      # install your root project, if required
-      #----------------------------------------------
-      - name: Install library
-        run: poetry install --no-interaction
+      # Set up Rust toolchain
+      - name: Set up Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          default: true
+
+      # Install protobuf compiler (protoc)
+      - name: Install protobuf compiler (protoc)
+        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler
+
+      # Install Maturin to build the package
+      - name: Install Maturin
+        run: python -m pip install maturin
+
+      # Build the package using Maturin
+      - name: Build the package using Maturin
+        run: maturin build --release --manifest-path ./server/fennel_data_lib/Cargo.toml --out dist --no-sdist
 
-      # Run test suite
+      # Install the built package
+      - name: Install built package
+        run: pip install dist/*.whl
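
A minimal smoke test for the wheel installed in the step above, assuming it was built into dist/ and installed into the active environment; `eval` and `type_of` are the two entry points that fennel/expr/expr.py imports from this module later in this diff:

import fennel_data_lib

# Hedged sketch: fail fast if the maturin-built extension is missing the
# functions the Python client expects to import from it.
assert hasattr(fennel_data_lib, "eval")
assert hasattr(fennel_data_lib, "type_of")
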
+
+      # Run Python tests
       - name: Run tests
         run: |
           source .venv/bin/activate
-          poetry run pytest . --run_slow
-          poetry run python -m unittest discover fennel
-
+          poetry run pytest fennel --run_slow
 
-      #----------------------------------------------
-      # publish to pypi - this is run only if the previous steps are successful
-      #----------------------------------------------
-      - name: Publish to Pypi
+      # Publish the package to PyPI if all tests pass
+      - name: Publish to PyPI
         run: |
-          rm -rf dist/*
-          pip install setuptools wheel twine
-          poetry build
           twine upload --repository pypi dist/* --verbose -u __token__ -p ${{ secrets.PYPI_PACKAGR_ACCESS_TOKEN }}
-
-
-
-
diff --git a/.github/workflows/test_linux.yml b/.github/workflows/test_linux.yml
index 1e922c2fe..eb74b9414 100644
--- a/.github/workflows/test_linux.yml
+++ b/.github/workflows/test_linux.yml
@@ -23,9 +23,15 @@ jobs:
     # The type of runner that the job will run on
     runs-on: ubuntu-latest
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
     name: Unit tests with py-version ${{ matrix.python-version }}
 
     steps:
+      - name: Test GITHUB_TOKEN permissions
+        run: |
+          curl -H "Authorization: token ${{ secrets.SERVER_CHECKOUT }}" https://api.github.com/repos/fennel-ai/server/contents/
+
       #----------------------------------------------
       #  check-out repo and set-up python
       #----------------------------------------------
@@ -36,6 +42,27 @@ jobs:
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}
+
+      #----------------------------------------------
+      #  check-out server repo
+      #----------------------------------------------
+      - name: Check out server repository (Rust)
+        uses: actions/checkout@v2
+        with:
+          repository: 'fennel-ai/server'
+          token: ${{ secrets.SERVER_CHECKOUT }}
+          path: 'server'
+          ref: 'main'
+
+      - name: Set up Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          default: true
+
+      - name: Install protobuf compiler (protoc)
+        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler
+
       #----------------------------------------------
       #  -----  install & configure poetry  -----
       #----------------------------------------------
@@ -59,6 +86,17 @@ jobs:
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
         run: poetry install --no-interaction --no-root
+
+      #----------------------------------------------
+      # install maturin
+      #----------------------------------------------
+      - name: Install maturin
+        run: python -m pip install maturin
+
+      - name: Build and link the package using Maturin Develop
+        run: maturin develop --release --manifest-path ./server/fennel_data_lib/Cargo.toml
+
+
       #----------------------------------------------
       # install your root project, if required
       #----------------------------------------------
@@ -69,5 +107,6 @@ jobs:
       - name: Run tests
         run: |
           source .venv/bin/activate
-          poetry run pytest . --run_slow
+          poetry run pytest fennel --run_slow
+          poetry run pytest docs
           poetry run python -m unittest discover fennel
diff --git a/fennel/CHANGELOG.md b/fennel/CHANGELOG.md
index 35b38dee4..44628ab92 100644
--- a/fennel/CHANGELOG.md
+++ b/fennel/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## [1.5.0] - 2024-08-04
+- Rust based expressions
+
 ## [1.4.6] - 2024-07-30
 - Add support for indirections in preproc ref type for Protobuf format
 
diff --git a/fennel/client/client.py b/fennel/client/client.py
index c3b932c7a..4eef74ccd 100644
--- a/fennel/client/client.py
+++ b/fennel/client/client.py
@@ -34,9 +34,9 @@
 # Connection timeout i.e. the time spent by the client to establish a
 # connection to the remote machine. NOTE: This should be slightly larger than
 # a multiple of 3 (this is the default TCP retransmission window)
-_DEFAULT_CONNECT_TIMEOUT = 10
+_DEFAULT_CONNECT_TIMEOUT = 300
 # Default request timeout(s).
-_DEFAULT_TIMEOUT = 180
+_DEFAULT_TIMEOUT = 300
 # Name of the default branch
 _MAIN_BRANCH = "main"
 _BRANCH_HEADER_NAME = "X-FENNEL-BRANCH"
diff --git a/fennel/client_tests/test_complex_struct.py b/fennel/client_tests/test_complex_struct.py
index 17df3a59c..285007654 100644
--- a/fennel/client_tests/test_complex_struct.py
+++ b/fennel/client_tests/test_complex_struct.py
@@ -64,9 +64,9 @@ class MovieInfo:
     def movie_info(cls, movie: Dataset):
         return (
             movie.assign(
-                name="role",
-                dtype=Role,
-                func=lambda x: x[["role_id", "name", "cost"]].apply(
+                "role",
+                Role,
+                lambda x: x[["role_id", "name", "cost"]].apply(
                     lambda z: Role(
                         **{"role_id": z[0], "name": z[1], "cost": z[2]}
                     ),
diff --git a/fennel/client_tests/test_dataset.py b/fennel/client_tests/test_dataset.py
index 9d43ecabe..ac40068b9 100644
--- a/fennel/client_tests/test_dataset.py
+++ b/fennel/client_tests/test_dataset.py
@@ -12,6 +12,7 @@
 import fennel._vendor.requests as requests
 from fennel.connectors import source, Webhook, ref
+from fennel.expr import F
 from fennel.datasets import (
     dataset,
     field,
@@ -892,11 +893,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame:
             },
         )
 
-        return x.assign(
-            name="rating_orig",
-            dtype=float,
-            func=lambda df: df["rating_sq"] ** 0.5,
-        )
+        return x.assign("rating_orig", float, lambda df: df["rating_sq"] ** 0.5)
 
 
 @meta(owner="test@test.com")
@@ -923,6 +920,28 @@ def pipeline_assign(cls, m: Dataset):
         return rating_into_5.drop("num_ratings", "sum_ratings", "rating")
 
 
+@meta(owner="test@test.com")
+@dataset(index=True)
+class MovieRatingAssignExpr:
+    movie: oneof(str, ["Jumanji", "Titanic", "RaOne", "ABC"]) = field(  # type: ignore
+        key=True
+    )
+    rating_sq: float
+    rating_cube: float
+    rating_into_5: float
+    t: datetime
+
+    @pipeline
+    @inputs(MovieRating)
+    def pipeline_assign(cls, m: Dataset):
+        rating_transformed = m.assign(
+            rating_sq=(F("rating") * F("rating")).astype(float),
+            rating_cube=(F("rating") * F("rating") * F("rating")).astype(float),
+            rating_into_5=(F("rating") * 5).astype(float),
+        )
+        return rating_transformed.drop("num_ratings", "sum_ratings", "rating")
+
+
 class TestBasicTransform(unittest.TestCase):
     @pytest.mark.integration
     @mock
@@ -1050,53 +1069,55 @@ class TestBasicAssign(unittest.TestCase):
     @pytest.mark.integration
     @mock
     def test_basic_assign(self, client):
-        # # Sync the dataset
-        client.commit(
-            message="msg",
-            datasets=[MovieRating, MovieRatingAssign, RatingActivity],
-        )
-        now = datetime.now(timezone.utc)
-        two_hours_ago = now - timedelta(hours=2)
-        data = [
-            ["Jumanji", 4, 343, 789, two_hours_ago],
-            ["Titanic", 5, 729, 1232, now],
-        ]
-        columns = ["movie", "rating", "num_ratings", "sum_ratings", "t"]
-        df = pd.DataFrame(data, columns=columns)
-        response = client.log("fennel_webhook", "MovieRating", df)
-        assert response.status_code == requests.codes.OK, response.json()
-        client.sleep()
-        # Do some lookups to verify pipeline_transform is working as expected
-        an_hour_ago = now - timedelta(hours=1)
-        ts = pd.Series([an_hour_ago, an_hour_ago])
-        keys = pd.DataFrame({"movie": ["Jumanji", "Titanic"]})
-        df, found = client.lookup(
-            "MovieRatingAssign",
-            timestamps=ts,
-            keys=keys,
-        )
+        test_cases = [MovieRatingAssignExpr, MovieRatingAssign]
+        for case in test_cases:
+            client.init_branch(case.__name__)
+            client.commit(
+                message="msg",
+                datasets=[MovieRating, case, RatingActivity],
+            )
+            now = datetime.now(timezone.utc)
+            two_hours_ago = now - timedelta(hours=2)
+            data = [
+                ["Jumanji", 4, 343, 789, two_hours_ago],
+                ["Titanic", 5, 729, 1232, now],
+            ]
+            columns = ["movie", "rating", "num_ratings", "sum_ratings", "t"]
+            df = pd.DataFrame(data, columns=columns)
+            response = client.log("fennel_webhook", "MovieRating", df)
+            assert response.status_code == requests.codes.OK, response.json()
+            client.sleep()
+            # Do some lookups to verify pipeline_transform is working as expected
+            an_hour_ago = now - timedelta(hours=1)
+            ts = pd.Series([an_hour_ago, an_hour_ago])
+            keys = pd.DataFrame({"movie": ["Jumanji", "Titanic"]})
+            df, found = client.lookup(
+                case.__name__,
+                timestamps=ts,
+                keys=keys,
+            )
 
-        assert found.tolist() == [True, False]
-        assert df.shape == (2, 5)
-        assert df["movie"].tolist() == ["Jumanji", "Titanic"]
-        assert df["rating_sq"].tolist()[0] == 16
-        assert pd.isna(df["rating_sq"].tolist()[1])
-        assert df["rating_cube"].tolist()[0] == 64
-        assert pd.isna(df["rating_cube"].tolist()[1])
-        assert df["rating_into_5"].tolist()[0] == 20
-        assert pd.isna(df["rating_into_5"].tolist()[1])
+            assert found.tolist() == [True, False]
+            assert df.shape == (2, 5)
+            assert df["movie"].tolist() == ["Jumanji", "Titanic"]
+            assert df["rating_sq"].tolist()[0] == 16
+            assert pd.isna(df["rating_sq"].tolist()[1])
+            assert df["rating_cube"].tolist()[0] == 64
+            assert pd.isna(df["rating_cube"].tolist()[1])
+            assert df["rating_into_5"].tolist()[0] == 20
+            assert pd.isna(df["rating_into_5"].tolist()[1])
 
-        client.sleep()
-        df, _ = client.lookup(
-            "MovieRatingAssign",
-            keys=keys,
-        )
+            client.sleep()
+            df, _ = client.lookup(
+                case.__name__,
+                keys=keys,
+            )
 
-        assert df.shape == (2, 5)
-        assert df["movie"].tolist() == ["Jumanji", "Titanic"]
-        assert df["rating_sq"].tolist() == [16, 25]
-        assert df["rating_cube"].tolist() == [64, 125]
-        assert df["rating_into_5"].tolist() == [20, 25]
+            assert df.shape == (2, 5)
+            assert df["movie"].tolist() == ["Jumanji", "Titanic"]
+            assert df["rating_sq"].tolist() == [16, 25]
+            assert df["rating_cube"].tolist() == [64, 125]
+            assert df["rating_into_5"].tolist() == [20, 25]
 
 
 @meta(owner="test@test.com")
@@ -1840,67 +1861,94 @@ def filter_positive_ratings(cls, rating: Dataset):
 # fmt: on
 
 
+@meta(owner="test@test.com")
+@dataset(index=True)
+class PositiveRatingActivityExpr:
+    cnt_rating: int
+    movie: oneof(str, ["Jumanji", "Titanic", "RaOne", "ABC"]) = field(  # type: ignore
+        key=True
+    )
+    t: datetime
+
+    @pipeline
+    @inputs(RatingActivity)
+    def filter_positive_ratings(cls, rating: Dataset):
+        filtered_ds = rating.filter(F("rating") >= 3.5)
+        filter2 = filtered_ds.filter(
+            (F("movie") == "Jumanji")
+            | (F("movie") == "Titanic")
+            | (F("movie") == "RaOne")
+        )
+        return filter2.groupby("movie").aggregate(
+            Count(window=Continuous("forever"), into_field=str(cls.cnt_rating)),
+        )
+
+
 class TestBasicFilter(unittest.TestCase):
     @pytest.mark.integration
     @mock
     def test_basic_filter(self, client):
-        # # Sync the dataset
-        client.commit(
-            message="msg",
-            datasets=[PositiveRatingActivity, RatingActivity],
-        )
-        now = datetime.now(timezone.utc)
-        one_hour_ago = now - timedelta(hours=1)
-        two_hours_ago = now - timedelta(hours=2)
-        three_hours_ago = now - timedelta(hours=3)
-        four_hours_ago = now - timedelta(hours=4)
-        five_hours_ago = now - timedelta(hours=5)
-        minute_ago = now - timedelta(minutes=1)
-        data = [
-            [18231, 4.5, "Jumanji", five_hours_ago],
-            [18231, 3, "Jumanji", four_hours_ago],
-            [18231, 3.5, "Jumanji", three_hours_ago],
-            [18231, 4, "Titanic", three_hours_ago],
-            [18231, 3, "Titanic", two_hours_ago],
-            [18231, 5, "Titanic", one_hour_ago],
-            [18231, 4, "Titanic", minute_ago],
-            [18231, 2, "RaOne", one_hour_ago],
-            [18231, 3, "RaOne", minute_ago],
-            [18231, 1, "RaOne", two_hours_ago],
-        ]
-        columns = ["userid", "rating", "movie", "t"]
-        df = pd.DataFrame(data, columns=columns)
-        response = client.log("fennel_webhook", "RatingActivity", df)
-        assert response.status_code == requests.codes.OK, response.json()
-
-        client.sleep()
+        test_cases = [PositiveRatingActivityExpr]
+        for case in test_cases:
+            client.init_branch(case.__name__)
+            # Sync the dataset
+            client.commit(
+                message="msg",
+                datasets=[case, RatingActivity],
+            )
+            now = datetime.now(timezone.utc)
+            one_hour_ago = now - timedelta(hours=1)
+            two_hours_ago = now - timedelta(hours=2)
+            three_hours_ago = now - timedelta(hours=3)
+            four_hours_ago = now - timedelta(hours=4)
+            five_hours_ago = now - timedelta(hours=5)
+            minute_ago = now - timedelta(minutes=1)
+            data = [
+                [18231, 4.5, "Jumanji", five_hours_ago],
+                [18231, 3, "Jumanji", four_hours_ago],
+                [18231, 3.5, "Jumanji", three_hours_ago],
+                [18231, 4, "Titanic", three_hours_ago],
+                [18231, 3, "Titanic", two_hours_ago],
+                [18231, 5, "Titanic", one_hour_ago],
+                [18231, 4, "Titanic", minute_ago],
+                [18231, 2, "RaOne", one_hour_ago],
+                [18231, 3, "RaOne", minute_ago],
+                [18231, 1, "RaOne", two_hours_ago],
+            ]
+            columns = ["userid", "rating", "movie", "t"]
+            df = pd.DataFrame(data, columns=columns)
+            response = client.log("fennel_webhook", "RatingActivity", df)
+            assert response.status_code == requests.codes.OK, response.json()
 
-        # Do some lookups to verify pipeline_aggregate is working as expected
-        df, _ = client.lookup(
-            "PositiveRatingActivity",
-            keys=pd.DataFrame({"movie": ["Jumanji", "Titanic", "RaOne"]}),
-        )
-        assert df.shape == (3, 3)
-        assert df["movie"].tolist() == ["Jumanji", "Titanic", "RaOne"]
-        if client.is_integration_client():
-            # backend returns default values for aggregate dataset
-            assert df["cnt_rating"].tolist() == [2, 3, 0]
-        else:
-            assert df["cnt_rating"].tolist() == [2, 3, pd.NA]
+            client.sleep()
 
-        ts = pd.Series([two_hours_ago, two_hours_ago, two_hours_ago])
-        df, _ = client.lookup(
-            "PositiveRatingActivity",
-            keys=pd.DataFrame({"movie": ["Jumanji", "Titanic", "RaOne"]}),
-            timestamps=ts,
-        )
-        assert df.shape == (3, 3)
-        assert df["movie"].tolist() == ["Jumanji", "Titanic", "RaOne"]
-        if client.is_integration_client():
-            # backend returns default values for aggregate dataset
-            assert df["cnt_rating"].tolist() == [2, 1, 0]
-        else:
-            assert df["cnt_rating"].tolist() == [2, 1, pd.NA]
+            # Do some lookups to verify pipeline_aggregate is working as expected
+            df, _ = client.lookup(
+                case.__name__,
+                keys=pd.DataFrame({"movie": ["Jumanji", "Titanic", "RaOne"]}),
+            )
+            assert df.shape == (3, 3)
+            assert df["movie"].tolist() == ["Jumanji", "Titanic", "RaOne"]
+            if client.is_integration_client():
+                # backend returns default values for aggregate dataset
+                assert df["cnt_rating"].tolist() == [2, 3, 0]
+            else:
+                assert df["cnt_rating"].tolist() == [2, 3, pd.NA]
+
+            ts = pd.Series([two_hours_ago, two_hours_ago, two_hours_ago])
+            df, _ = client.lookup(
+                case.__name__,
+                keys=pd.DataFrame({"movie": ["Jumanji", "Titanic", "RaOne"]}),
+                timestamps=ts,
+            )
+            assert df.shape == (3, 3)
+            assert df["movie"].tolist() == ["Jumanji", "Titanic", "RaOne"]
+            if client.is_integration_client():
+                # backend returns default values for aggregate dataset
+                assert df["cnt_rating"].tolist() == [2, 1, 0]
+            else:
+                assert df["cnt_rating"].tolist() == [2, 1, pd.NA]
 
 
 @meta(owner="test@test.com")
diff --git a/fennel/datasets/datasets.py b/fennel/datasets/datasets.py
index 435a60feb..cf839da90 100644
--- a/fennel/datasets/datasets.py
+++ b/fennel/datasets/datasets.py
@@ -2,6 +2,7 @@
 
 import copy
 import datetime
+from enum import Enum
 import functools
 import inspect
 import sys
@@ -26,6 +27,8 @@
 import pandas as pd
 from typing_extensions import Literal
 
+from fennel.expr.expr import TypedExpr
+from fennel.expr.visitor import ExprPrinter
 import fennel.gen.index_pb2 as index_proto
 from fennel._vendor.pydantic import BaseModel  # type: ignore
 from fennel.datasets.aggregate import (
@@ -42,6 +45,7 @@
     ExpDecaySum,
 )
 from fennel.dtypes.dtypes import (
+    FENNEL_STRUCT,
     get_fennel_struct,
     Window,
     Decimal,
@@ -50,6 +54,7 @@
     Session,
     Tumbling,
 )
+from fennel.expr import Expr
 from fennel.gen import schema_pb2 as schema_proto
 from fennel.internal_lib.duration import (
     Duration,
@@ -241,11 +246,49 @@ def transform(self, func: Callable, schema: Dict = {}) -> _Node:
             return Transform(self, func, None)
         return Transform(self, func, copy.deepcopy(schema))
 
-    def filter(self, func: Callable) -> _Node:
+    def filter(self, func: Callable | Expr) -> _Node:
         return Filter(self, func)
 
-    def assign(self, name: str, dtype: Type, func: Callable) -> _Node:
-        return Assign(self, name, dtype, func)
+    def assign(self, *args, **kwargs) -> _Node:
+        """
+        Assign is an overloaded operator that can be used in several ways:
+        1. Assigning a new column with a lambda function:
+        >>> ds.assign("new_column", int, lambda x: x.old_column + 1)
+        2. Assigning one or more columns:
+        >>> ds.assign(
+        ...     new_column1=(F("old_column1") + 1).astype(int),
+        ...     new_column2=(F("old_column2") + 2).astype(int),
+        ... )
+
+        """
+        # Check if the user is using the first syntax
+        # Skip args[1] check because type check can be flaky
+        # and args[0] and args[2] are always strings and callable
+        if (
+            len(args) == 3
+            and isinstance(args[0], str)
+            and isinstance(args[2], Callable)  # type: ignore
+        ):
+            return Assign(self, args[0], args[1], args[2])
+
+        # If there are 3 kwargs for name, dtype, func throw an error
+        if (
+            len(kwargs) == 3
+            and "name" in kwargs
+            and "dtype" in kwargs
+            and "func" in kwargs
+        ):
+            raise TypeError(
+                "assign operator can either take 3 args for name, dtype, func or kwargs for expressions, not both"
+            )
+
+        # Check if the user is using the second syntax
+        if len(args) == 0 and len(kwargs) > 0:
+            return Assign.from_expressions(self, **kwargs)
+
+        raise Exception(
+            "Invalid arguments to assign, please see the documentation for more information."
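+        )

A quick usage sketch of the two `assign` call styles accepted above, assuming a dataset `ds` with an integer column `old_column`; both forms mirror the docstring and the tests in this diff:

# Positional style: column name, dtype, and a row-level lambda (Python UDF path).
ds.assign("new_column", int, lambda x: x.old_column + 1)

# Keyword style: one typed expression per new column (Rust expression path).
from fennel.expr import F
ds.assign(new_column=(F("old_column") + 1).astype(int))
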
 
     def groupby(
         self, *args, window: Optional[Union[Hopping, Session, Tumbling]] = None
@@ -410,9 +453,19 @@ def dsschema(self):
         )
 
 
+class UDFType(str, Enum):
+    expr = "expr"
+    python = "python"
+
+
 class Assign(_Node):
     def __init__(
-        self, node: _Node, column: str, output_type: Type, func: Callable
+        self,
+        node: _Node,
+        column: Optional[str],
+        output_type: Optional[Type],
+        func: Optional[Callable],
+        **kwargs,
     ):
         super().__init__()
         self.node = node
@@ -420,6 +473,43 @@ def __init__(
         self.func = func
         self.column = column
         self.output_type = output_type
+        self.output_expressions = {}
+        self.assign_type = UDFType.python
+        # Raise if neither calling convention was used (no positional args and
+        # no kwargs), or if both were mixed together
+        if (
+            node is None
+            and column is None
+            and output_type is None
+            and len(kwargs) == 0
+        ) or (
+            node is not None
+            and column is not None
+            and output_type is not None
+            and len(kwargs) > 0
+        ):
+            raise ValueError(
+                "Assign expects either to use the arguments `node`, `column` and `output_type` or use the keyword arguments for expressions"
+            )
+        # Map of column names to expressions
+        if len(kwargs) > 0:
+            for k, v in kwargs.items():
+                self.output_expressions[k] = v
+            self.assign_type = UDFType.expr
+
+    @classmethod
+    def from_expressions(cls, self, **kwargs):
+        for k, v in kwargs.items():
+            if isinstance(v, Expr):
+                raise TypeError(
+                    f"type not specified for column {k} in assign operator, please use .astype(...) to specify the type"
+                )
+
+            if not isinstance(v, TypedExpr):
+                raise ValueError(
+                    "Assign.from_expressions expects all values to be of type TypedExpr",
+                    f"found `{type(v)}` for column `{k}`",
+                )
+        return Assign(self, None, None, None, **kwargs)
 
     def signature(self):
         if isinstance(self.node, Dataset):
@@ -427,27 +517,54 @@ def signature(self):
                 self.node._name,
                 self.func,
                 self.column,
+                (
+                    self.output_type.__name__
+                    if self.output_type is not None
+                    else None
+                ),
+            )
+        if self.assign_type == UDFType.python:
+            return fhash(
+                self.node.signature(),
+                self.func,
+                self.column,
                 self.output_type.__name__,
             )
-        return fhash(
-            self.node.signature(),
-            self.func,
-            self.column,
-            self.output_type.__name__,
-        )
+        else:
+            return fhash(
+                self.node.signature(),
+                self.output_expressions,
+            )
 
     def dsschema(self):
         input_schema = self.node.dsschema()
-        input_schema.update_column(self.column, get_pd_dtype(self.output_type))
+        if self.assign_type == UDFType.python:
+            input_schema.update_column(
+                self.column, get_pd_dtype(self.output_type)
+            )
+        else:
+            for col, expr in self.output_expressions.items():
+                input_schema.update_column(col, get_pd_dtype(expr.dtype))
         return input_schema
 
 
 class Filter(_Node):
-    def __init__(self, node: _Node, func: Callable):
+    def __init__(self, node: _Node, filter_fn: Callable | Expr):
         super().__init__()
         self.node = node
         self.node.out_edges.append(self)
-        self.func = func  # noqa: E731
+        self.filter_expr = None
+        self.func = None  # noqa: E731
+        if isinstance(filter_fn, Callable):  # type: ignore
+            self.filter_type = UDFType.python
+            self.func = filter_fn
+        elif isinstance(filter_fn, Expr):  # type: ignore
+            self.filter_type = UDFType.expr
+            self.filter_expr = filter_fn
+        else:
+            raise ValueError(
+                f"Filter expects either a lambda function or an expression object, found {type(filter_fn)}"
+            )
 
     def signature(self):
         if isinstance(self.node, Dataset):
@@ -2325,6 +2442,7 @@ def __init__(self):
     def validate(self, pipe: Pipeline) -> DSSchema:
         self.pipeline_name = pipe.name
+        self.dsname = pipe.dataset_name
         return self.visit(pipe.terminal_node)
 
     def visit(self, obj) -> DSSchema:
@@ -2381,6 +2499,12 @@ def visitFilter(self, obj) -> DSSchema:
             f"Cannot add node 'Filter' after a terminal node in pipeline : `{self.pipeline_name}`."
         )
         input_schema.name = f"'[Pipeline:{self.pipeline_name}]->filter node'"
+        if obj.filter_type == UDFType.expr:
+            expr_type = obj.filter_expr.typeof(input_schema.schema())
+            if expr_type != bool:
+                raise TypeError(
+                    f"Filter expression must return type bool, found {dtype_to_string(expr_type)}."
+                )
         return input_schema
 
     def visitAggregate(self, obj) -> DSSchema:
@@ -2764,10 +2888,46 @@ def visitAssign(self, obj) -> DSSchema:
             f"Cannot add node 'Assign' after a terminal node in pipeline : `{self.pipeline_name}`."
         )
         output_schema_name = f"'[Pipeline:{self.pipeline_name}]->assign node'"
-        if obj.column is None or len(obj.column) == 0:
-            raise ValueError(
-                f"invalid assign - {output_schema_name} must specify a column to assign"
-            )
+        if obj.assign_type == UDFType.expr:
+            if len(obj.output_expressions) == 0:
+                raise ValueError(
+                    f"invalid assign - {output_schema_name} must have at least one column to assign"
+                )
+            # Ensure there are no duplicate columns
+            if len(obj.output_expressions) != len(
+                set(obj.output_expressions.keys())
+            ):
+                raise ValueError(
+                    f"invalid assign - {output_schema_name} cannot have duplicate columns"
+                )
+            # Fetch the type for every column and match it against the type provided in the expression
+            type_errors = []
+            for col, typed_expr in obj.output_expressions.items():
+                try:
+                    expr_type = typed_expr.expr.typeof(input_schema.schema())
+                except Exception as e:
+                    raise ValueError(
+                        f"invalid assign - {output_schema_name} error in expression for column `{col}`: {str(e)}"
+                    )
+                if typed_expr.dtype != expr_type:
+                    printer = ExprPrinter()
+                    type_errors.append(
+                        f"'{col}' is of type `{dtype_to_string(typed_expr.dtype)}`, can not be cast to `{dtype_to_string(expr_type)}`. Full expression: `{printer.print(typed_expr.expr.root)}`"
+                    )
+
+            if len(type_errors) > 0:
+                joined_errors = "\n\t".join(type_errors)
+                raise TypeError(
+                    f"found type errors in assign node of `{self.dsname}.{self.pipeline_name}`:\n\t{joined_errors}"
+                )
+
+        else:
+            if obj.column is None or len(obj.column) == 0:
+                raise ValueError(
+                    f"invalid assign - {output_schema_name} must specify a column to assign"
+                )
+
         val_fields = input_schema.values.keys()
         if (
             obj.column in input_schema.keys
diff --git a/fennel/datasets/test_dataset.py b/fennel/datasets/test_dataset.py
index c7416e88f..e25c250cd 100644
--- a/fennel/datasets/test_dataset.py
+++ b/fennel/datasets/test_dataset.py
@@ -561,107 +561,6 @@ def test_nested_dataset():
     )
 
 
-# TODO(mohit): Uncomment once support for ondemand funcs is added on protos
-#
-# def test_dataset_with_pull():
-#     API_ENDPOINT_URL = "http://transunion.com/v1/credit_score"
-
-#     @meta(owner="test@test.com")
-#     @dataset(
-#         history="1y",
-#     )
-#     class UserCreditScore:
-#         user_id: int = field(key=True)
-#         name: str = field(key=True)
-#         credit_score: float
-#         timestamp: datetime
-
-#     @on_demand(expires_after="7d")
-#     def pull_from_api(
-#         cls, ts: pd.Series, user_id: pd.Series[int], names: pd.Series[str]
-#     ) -> pd.DataFrame:
-#         user_list = user_id.tolist()
-#         names = names.tolist()
-#         resp = requests.get(
-#             API_ENDPOINT_URL, json={"users": user_list, "names": names}
-#         )
-#         df = pd.DataFrame(columns=["user_id", "credit_score", "timestamp"])
-#         if resp.status_code != 200:
-#             return df
-#         results = resp.json()["results"]
-#         df[str(cls.user_id)] = user_id
-#         df[str(cls.name)] = names
-#         df[str(cls.timestamp)] = ts
-#         df[str(cls.credit_score)] = pd.Series(results)
-#         return df, pd.Series([True] * len(df))

-#     assert UserCreditScore._history == timedelta(days=365)
-#     view = InternalTestClient()
-#     view.add(UserCreditScore)
-#     sync_request = view._get_sync_request_proto()
-#     assert len(sync_request.datasets) == 1
-#     d = {
-#         "name": "UserCreditScore",
-#         "fields": [
-#             {
-#                 "name": "user_id",
-#                 "ftype": "Key",
-#                 "dtype": {"scalarType": "INT"},
-#                 "metadata": {},
-#             },
-#             {
-#                 "name": "name",
-#                 "ftype": "Key",
-#                 "dtype": {"scalarType": "STRING"},
-#                 "metadata": {},
-#             },
-#             {
-#                 "name": "credit_score",
-#                 "ftype": "Val",
-#                 "dtype": {"scalarType": "FLOAT"},
-#                 "metadata": {},
-#             },
-#             {
-#                 "name": "timestamp",
-#                 "ftype": "Timestamp",
-#                 "dtype": {"scalarType": "TIMESTAMP"},
-#                 "metadata": {},
-#             },
-#         ],
-#         "mode": "pandas",
-#         "metadata": {"owner": "test@test.com"},
-#         "history": "31536000000000",
-#         "onDemand": {"expiresAfter": "604800000000"},
-#     }

-#     # Ignoring schema validation since they are bytes and not human-readable
-#     dataset_req = sync_request.datasets[0]
-#     expected_ds_request = ParseDict(d, ds_proto.CoreDataset())
-#     assert dataset_req == expected_ds_request, error_message(
-#         dataset_req, expected_ds_request
-#     )

-#     with pytest.raises(TypeError) as e:

-#         @meta(owner="test@test.com")
-#         @dataset(history="1y")
-#         class UserCreditScore2:
-#             user_id: int = field(key=True)
-#             credit_score: float
-#             timestamp: datetime

-#         @on_demand
-#         def pull_from_api(
-#             cls, user_id: pd.Series, names: pd.Series, timestamps: pd.Series
-#         ) -> pd.DataFrame:
-#             pass

-#     assert (
-#         str(e.value) == "on_demand must be defined with a parameter "
-#         "expires_after of type Duration for eg: 30d."
-#     )
-
-
 def test_dataset_with_pipes():
     @meta(owner="test@test.com")
     @dataset
diff --git a/fennel/datasets/test_invalid_dataset.py b/fennel/datasets/test_invalid_dataset.py
index 7e670de79..3ab4667b8 100644
--- a/fennel/datasets/test_invalid_dataset.py
+++ b/fennel/datasets/test_invalid_dataset.py
@@ -18,6 +18,7 @@
     index,
 )
 from fennel.dtypes import struct, Window, Continuous, Session
+from fennel.expr import F
 from fennel.lib import (
     meta,
     inputs,
@@ -220,6 +221,105 @@ class RatingActivity:
     t: datetime
 
 
+def strip_whitespace(s):
+    return "".join(s.split())
+
+
+def test_incorrect_assign_expr_type():
+    with pytest.raises(TypeError) as e:
+
+        @meta(owner="test@test.com")
+        @dataset
+        class RatingActivityTransformed:
+            userid: int
+            rating_sq: float
+            movie_suffixed: str
+            t: datetime
+
+            @pipeline
+            @inputs(RatingActivity)
+            def transform(cls, rating: Dataset):
+                return rating.assign(
+                    rating_sq=(F("rating") * F("rating")).astype(str),
+                    movie_suffixed=F("movie").str.concat("_suffix").astype(int),
+                ).drop("rating", "movie")
+
+    expected_err = (
+        "found type errors in assign node of `RatingActivityTransformed.transform`:\n"
+        + "\t'rating_sq' is of type `str`, can not be cast to `float`. Full expression: `(Ref('rating') * Ref('rating'))`\n"
+        + "\t'movie_suffixed' is of type `int`, can not be cast to `str`. Full expression: `Ref('movie') + \"_suffix\"`"
+    )
+
+    assert str(e.value) == expected_err
+
+    with pytest.raises(TypeError) as e2:
+
+        @meta(owner="test@test.com")
+        @dataset
+        class RatingActivityTransformed2:
+            userid: int
+            rating_sq: int
+            movie_suffixed: str
+            t: datetime
+
+            @pipeline
+            @inputs(RatingActivity)
+            def transform(cls, rating: Dataset):
+                return rating.assign(
+                    rating_sq=(F("rating") * F("rating")).astype(float),
+                    movie_suffixed=F("movie").str.concat("_suffix").astype(str),
+                ).drop("rating", "movie")
+
+    assert (
+        str(e2.value)
+        == """[TypeError('Field `rating_sq` has type `float` in `pipeline transform output value` schema but type `int` in `RatingActivityTransformed2 value` schema.')]"""
+    )
+
+    with pytest.raises(ValueError) as e2:
+
+        @meta(owner="test@test.com")
+        @dataset
+        class RatingActivityTransformed3:
+            userid: int
+            rating_sq: int
+            movie_suffixed: str
+            t: datetime
+
+            @pipeline
+            @inputs(RatingActivity)
+            def transform(cls, rating: Dataset):
+                return rating.assign(
+                    rating_sq=(F("rating") % F("rating")).astype(float),
+                    movie_suffixed=(F("movie") + "_suffix").astype(str),
+                ).drop("rating", "movie")
+
+    assert (
+        str(e2.value)
+        == """invalid assign - '[Pipeline:transform]->assign node' error in expression for column `movie_suffixed`: Failed to compile expression: invalid expression: both sides of '+' must be numeric types but found String & String, left: col(movie), right: lit(String("_suffix"))"""
+    )
+
+
+def test_incorrect_filter_expr_type():
+    with pytest.raises(TypeError) as e:
+
+        @meta(owner="test@test.com")
+        @dataset
+        class RatingActivityFiltered:
+            userid: int
+            rating: float
+            t: datetime
+
+            @pipeline
+            @inputs(RatingActivity)
+            def transform(cls, rating: Dataset):
+                return rating.filter(F("rating") + 3.5).drop("movie")
+
+    assert (
+        str(e.value)
+        == """Filter expression must return type bool, found float."""
+    )
+
+
 def test_incorrect_aggregate():
     with pytest.raises(ValueError) as e:
 
diff --git a/fennel/datasets/test_schema_validator.py b/fennel/datasets/test_schema_validator.py
index 697f6502f..dd4b5c929 100644
--- a/fennel/datasets/test_schema_validator.py
+++ b/fennel/datasets/test_schema_validator.py
@@ -1180,11 +1180,7 @@ class RatingActivity1:
     @pipeline
     @inputs(RatingActivity)
     def create_dataset(cls, activity: Dataset):
-        return activity.assign(
-            name="t",
-            dtype=float,
-            func=lambda df: float(df["t"]),
-        )
+        return activity.assign("t", float, lambda df: float(df["t"]))
 
     assert (
         str(e.value)
@@ -1203,11 +1199,7 @@ class RatingActivity2:
     @pipeline
     @inputs(RatingActivity)
     def create_dataset(cls, activity: Dataset):
-        return activity.assign(
-            name="",
-            dtype=float,
-            func=lambda df: float(df["t"]),
-        )
+        return activity.assign("", float, lambda df: float(df["t"]))
 
     assert (
         str(e.value)
@@ -1226,11 +1218,7 @@ class RatingActivity3:
     @pipeline
     @inputs(RatingActivity)
     def create_dataset(cls, activity: Dataset):
-        return activity.assign(
-            name="",
-            dtype=float,
-            func=lambda df: float(df["t"]),
-        )
+        return activity.assign("", float, lambda df: float(df["t"]))
 
     assert (
         str(e.value)
@@ -1249,9 +1237,7 @@ class RatingActivity4:
     @inputs(RatingActivity)
     def create_dataset(cls, activity: Dataset):
         return activity.assign(
-            name="userid",
-            dtype=float,
-            func=lambda df: float(df["userid"]),
+            "userid", float, lambda df: float(df["userid"])
         )
 
     assert (
diff --git a/fennel/expr/__init__.py b/fennel/expr/__init__.py
new file mode 100644
index 000000000..243f83e54
--- /dev/null
+++ b/fennel/expr/__init__.py
@@ -0,0 +1 @@
+from fennel.expr.expr import F, lit, when, Expr
diff --git a/fennel/expr/dict.py b/fennel/expr/dict.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fennel/expr/expr.py b/fennel/expr/expr.py
new file mode 100644
index 000000000..d45f93968
--- /dev/null
+++ b/fennel/expr/expr.py
@@ -0,0 +1,829 @@
+from __future__ import annotations
+from enum import Enum
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Type, Optional
+
+import json
+
+from fennel.dtypes.dtypes import FENNEL_STRUCT
+import pandas as pd
+from fennel.internal_lib.schema.schema import from_proto
+import pyarrow as pa
+from fennel_data_lib import eval, type_of
+from fennel.internal_lib.schema import get_datatype
+import fennel.gen.schema_pb2 as schema_proto
+
+
+class InvalidExprException(Exception):
+    pass
+
+
+class TypedExpr:
+    def __init__(self, expr: Expr, dtype: Type):
+        self.expr = expr
+        self.dtype = dtype
+
+
+class Expr(object):
+    def __init__(self, root=None):
+        self.nodeid = id(self)
+        self.inline = False
+        self.root = self if root is None else root
+
+    def astype(self, dtype: Type) -> TypedExpr:
+        return TypedExpr(self, dtype)
+
+    @property
+    def num(self):
+        return _Number(self, MathNoop())
+
+    @property
+    def str(self):
+        return _String(self, StringNoop())
+
+    @property
+    def dict(self):
+        return _Dict(self, DictNoop())
+
+    @property
+    def struct(self):
+        return _Struct(self, StructNoop())
+
+    @property
+    def dt(self):
+        return _DateTime(self, DateTimeNoop())
+
+    @property
+    def list(self):
+        return _List(self, ListNoop())
+
+    def isnull(self):
+        return IsNull(self)
+
+    def fillnull(self, value: Any):
+        return FillNull(self, value)
+
+    # We also add all the math functions
+
+    def abs(self) -> _Number:
+        return _Number(self, Abs())
+
+    def round(self, precision: int) -> _Number:
+        return _Number(self, Round(precision))
+
+    def ceil(self) -> _Number:
+        return _Number(self, Ceil())
+
+    def floor(self) -> _Number:
+        return _Number(self, Floor())
+
+    def __getitem__(self, item: Any) -> Expr:
+        item = make_expr(item)
+        if not isinstance(item, Expr):
+            raise InvalidExprException(
+                "'[]' operation can only take expression but got '%s'"
+                % type(item)
+            )
+        return Binary(self, "[]", item)
+
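
The operator overloads on Expr build an expression tree lazily rather than computing anything eagerly; evaluation happens later against a schema and a DataFrame via the Rust library. A minimal usage sketch, mirroring test_basic_expr2 in fennel/expr/test_expr.py later in this diff:

import pandas as pd
from fennel.expr import F

# Build the tree once, then type-check and evaluate it against a schema.
expr = F("a") + F("b") + 3
df = pd.DataFrame({"a": [1, 2], "b": [5, 6]})
assert expr.typeof({"a": int, "b": int}) == int
assert expr.eval(df, {"a": int, "b": int}).tolist() == [9, 11]
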
+    def __nonzero__(self):
+        raise InvalidExprException("can not convert: '%s' to bool" % self)
+
+    def __bool__(self):
+        raise InvalidExprException("can not convert: '%s' to bool" % self)
+
+    def __add__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'+' only allowed between expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "+", other)
+
+    def __radd__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'+' only allowed between expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "+", self)
+
+    def __or__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'or' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "or", other)
+
+    def __ror__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'or' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "or", self)
+
+    def __eq__(self, other: Any) -> Expr:  # type: ignore[override]
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'==' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "==", other)
+
+    def __ne__(self, other: Any) -> Expr:  # type: ignore[override]
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'!=' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "!=", other)
+
+    def __ge__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'>=' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, ">=", other)
+
+    def __gt__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'>' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, ">", other)
+
+    def __le__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'<=' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "<=", other)
+
+    def __lt__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'<' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "<", other)
+
+    def __sub__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'-' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "-", other)
+
+    def __rsub__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'-' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "-", self)
+
+    def __mul__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'*' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "*", other)
+
+    def __rmul__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'*' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "*", self)
+
+    def __truediv__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'/' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "/", other)
+
+    def __rtruediv__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'/' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "/", self)
+
+    def __floordiv__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'//' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "//", other)
+
+    def __rfloordiv__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'//' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "//", self)
+
+    def __mod__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'%%' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "%", other)
+
+    def __rmod__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'%%' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "%", self)
+
+    def __and__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'and' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(self, "and", other)
+
+    def __rand__(self, other: Any) -> Expr:
+        other = make_expr(other)
+        if not isinstance(other, Expr):
+            raise InvalidExprException(
+                "'and' only allowed between two expressions but got: '%s' instead"
+                % other
+            )
+        return Binary(other, "and", self)
+
+    def __invert__(self) -> Expr:
+        return Unary("~", self)
+
+    def __xor__(self, other: Any):
+        raise InvalidExprException("binary operation 'xor' not supported")
+
+    def __rxor__(self, other: Any):
+        raise InvalidExprException("binary operation 'xor' not supported")
+
+    def __hash__(self) -> int:
+        return self.nodeid
+
+    def typeof(self, schema: Dict) -> Type:
+        from fennel.expr.serializer import ExprSerializer
+
+        serializer = ExprSerializer()
+        proto_expr = serializer.serialize(self.root)
+        proto_bytes = proto_expr.SerializeToString()
+        proto_schema = {}
+        for key, value in schema.items():
+            proto_schema[key] = get_datatype(value).SerializeToString()
+        type_bytes = type_of(proto_bytes, proto_schema)
+        datatype = schema_proto.DataType()
+        datatype.ParseFromString(type_bytes)
+        return from_proto(datatype)
+
+    def eval(self, input_df: pd.DataFrame, schema: Dict) -> pd.Series:
+        from fennel.expr.serializer import ExprSerializer
+
+        def convert_object(obj):
+            if isinstance(obj, list):
+                result = [convert_object(i) for i in obj]
+            elif isinstance(obj, dict):
+                result = {}
+                for key in obj:
+                    result[key] = convert_object(obj[key])
+            elif hasattr(obj, "as_json"):
+                result = obj.as_json()
+            else:
+                result = obj
+            return result
+
+        def convert_objects(df):
+            for col in df.columns:
+                df[col] = df[col].apply(convert_object)
+            return df
+
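+        # Hedged note: the helpers below bridge pandas and Arrow before calling
+        # into the Rust library. A Series becomes a pa.Array, a DataFrame
+        # becomes a pa.RecordBatch, and an optional Arrow schema selects and
+        # validates the columns the expression needs.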
+        def pd_to_pa(pd_data, schema=None):
+            # Schema unspecified - as in the case with lookups
+            if not schema:
+                if isinstance(pd_data, pd.Series):
+                    pd_data = pd_data.apply(convert_object)
+                    return pa.Array.from_pandas(pd_data)
+                elif isinstance(pd_data, pd.DataFrame):
+                    pd_data = convert_objects(pd_data)
+                    return pa.RecordBatch.from_pandas(
+                        pd_data, preserve_index=False
+                    )
+                else:
+                    raise ValueError("only pd.Series or pd.Dataframe expected")
+
+            # Single column expected
+            if isinstance(schema, pa.Field):
+                # extra columns may have been provided
+                if isinstance(pd_data, pd.DataFrame):
+                    if schema.name not in pd_data:
+                        raise ValueError(
+                            f"Dataframe does not contain column {schema.name}"
+                        )
+                    # df -> series
+                    pd_data = pd_data[schema.name]
+
+                if not isinstance(pd_data, pd.Series):
+                    raise ValueError("only pd.Series or pd.Dataframe expected")
+                pd_data = pd_data.apply(convert_object)
+                return pa.Array.from_pandas(pd_data, type=schema.type)
+
+            # Multiple columns case: use the columns we need
+            result_df = pd.DataFrame()
+            for col in schema.names:
+                if col not in pd_data:
+                    raise ValueError(f"Dataframe does not contain column {col}")
+                result_df[col] = pd_data[col].apply(convert_object)
+            return pa.RecordBatch.from_pandas(
+                result_df, preserve_index=False, schema=schema
+            )
+
+        def pa_to_pd(pa_data):
+            return pa_data.to_pandas(types_mapper=pd.ArrowDtype)
+
+        serializer = ExprSerializer()
+        proto_expr = serializer.serialize(self.root)
+        proto_bytes = proto_expr.SerializeToString()
+        df_pa = pd_to_pa(input_df)
+        proto_schema = {}
+        for key, value in schema.items():
+            proto_schema[key] = get_datatype(value).SerializeToString()
+        arrow_col = eval(proto_bytes, df_pa, proto_schema)
+        return pa_to_pd(arrow_col)
+
+
+class _Bool(Expr):
+    def __init__(self, expr: Expr):
+        self.expr = expr
+        super(_Bool, self).__init__()
+
+    def __str__(self) -> str:
+        return f"{self.expr}"
+
+
+#########################################################
+# Math Functions
+#########################################################
+
+
+class MathOp:
+    pass
+
+
+@dataclass
+class Round(MathOp):
+    precision: int
+
+
+class Abs(MathOp):
+    pass
+
+
+class Ceil(MathOp):
+    pass
+
+
+class Floor(MathOp):
+    pass
+
+
+class MathNoop(MathOp):
+    pass
+
+
+class _Number(Expr):
+    def __init__(self, expr: Expr, op: MathOp):
+        self.op = op
+        self.operand = expr
+        super(_Number, self).__init__()
+
+    def abs(self) -> _Number:
+        return _Number(self, Abs())
+
+    def round(self, precision: int) -> _Number:
+        return _Number(self, Round(precision))
+
+    def ceil(self) -> _Number:
+        return _Number(self, Ceil())
+
+    def floor(self) -> _Number:
+        return _Number(self, Floor())
+
+
+#########################################################
+# String Functions
+#########################################################
+
+
+class StringOp:
+    pass
+
+
+@dataclass
+class StrContains(StringOp):
+    item: Expr
+
+
+class Lower(StringOp):
+    pass
+
+
+class Upper(StringOp):
+    pass
+
+
+class StrLen(StringOp):
+    pass
+
+
+class StringNoop(StringOp):
+    pass
+
+
+@dataclass
+class Concat(StringOp):
+    other: Expr
+
+
+class _String(Expr):
+
+    def __init__(self, expr: Expr, op: StringOp):
+        self.op = op
+        self.operand = expr
+        super(_String, self).__init__()
+
+    def lower(self) -> _String:
+        return _String(self, Lower())
+
+    def upper(self) -> _String:
+        return _String(self, Upper())
+
+    def contains(self, item) -> _Bool:
+        item_expr = make_expr(item)
+        return _Bool(_String(self, StrContains(item_expr)))
+
+    def concat(self, other: Expr) -> _String:
+        other = make_expr(other)
+        return _String(self, Concat(other))
+
+    def len(self) -> _Number:
+        return _Number(_String(self, StrLen()), MathNoop())
+
+
+#########################################################
+# Dict Functions
+#########################################################
+
+
+class DictOp:
+    pass
+
+
+class DictLen(DictOp):
+    pass
+
+
+@dataclass
+class DictGet(DictOp):
+    key: Expr
+    default: Optional[Expr]
+
+
+class DictNoop(DictOp):
+    pass
+
+
+@dataclass
+class DictContains(DictOp):
+    field: str
+
+
+class _Dict(Expr):
+    def __init__(self, expr: Expr, op: DictOp):
+        self.op = op
+        self.expr = expr
+        super(_Dict, self).__init__()
+
+    def get(self, key: str, default: Optional[Expr] = None) -> Expr:
+        key = make_expr(key)
+        default = make_expr(default) if default is not None else None  # type: ignore
+        return _Dict(self, DictGet(key, default))  # type: ignore
+
+    def len(self) -> Expr:
+        return _Number(_Dict(self, DictLen()), MathNoop())
+
+    def contains(self, field: Expr) -> Expr:
+        field = make_expr(field)
+        return Expr(_Dict(self, DictContains(field)))  # type: ignore
+
+
+#########################################################
+# Struct Functions
+#########################################################
+
+
+class StructOp:
+    pass
+
+
+@dataclass
+class StructGet(StructOp):
+    key: Expr
+
+
+class StructNoop(StructOp):
+    pass
+
+
+class _Struct(Expr):
+    pass
+
+
+#########################################################
+# DateTime Functions
+#########################################################
+
+
+class DateTimeOp:
+    pass
+
+
+class DateTimeNoop(DateTimeOp):
+    pass
+
+
+class _DateTime(Expr):
+    pass
+
+
+#########################################################
+# List Functions
+#########################################################
+
+
+class ListOp:
+    pass
+
+
+class ListLen(ListOp):
+    pass
+
+
+@dataclass
+class ListContains(ListOp):
+    item: Expr
+
+
+@dataclass
+class ListGet(ListOp):
+    index: Expr
+
+
+class ListNoop(ListOp):
+    pass
+
+
+class _List(Expr):
+    pass
+
+
+#######################################################
+
+
+class Literal(Expr):
+    def __init__(self, c: Any, type: Type):
+        super(Literal, self).__init__()
+        if getattr(c.__class__, FENNEL_STRUCT, False):
+            val = json.dumps(c.as_json())
+        else:
+            try:
+                val = json.dumps(c)
+            except TypeError:
+                val = json.dumps(str(c))
+        self.c = val
+        self.dtype = type
+
+
+class Unary(Expr):
+    def __init__(self, op: str, operand: Any):
+        valid = ("~", "len", "str")
+        if op not in valid:
+            raise InvalidExprException(
+                "unary expressions only support %s but given '%s'"
+                % (", ".join(valid), op)
+            )
+        operand = make_expr(operand)
+        if not isinstance(operand, Expr):
+            raise InvalidExprException(
+                "operand can only be an expression but got %s instead" % operand
+            )
+        self.op = op
+        self.operand = operand
+        super(Unary, self).__init__()
+
+    def __str__(self) -> str:
+        if self.op in ["len", "str"]:
+            return f"{self.op}({self.operand})"
+        else:
+            return f"{self.op}{self.operand}"
+
+
+class Binary(Expr):
+    def __init__(self, left: Any, op: str, right: Any):
+        valid = (
+            "+",
+            "-",
+            "*",
+            "/",
+            "//",
+            "%",
+            "and",
+            "or",
+            "==",
+            ">=",
+            ">",
+            "<",
+            "<=",
+            "!=",
+            "[]",
+            "in",
+        )
+        if op not in valid:
+            raise InvalidExprException(
+                "binary expressions only support %s but given '%s'"
+                % (", ".join(valid), op)
+            )
+        left = make_expr(left)
+        right = make_expr(right)
+        if not isinstance(left, Expr):
+            raise InvalidExprException(
+                "left can only be an expression but got %s instead" % left
+            )
+        if not isinstance(right, Expr):
+            raise InvalidExprException(
+                "right can only be an expression but got %s instead" % right
+            )
+        self.left = left
+        self.op = op
+        self.right = right
+
+        super(Binary, self).__init__(None)
+
+    def __str__(self) -> str:
+        if self.op == "[]":
+            return f"{self.left}[{self.right}]"
+        else:
+            return f"{self.left} {self.op} {self.right}"
+
+
+class When(Expr):
+    def __init__(self, expr: Expr, root: Optional[Expr] = None):
+        self.expr = make_expr(expr)
+        self._then = None
+        super(When, self).__init__(self if root is None else root)
+
+    def then(self, expr: Expr) -> Then:
+        self._then = Then(expr, self.root)  # type: ignore
+        return self._then  # type: ignore
+
+
+class Then(Expr):
+    def __init__(self, expr: Expr, root: Optional[Expr] = None):
+        self.expr = make_expr(expr)
+        self._otherwise = None
+        self._chained_when = None
+        super(Then, self).__init__(self if root is None else root)
+
+    def when(self, expr: Expr) -> When:
+        self._chained_when = When(expr, self.root)  # type: ignore
+        return self._chained_when  # type: ignore
+
+    def otherwise(self, expr: Expr) -> Otherwise:
+        self._otherwise = Otherwise(make_expr(expr), self.root)  # type: ignore
+        return self._otherwise  # type: ignore
+
+
+class Otherwise(Expr):
+
+    def __init__(self, expr: Expr, root: Optional[Expr] = None):
+        self.expr = make_expr(expr)
+        super(Otherwise, self).__init__(self if root is None else root)
+
+
+class Ref(Expr):
+    def __init__(self, col: str):
+        if not isinstance(col, str):
+            raise InvalidExprException(
+                f"column name can only be a string but got {col} instead"
+            )
+        self._col = col
+        super(Ref, self).__init__()
+
+    def __str__(self) -> str:
+        return f"Ref('{self._col}')"
+
+
+class IsNull(Expr):
+    def __init__(self, expr: Expr):
+        self.expr = expr
+        super(IsNull, self).__init__()
+
+    def __str__(self) -> str:
+        return f"{self.expr} is null"
+
+
+class FillNull(Expr):
+    def __init__(self, expr: Expr, value: Any):
+        self.expr = expr
+        self.value = make_expr(value)
+        super(FillNull, self).__init__()
+
+    def __str__(self) -> str:
+        return f"fillnull({self.expr}, {self.value})"
+
+
+def make_expr(v: Any) -> Any:
+    """Tries to convert v to Expr. Throws an exception if conversion is not possible."""
+    if isinstance(v, Expr):
+        return v
+    elif isinstance(v, Callable):  # type: ignore
+        raise TypeError(
+            f"Functions cannot be converted to an expression, found {v}"
+        )
+    elif isinstance(v, TypedExpr):
+        raise TypeError(
+            "astype() must be used as a standalone operation on the entire expression, syntax: ().astype(). It cannot be combined with other expressions."
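+        )
+    # Values that are neither Expr, TypedExpr, nor callable fall through to
+    # lit(), which wraps plain Python values as typed literals.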
+    else:
+        return lit(v)
+
+
+#################################################################
+#                      Top level functions                      #
+#################################################################
+
+
+def F(col: str) -> Expr:
+    return Ref(col)
+
+
+def lit(v: Any, type: Optional[Type] = None) -> Expr:
+    # TODO: Add support for more types recursively
+    if type is not None:
+        return Literal(v, type)
+    elif isinstance(v, bool):
+        # bool must be checked before int since bool is a subclass of int
+        return Literal(v, bool)
+    elif isinstance(v, int):
+        return Literal(v, int)
+    elif isinstance(v, float):
+        return Literal(v, float)
+    elif isinstance(v, str):
+        return Literal(v, str)
+    elif v is None:
+        return Literal(v, None)  # type: ignore
+    else:
+        raise Exception(
+            f"Cannot infer type of literal {v}, please provide type"
+        )
+
+
+def when(expr: Expr) -> When:
+    return When(expr)
diff --git a/fennel/expr/gen_lib/fennel_data_lib-0.1.0-cp311-cp311-macosx_11_0_arm64.whl b/fennel/expr/gen_lib/fennel_data_lib-0.1.0-cp311-cp311-macosx_11_0_arm64.whl
new file mode 100644
index 000000000..faabe6086
Binary files /dev/null and b/fennel/expr/gen_lib/fennel_data_lib-0.1.0-cp311-cp311-macosx_11_0_arm64.whl differ
diff --git a/fennel/expr/list.py b/fennel/expr/list.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fennel/expr/math.py b/fennel/expr/math.py
new file mode 100644
index 000000000..ec97f0b15
--- /dev/null
+++ b/fennel/expr/math.py
@@ -0,0 +1,3 @@
+from enum import Enum
+
+from fennel.expr.expr import Expr
diff --git a/fennel/expr/serializer.py b/fennel/expr/serializer.py
new file mode 100644
index 000000000..10167fca2
--- /dev/null
+++ b/fennel/expr/serializer.py
@@ -0,0 +1,248 @@
+from typing import Any, List, Tuple
+import json
+
+from fennel.dtypes.dtypes import FENNEL_STRUCT
+
+from .visitor import Visitor
+import fennel.gen.expr_pb2 as proto
+from fennel.internal_lib.schema import get_datatype
+
+from fennel.expr.expr import (
+    Literal,
+    Ref,
+    Unary,
+    When,
+    Then,
+    Otherwise,
+    Binary,
+    IsNull,
+    FillNull,
+    _Bool,
+    _Dict,
+    _Struct,
+    _List,
+    _Number,
+    _String,
+    InvalidExprException,
+    MathNoop,
+    Round,
+    Ceil,
+    Abs,
+    Floor,
+    StringNoop,
+    StrLen,
+    Lower,
+    Upper,
+    StrContains,
+    Concat,
+    DictContains,
+    DictGet,
+    DictLen,
+    DictNoop,
+)
+
+
+class ExprSerializer(Visitor):
+    def __init__(self):
+        super(ExprSerializer, self).__init__()
+
+    def visit(self, obj):
+        ret = super(ExprSerializer, self).visit(obj)
+        return ret
+
+    def serialize(self, obj, second_pass=False):
+        return self.visit(obj)
+
+    def visitLiteral(self, obj):
+        expr = proto.Expr()
+        val = val_as_json(obj.c)
+        expr.json_literal.literal = val
+        expr.json_literal.dtype.CopyFrom(get_datatype(obj.dtype))
+        return expr
+
+    def visitRef(self, obj):
+        expr = proto.Expr()
+        expr.ref.name = obj._col
+        return expr
+
+    def visitUnary(self, obj):
+        expr = proto.Expr()
+        expr.unary.op = obj.op
+        operand = self.visit(obj.operand)
+        expr.unary.operand.CopyFrom(operand)
+        return expr
+
+    def visitBinary(self, obj):
+        expr = proto.Expr()
+        if obj.op == "and":
+            expr.binary.op = proto.BinOp.AND
+        elif obj.op == "or":
+            expr.binary.op = proto.BinOp.OR
+        elif obj.op == "+":
+            expr.binary.op = proto.BinOp.ADD
+        elif obj.op == "-":
+            expr.binary.op = proto.BinOp.SUB
+        elif obj.op == "*":
+            expr.binary.op = proto.BinOp.MUL
+        elif obj.op == "/":
+            expr.binary.op = proto.BinOp.DIV
+        elif obj.op == "//":
+            expr.binary.op = proto.BinOp.FLOOR_DIV
+        elif obj.op == "%":
+            expr.binary.op = proto.BinOp.MOD
+        elif obj.op == "==":
+            expr.binary.op = proto.BinOp.EQ
+        elif obj.op == "!=":
+            expr.binary.op = proto.BinOp.NE
+        elif obj.op == ">":
+            expr.binary.op = proto.BinOp.GT
+        elif obj.op == "<":
+            expr.binary.op = proto.BinOp.LT
+        elif obj.op == ">=":
+            expr.binary.op = proto.BinOp.GTE
+        elif obj.op == "<=":
+            expr.binary.op = proto.BinOp.LTE
+        else:
+            raise InvalidExprException("invalid binary operation: %s" % obj.op)
+        left = self.visit(obj.left)
+        right = self.visit(obj.right)
+        expr.binary.left.CopyFrom(left)
+        expr.binary.right.CopyFrom(right)
+        return expr
+
+    def visitIsNull(self, obj):
+        expr = proto.Expr()
+        expr.isnull.operand.CopyFrom(self.visit(obj.expr))
+        return expr
+
+    def visitFillNull(self, obj):
+        return "FILL NULL(%s, %s)" % (
+            self.visit(obj.expr),
+            self.visit(obj.value),
+        )
+
+    def visitWhen(self, obj):
+        expr = proto.Expr()
+        case = proto.Case()
+        cur_when = obj
+        when_then_pairs: List[Tuple[When, Then]] = []
+        while cur_when is not None:
+            if cur_when._then is None:
+                raise InvalidExprException(
+                    f"THEN clause missing for WHEN clause {self.visit(cur_when)}"
+                )
+            when_then_pairs.append((cur_when, cur_when._then))
+            cur_when = cur_when._then._chained_when
+
+        case.when_then.extend(
+            [
+                proto.WhenThen(
+                    when=self.visit(when.expr), then=self.visit(then.expr)
+                )
+                for when, then in when_then_pairs
+            ]
+        )
+        if when_then_pairs[-1][1]._otherwise is not None:
+            case.otherwise.CopyFrom(
+                self.visit(when_then_pairs[-1][1]._otherwise.expr)
+            )
+        expr.case.CopyFrom(case)
+        return expr
+
+    def visitThen(self, obj):
+        return self.visit(obj.expr)
+
+    def visitOtherwise(self, obj):
+        return self.visit(obj.expr)
+
+    def visitBool(self, obj):
+        return self.visit(obj.expr)
+
+    def visitNumber(self, obj):
+        expr = proto.Expr()
+        if isinstance(obj.op, MathNoop):
+            return self.visit(obj.operand)
+        elif isinstance(obj.op, Round):
+            expr.math_fn.fn.CopyFrom(
+                proto.MathOp(round=proto.Round(precision=obj.op.precision))
+            )
+        elif isinstance(obj.op, Ceil):
+            expr.math_fn.fn.CopyFrom(proto.MathOp(ceil=proto.Ceil()))
+        elif isinstance(obj.op, Abs):
+            expr.math_fn.fn.CopyFrom(proto.MathOp(abs=proto.Abs()))
+        elif isinstance(obj.op, Floor):
+            expr.math_fn.fn.CopyFrom(proto.MathOp(floor=proto.Floor()))
+        else:
+            raise InvalidExprException("invalid number operation: %s" % obj.op)
+        expr.math_fn.operand.CopyFrom(self.visit(obj.operand))
+        return expr
+
+    def visitString(self, obj):
+        expr = proto.Expr()
+        if isinstance(obj.op, StringNoop):
+            return self.visit(obj.operand)
+        elif isinstance(obj.op, StrLen):
+            expr.string_fn.fn.CopyFrom(proto.StringOp(len=proto.Len()))
+        elif isinstance(obj.op, Lower):
+            expr.string_fn.fn.CopyFrom(proto.StringOp(tolower=proto.ToLower()))
+        elif isinstance(obj.op, Upper):
+            expr.string_fn.fn.CopyFrom(proto.StringOp(toupper=proto.ToUpper()))
+        elif isinstance(obj.op, StrContains):
+            expr.string_fn.fn.CopyFrom(
+                proto.StringOp(
+                    contains=proto.Contains(element=self.visit(obj.op.item))
+                )
+            )
+        elif isinstance(obj.op, Concat):
+            expr.string_fn.fn.CopyFrom(
+                proto.StringOp(
+                    concat=proto.Concat(
+                        other=self.visit(obj.op.other),
+                    )
+                )
+            )
+        else:
+            raise InvalidExprException("invalid string operation: %s" % obj.op)
+        expr.string_fn.string.CopyFrom(self.visit(obj.operand))
+        return expr
+
+    def visitDict(self, obj):
+        expr = proto.Expr()
+        if isinstance(obj.op, DictNoop):
+            return self.visit(obj.expr)
+        elif isinstance(obj.op, DictContains):
+            expr.dict_fn.fn.CopyFrom(
+                proto.DictOp(
+                    contains=proto.Contains(element=self.visit(obj.op.item))
+                )
+            )
+        elif isinstance(obj.op, DictGet):
+            expr.dict_fn.fn.CopyFrom(
+                proto.DictOp(
+                    get=proto.DictGet(
diff --git a/fennel/expr/test_expr.py b/fennel/expr/test_expr.py
new file mode 100644
index 000000000..f8e04bb93
--- /dev/null
+++ b/fennel/expr/test_expr.py
@@ -0,0 +1,313 @@
+import pytest
+import pandas as pd
+from datetime import datetime
+from typing import Dict
+from fennel.datasets import dataset
+
+from fennel.expr import F, when
+from fennel.expr.visitor import ExprPrinter
+from fennel.expr.serializer import ExprSerializer
+from google.protobuf.json_format import ParseDict  # type: ignore
+from fennel.gen.expr_pb2 import Expr
+from fennel.testing.test_utils import error_message
+
+
+def test_basic_expr1():
+    expr = (F("num") + F("d")).isnull()
+    df = pd.DataFrame({"num": [1, 2, 3, 4], "d": [5, 6, 7, 8]})
+    assert expr.typeof({"num": int, "d": int}) == bool
+    ret = expr.eval(df, {"num": int, "d": int})
+    assert ret.tolist() == [False, False, False, False]
+
+
+def test_basic_expr2():
+    expr = F("a") + F("b") + 3
+    printer = ExprPrinter()
+    expected = "((Ref('a') + Ref('b')) + 3)"
+    assert expected == printer.print(expr.root)
+    serializer = ExprSerializer()
+    proto_expr = serializer.serialize(expr.root)
+    d = {
+        "binary": {
+            "left": {
+                "binary": {
+                    "left": {"ref": {"name": "a"}},
+                    "right": {"ref": {"name": "b"}},
+                }
+            },
+            "right": {
+                "jsonLiteral": {"literal": "3", "dtype": {"intType": {}}}
+            },
+        }
+    }
+    expected_expr = ParseDict(d, Expr())
+    assert expected_expr == proto_expr, error_message(proto_expr, expected_expr)
+
+    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
+
+    @dataset
+    class TestDataset:
+        a: int
+        b: int
+        t: datetime
+
+    ret = expr.eval(df, {"a": int, "b": int})
+    assert ret.tolist() == [9, 11, 13, 15]
+    ret = expr.eval(df, TestDataset.schema())
+    assert ret.tolist() == [9, 11, 13, 15]
+    assert expr.typeof({"a": int, "b": int}) == int
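The quoting of the literal above ("literal": "3", and '"p"' in later tests) follows from val_as_json in the serializer: every literal is shipped as a JSON-encoded string, so string values keep their double quotes. A small sanity check, again assuming the modules from this diff:

    from fennel.expr.serializer import val_as_json

    assert val_as_json(3) == "3"
    assert val_as_json(3.2) == "3.2"
    assert val_as_json("p") == '"p"'  # JSON-encoded, hence the extra quotes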
+def test_math_expr():
+    expr = (F("a").num.floor() + 3.2).num.ceil()
+    printer = ExprPrinter()
+    expected = "CEIL((FLOOR(Ref('a')) + 3.2))"
+    assert expected == printer.print(expr.root)
+    serializer = ExprSerializer()
+    proto_expr = serializer.serialize(expr.root)
+    d = {
+        "mathFn": {
+            "operand": {
+                "binary": {
+                    "left": {
+                        "mathFn": {
+                            "operand": {"ref": {"name": "a"}},
+                            "fn": {"floor": {}},
+                        }
+                    },
+                    "right": {
+                        "jsonLiteral": {
+                            "literal": "3.2",
+                            "dtype": {"doubleType": {}},
+                        }
+                    },
+                }
+            },
+            "fn": {"ceil": {}},
+        }
+    }
+    expected_expr = ParseDict(d, Expr())
+    assert expected_expr == proto_expr, error_message(proto_expr, expected_expr)
+    df = pd.DataFrame({"a": [1.4, 2.9, 3.1, 4.8], "b": ["a", "b", "c", "d"]})
+    ret = expr.eval(df, {"a": float})
+    assert ret.tolist() == [5, 6, 7, 8]
+    assert expr.typeof({"a": float}) == int
+
+    expr = (
+        when(F("a").num.floor() > 5)
+        .then(F("b"))
+        .when(F("a") > 3)
+        .then(F("a"))
+        .otherwise(1)
+    )
+    df = pd.DataFrame({"a": [1.4, 3.2, 6.1, 4.8], "b": [100, 200, 300, 400]})
+    ret = expr.eval(df, {"a": float, "b": int})
+    assert ret.tolist() == [1, 3.2, 300, 4.8]
+    assert expr.typeof({"a": float, "b": int}) == float
+
+
+def test_bool_expr():
+    expr = (F("a") == 5) | ((F("b") == "random") & (F("c") == 3.2))
+    printer = ExprPrinter()
+    expected = """((Ref('a') == 5) or ((Ref('b') == "random") and (Ref('c') == 3.2)))"""
+    assert expected == printer.print(expr.root)
+
+    df = pd.DataFrame(
+        {
+            "a": [4, 5, 3, 4],
+            "b": ["radfsfom", "random", "random", "random"],
+            "c": [3.2, 1.2, 3.4, 3.2],
+        }
+    )
+    ret = expr.eval(df, {"a": int, "b": str, "c": float})
+    assert ret.tolist() == [False, True, False, True]
+    assert expr.typeof({"a": int, "b": str, "c": float}) == bool
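One detail of test_bool_expr is worth spelling out: the DSL overloads the bitwise operators & and | for logical AND and OR (printed as "and"/"or" above), because Python's short-circuiting and/or keywords cannot be overloaded. Since & and | bind tighter than comparisons, every comparison must be parenthesized, as in the following sketch (hypothetical snippet, same assumptions as above):

    from fennel.expr import F

    ok = (F("a") == 5) | (F("b") == 3)  # builds Binary(op="or")
    # Without parentheses this would parse as F("a") == (5 | F("b")) == 3,
    # a chained comparison, not a logical OR:
    # bad = F("a") == 5 | F("b") == 3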
+def test_str_expr():
+    expr = (F("a").str.concat(F("b"))).str.lower().len().ceil()
+    printer = ExprPrinter()
+    expected = "CEIL(LEN(LOWER(Ref('a') + Ref('b'))))"
+    assert expected == printer.print(expr.root)
+
+    expr = (
+        when(((F("a").str.concat(F("b"))).str.upper()).str.contains(F("c")))
+        .then(F("b"))
+        .otherwise("No Match")
+    )
+    expected = """WHEN CONTAINS(UPPER(Ref('a') + Ref('b')), Ref('c')) THEN Ref('b') ELSE "No Match\""""
+    assert expected == printer.print(expr.root)
+    df = pd.DataFrame(
+        {
+            "a": ["p", "BRandomS", "CRandomStrin", "tqz"],
+            "b": ["aa", "tring", "g", "d"],
+            "c": [
+                "RANDOMSTRING",
+                "RANDOMSTRING",
+                "RANDOMSTRING",
+                "RANDOMSTRING",
+            ],
+        }
+    )
+    ret = expr.eval(df, {"a": str, "b": str, "c": str})
+    assert ret.tolist() == [
+        "No Match",
+        "tring",
+        "g",
+        "No Match",
+    ]
+    assert expr.typeof({"a": str, "b": str, "c": str}) == str
+    expr = (
+        when(F("a").str.contains("p"))
+        .then(F("b"))
+        .when(F("b").str.contains("b"))
+        .then(F("a"))
+        .when(F("c").str.contains("C"))
+        .then(F("c"))
+        .otherwise("No Match")
+    )
+    expected = """WHEN CONTAINS(Ref('a'), "p") THEN Ref('b') WHEN CONTAINS(Ref('b'), "b") THEN Ref('a') WHEN CONTAINS(Ref('c'), "C") THEN Ref('c') ELSE "No Match\""""
+    assert expected == printer.print(expr.root)
+    serializer = ExprSerializer()
+    proto_expr = serializer.serialize(expr.root)
+    d = {
+        "case": {
+            "whenThen": [
+                {
+                    "when": {
+                        "stringFn": {
+                            "string": {"ref": {"name": "a"}},
+                            "fn": {
+                                "contains": {
+                                    "element": {
+                                        "jsonLiteral": {
+                                            "literal": '"p"',
+                                            "dtype": {"stringType": {}},
+                                        }
+                                    }
+                                }
+                            },
+                        }
+                    },
+                    "then": {"ref": {"name": "b"}},
+                },
+                {
+                    "when": {
+                        "stringFn": {
+                            "string": {"ref": {"name": "b"}},
+                            "fn": {
+                                "contains": {
+                                    "element": {
+                                        "jsonLiteral": {
+                                            "literal": '"b"',
+                                            "dtype": {"stringType": {}},
+                                        }
+                                    }
+                                }
+                            },
+                        }
+                    },
+                    "then": {"ref": {"name": "a"}},
+                },
+                {
+                    "when": {
+                        "stringFn": {
+                            "string": {"ref": {"name": "c"}},
+                            "fn": {
+                                "contains": {
+                                    "element": {
+                                        "jsonLiteral": {
+                                            "literal": '"C"',
+                                            "dtype": {"stringType": {}},
+                                        }
+                                    }
+                                }
+                            },
+                        }
+                    },
+                    "then": {"ref": {"name": "c"}},
+                },
+            ],
+            "otherwise": {
+                "jsonLiteral": {
+                    "literal": '"No Match"',
+                    "dtype": {"stringType": {}},
+                }
+            },
+        }
+    }
+    expected_expr = ParseDict(d, Expr())
+    assert expected_expr == proto_expr, error_message(proto_expr, expected_expr)
+
+    df = pd.DataFrame(
+        {
+            "a": ["p", "q", "r", "t"],
+            "b": ["a", "b", "c", "d"],
+            "c": ["A", "B", "C", "D"],
+        }
+    )
+    ret = expr.eval(df, {"a": str, "b": str, "c": str})
+    assert expr.typeof({"a": str, "b": str, "c": str}) == str
+
+
+def test_dict_op():
+    expr = (F("a").dict.get("x") + F("a").dict.get("y")).num.ceil() + F(
+        "a"
+    ).dict.len()
+    printer = ExprPrinter()
+    expected = (
+        """(CEIL((Ref('a').get("x") + Ref('a').get("y"))) + LEN(Ref('a')))"""
+    )
+    assert expected == printer.print(expr.root)
+    serializer = ExprSerializer()
+    proto_expr = serializer.serialize(expr.root)
+    d = {
+        "binary": {
+            "left": {
+                "mathFn": {
+                    "operand": {
+                        "binary": {
+                            "left": {
+                                "dictFn": {
+                                    "dict": {"ref": {"name": "a"}},
+                                    "fn": {
+                                        "get": {
+                                            "field": {
+                                                "jsonLiteral": {
+                                                    "literal": '"x"',
+                                                    "dtype": {"stringType": {}},
+                                                }
+                                            }
+                                        }
+                                    },
+                                }
+                            },
+                            "right": {
+                                "dictFn": {
+                                    "dict": {"ref": {"name": "a"}},
+                                    "fn": {
+                                        "get": {
+                                            "field": {
+                                                "jsonLiteral": {
+                                                    "literal": '"y"',
+                                                    "dtype": {"stringType": {}},
+                                                }
+                                            }
+                                        }
+                                    },
+                                }
+                            },
+                        }
+                    },
+                    "fn": {"ceil": {}},
+                }
+            },
+            "right": {
+                "dictFn": {"dict": {"ref": {"name": "a"}}, "fn": {"len": {}}}
+            },
+        }
+    }
+    expected_expr = ParseDict(d, Expr())
+    assert expected_expr == proto_expr, error_message(proto_expr, expected_expr)
+    assert expr.typeof({"a": Dict[str, int]}) == int
diff --git a/fennel/expr/test_invalid_expr.py b/fennel/expr/test_invalid_expr.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fennel/expr/visitor.py b/fennel/expr/visitor.py
new file mode 100644
index 000000000..781e4a245
--- /dev/null
+++ b/fennel/expr/visitor.py
@@ -0,0 +1,238 @@
+from typing import List, Tuple
+
+from fennel.expr.expr import (
+    Literal,
+    Ref,
+    Unary,
+    When,
+    Then,
+    Otherwise,
+    Binary,
+    IsNull,
+    FillNull,
+    _Bool,
+    _Dict,
+    _Struct,
+    _List,
+    _Number,
+    _String,
+    InvalidExprException,
+    MathNoop,
+    Round,
+    Ceil,
+    Abs,
+    Floor,
+    StringNoop,
+    StrLen,
+    Lower,
+    Upper,
+    StrContains,
+    DictContains,
+    Concat,
+    DictGet,
+    DictLen,
+    DictNoop,
+)
+
+
+class Visitor(object):
+    def visit(self, obj):
+        if isinstance(obj, Literal):
+            ret = self.visitLiteral(obj)
+
+        elif isinstance(obj, Ref):
+            ret = self.visitRef(obj)
+
+        elif isinstance(obj, Unary):
+            ret = self.visitUnary(obj)
+
+        elif isinstance(obj, Binary):
+            ret = self.visitBinary(obj)
+
+        elif isinstance(obj, IsNull):
+            ret = self.visitIsNull(obj)
+
+        elif isinstance(obj, FillNull):
+            ret = self.visitFillNull(obj)
+
+        elif isinstance(obj, When):
+            ret = self.visitWhen(obj)
+
+        elif isinstance(obj, Then):
+            ret = self.visitThen(obj)
+
+        elif isinstance(obj, Otherwise):
+            ret = self.visitOtherwise(obj)
+
+        elif isinstance(obj, _Number):
+            ret = self.visitNumber(obj)
+
+        elif isinstance(obj, _Dict):
+            ret = self.visitDict(obj)
+
+        elif isinstance(obj, _List):
+            ret = self.visitList(obj)
+
+        elif isinstance(obj, _Struct):
+            ret = self.visitStruct(obj)
+
+        elif isinstance(obj, _String):
+            ret = self.visitString(obj)
+
+        elif isinstance(obj, _Bool):
+            ret = self.visitBool(obj)
+        else:
+            raise InvalidExprException("invalid expression type: %s" % obj)
+
+        return ret
+
+    def visitLiteral(self, obj):
+        raise NotImplementedError
+
+    def visitRef(self, obj):
+        raise NotImplementedError
+
+    def visitUnary(self, obj):
+        raise NotImplementedError
+
+    def visitBinary(self, obj):
+        raise NotImplementedError
+
+    def visitIsNull(self, obj):
+        raise NotImplementedError
+
+    def visitFillNull(self, obj):
+        raise NotImplementedError
+
+    def visitThen(self, obj):
+        raise NotImplementedError
+
+    def visitOtherwise(self, obj):
+        raise NotImplementedError
+
+    def visitNumber(self, obj):
+        raise NotImplementedError
+
+    def visitBool(self, obj):
+        raise NotImplementedError
+
+    def visitString(self, obj):
+        raise NotImplementedError
+
+    def visitDict(self, obj):
+        raise NotImplementedError
+
+    def visitList(self, obj):
+        raise NotImplementedError
+
+    def visitWhen(self, obj):
+        raise NotImplementedError
+
+    def visitStruct(self, obj):
+        raise NotImplementedError
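Visitor centralizes the dispatch, so a new analysis only implements the node types it cares about; ExprPrinter below is one such pass and ExprSerializer is another. As an illustration, a hypothetical visitor (not part of this diff) that collects the columns an expression reads:

    from fennel.expr import F
    from fennel.expr.visitor import Visitor


    class ColumnCollector(Visitor):
        """Gathers the name of every Ref in an expression tree."""

        def __init__(self):
            self.columns = set()

        def visitRef(self, obj):
            # Same private field the serializer reads.
            self.columns.add(obj._col)

        def visitLiteral(self, obj):
            pass  # literals reference no columns

        def visitBinary(self, obj):
            self.visit(obj.left)
            self.visit(obj.right)


    collector = ColumnCollector()
    collector.visit((F("a") + F("b") + 3).root)
    assert collector.columns == {"a", "b"}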
+class ExprPrinter(Visitor):
+
+    def print(self, obj):
+        return self.visit(obj)
+
+    def visitLiteral(self, obj):
+        return obj.c
+
+    def visitRef(self, obj):
+        return str(obj)
+
+    def visitUnary(self, obj):
+        # Unary stores its child as .operand (the serializer relies on the
+        # same attribute).
+        return "%s(%s)" % (obj.op, self.visit(obj.operand))
+
+    def visitBinary(self, obj):
+        return "(%s %s %s)" % (
+            self.visit(obj.left),
+            obj.op,
+            self.visit(obj.right),
+        )
+
+    def visitIsNull(self, obj):
+        return "IS_NULL(%s)" % self.visit(obj.expr)
+
+    def visitFillNull(self, obj):
+        return "FILL_NULL(%s, %s)" % (
+            self.visit(obj.expr),
+            self.visit(obj.fill),
+        )
+
+    def visitWhen(self, obj):
+        cur_when = obj
+        when_then_pairs: List[Tuple[When, Then]] = []
+        while cur_when is not None:
+            if cur_when._then is None:
+                raise InvalidExprException(
+                    f"THEN clause missing for WHEN clause {self.visit(cur_when.expr)}"
+                )
+            when_then_pairs.append((cur_when, cur_when._then))
+            cur_when = cur_when._then._chained_when
+
+        ret = " ".join(
+            [
+                f"WHEN {self.visit(when.expr)} THEN {self.visit(then.expr)}"
+                for when, then in when_then_pairs
+            ]
+        )
+        if when_then_pairs[-1][1]._otherwise is not None:
+            ret += f" ELSE {self.visit(when_then_pairs[-1][1]._otherwise.expr)}"
+        return ret
+
+    def visitThen(self, obj):
+        return f"{self.visit(obj.expr)}"
+
+    def visitOtherwise(self, obj):
+        return f"{self.visit(obj.expr)}"
+
+    def visitBool(self, obj):
+        return f"{self.visit(obj.expr)}"
+
+    def visitNumber(self, obj):
+        if isinstance(obj.op, MathNoop):
+            return self.visit(obj.operand)
+        elif isinstance(obj.op, Floor):
+            return "FLOOR(%s)" % self.visit(obj.operand)
+        elif isinstance(obj.op, Round):
+            return f"ROUND({self.visit(obj.operand)}, {obj.op.precision})"
+        elif isinstance(obj.op, Ceil):
+            return "CEIL(%s)" % self.visit(obj.operand)
+        elif isinstance(obj.op, Abs):
+            return "ABS(%s)" % self.visit(obj.operand)
+        else:
+            raise InvalidExprException("invalid number operation: %s" % obj.op)
+
+    def visitString(self, obj):
+        if isinstance(obj.op, StringNoop):
+            return self.visit(obj.operand)
+        elif isinstance(obj.op, StrLen):
+            return "LEN(%s)" % self.visit(obj.operand)
+        elif isinstance(obj.op, Lower):
+            return "LOWER(%s)" % self.visit(obj.operand)
+        elif isinstance(obj.op, Upper):
+            return "UPPER(%s)" % self.visit(obj.operand)
+        elif isinstance(obj.op, StrContains):
+            return f"CONTAINS({self.visit(obj.operand)}, {self.visit(obj.op.item)})"
+        elif isinstance(obj.op, Concat):
+            return f"{self.visit(obj.operand)} + {self.visit(obj.op.other)}"
+        else:
+            raise InvalidExprException("invalid string operation: %s" % obj.op)
+
+    def visitDict(self, obj):
+        if isinstance(obj.op, DictNoop):
+            return self.visit(obj.expr)
+        elif isinstance(obj.op, DictContains):
+            return (
+                f"CONTAINS({self.visit(obj.expr)}, {self.visit(obj.op.item)})"
+            )
+        elif isinstance(obj.op, DictGet):
+            if obj.op.default is None:
+                return f"{self.visit(obj.expr)}.get({self.visit(obj.op.key)})"
+            else:
+                # The key is printed by the visitor; no extra quotes around it.
+                return f"{self.visit(obj.expr)}.get({self.visit(obj.op.key)}, {self.visit(obj.op.default)})"
+        elif isinstance(obj.op, DictLen):
+            return f"LEN({self.visit(obj.expr)})"
+        else:
+            raise InvalidExprException("invalid dict operation: %s" % obj.op)
diff --git a/fennel/gen/dataset_pb2.py b/fennel/gen/dataset_pb2.py
index a53669565..e6f8c47ed 100644
--- a/fennel/gen/dataset_pb2.py
+++ b/fennel/gen/dataset_pb2.py
@@ -17,9 +17,10 @@
 import fennel.gen.schema_pb2 as schema__pb2
 import fennel.gen.spec_pb2 as spec__pb2
 import fennel.gen.window_pb2 as window__pb2
+import fennel.gen.expr_pb2 as expr__pb2
-DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\rdataset.proto\x12\x14\x66\x65nnel.proto.dataset\x1a\x1egoogle/protobuf/duration.proto\x1a\x0emetadata.proto\x1a\x0cpycode.proto\x1a\x0cschema.proto\x1a\nspec.proto\x1a\x0cwindow.proto\"\xe5\x03\n\x0b\x43oreDataset\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x31\n\x08metadata\x18\x02 \x01(\x0b\x32\x1f.fennel.proto.metadata.Metadata\x12/\n\x08\x64sschema\x18\x03 \x01(\x0b\x32\x1d.fennel.proto.schema.DSSchema\x12*\n\x07history\x18\x04 \x01(\x0b\x32\x19.google.protobuf.Duration\x12,\n\tretention\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\x12L\n\x0e\x66ield_metadata\x18\x06 \x03(\x0b\x32\x34.fennel.proto.dataset.CoreDataset.FieldMetadataEntry\x12+\n\x06pycode\x18\x07 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x19\n\x11is_source_dataset\x18\x08 \x01(\x08\x12\x0f\n\x07version\x18\t \x01(\r\x12\x0c\n\x04tags\x18\n \x03(\t\x1aU\n\x12\x46ieldMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12.\n\x05value\x18\x02 \x01(\x0b\x32\x1f.fennel.proto.metadata.Metadata:\x02\x38\x01\"Q\n\x08OnDemand\x12\x1c\n\x14\x66unction_source_code\x18\x01 \x01(\t\x12\x10\n\x08\x66unction\x18\x02 \x01(\x0c\x12\x15\n\rexpires_after\x18\x03 \x01(\x03\"\xd2\x01\n\x08Pipeline\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0c\x64\x61taset_name\x18\x02 \x01(\t\x12\x11\n\tsignature\x18\x03 \x01(\t\x12\x31\n\x08metadata\x18\x04 \x01(\x0b\x32\x1f.fennel.proto.metadata.Metadata\x12\x1b\n\x13input_dataset_names\x18\x05 \x03(\t\x12\x12\n\nds_version\x18\x06 \x01(\r\x12+\n\x06pycode\x18\x07 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\"\x9d\x07\n\x08Operator\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07is_root\x18\x02 \x01(\x08\x12\x15\n\rpipeline_name\x18\x03 \x01(\t\x12\x14\n\x0c\x64\x61taset_name\x18\x04 \x01(\t\x12\x12\n\nds_version\x18\x14 \x01(\r\x12\x34\n\taggregate\x18\x05 \x01(\x0b\x32\x1f.fennel.proto.dataset.AggregateH\x00\x12*\n\x04join\x18\x06 \x01(\x0b\x32\x1a.fennel.proto.dataset.JoinH\x00\x12\x34\n\ttransform\x18\x07 \x01(\x0b\x32\x1f.fennel.proto.dataset.TransformH\x00\x12,\n\x05union\x18\x08 \x01(\x0b\x32\x1b.fennel.proto.dataset.UnionH\x00\x12.\n\x06\x66ilter\x18\t \x01(\x0b\x32\x1c.fennel.proto.dataset.FilterH\x00\x12\x37\n\x0b\x64\x61taset_ref\x18\n \x01(\x0b\x32 .fennel.proto.dataset.DatasetRefH\x00\x12.\n\x06rename\x18\x0c \x01(\x0b\x32\x1c.fennel.proto.dataset.RenameH\x00\x12*\n\x04\x64rop\x18\r \x01(\x0b\x32\x1a.fennel.proto.dataset.DropH\x00\x12\x30\n\x07\x65xplode\x18\x0e \x01(\x0b\x32\x1d.fennel.proto.dataset.ExplodeH\x00\x12,\n\x05\x64\x65\x64up\x18\x0f \x01(\x0b\x32\x1b.fennel.proto.dataset.DedupH\x00\x12,\n\x05\x66irst\x18\x10 \x01(\x0b\x32\x1b.fennel.proto.dataset.FirstH\x00\x12.\n\x06\x61ssign\x18\x11 \x01(\x0b\x32\x1c.fennel.proto.dataset.AssignH\x00\x12\x32\n\x08\x64ropnull\x18\x12 \x01(\x0b\x32\x1e.fennel.proto.dataset.DropnullH\x00\x12:\n\x06window\x18\x13 \x01(\x0b\x32(.fennel.proto.dataset.WindowOperatorKindH\x00\x12.\n\x06latest\x18\x15 \x01(\x0b\x32\x1c.fennel.proto.dataset.LatestH\x00\x12\x34\n\tchangelog\x18\x16 \x01(\x0b\x32\x1f.fennel.proto.dataset.ChangelogH\x00\x12\x0c\n\x04name\x18\x0b \x01(\tB\x06\n\x04kind\"\xf7\x01\n\tAggregate\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0c\n\x04keys\x18\x02 \x03(\t\x12)\n\x05specs\x18\x03 \x03(\x0b\x32\x1a.fennel.proto.spec.PreSpec\x12\x12\n\x05\x61long\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x43\n\remit_strategy\x18\x06 \x01(\x0e\x32,.fennel.proto.dataset.Aggregate.EmitStrategy\x12\x14\n\x0coperand_name\x18\x04 
\x01(\t\"$\n\x0c\x45mitStrategy\x12\t\n\x05\x45\x61ger\x10\x00\x12\t\n\x05\x46inal\x10\x01\x42\x08\n\x06_along\"\xa2\x03\n\x04Join\x12\x16\n\x0elhs_operand_id\x18\x01 \x01(\t\x12\x1c\n\x14rhs_dsref_operand_id\x18\x02 \x01(\t\x12.\n\x02on\x18\x03 \x03(\x0b\x32\".fennel.proto.dataset.Join.OnEntry\x12\x32\n\nwithin_low\x18\x06 \x01(\x0b\x32\x19.google.protobuf.DurationH\x00\x88\x01\x01\x12\x33\n\x0bwithin_high\x18\x07 \x01(\x0b\x32\x19.google.protobuf.DurationH\x01\x88\x01\x01\x12\x18\n\x10lhs_operand_name\x18\x04 \x01(\t\x12\x1e\n\x16rhs_dsref_operand_name\x18\x05 \x01(\t\x12+\n\x03how\x18\x08 \x01(\x0e\x32\x1e.fennel.proto.dataset.Join.How\x1a)\n\x07OnEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x1a\n\x03How\x12\x08\n\x04Left\x10\x00\x12\t\n\x05Inner\x10\x01\x42\r\n\x0b_within_lowB\x0e\n\x0c_within_high\"\xed\x01\n\tTransform\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12;\n\x06schema\x18\x02 \x03(\x0b\x32+.fennel.proto.dataset.Transform.SchemaEntry\x12+\n\x06pycode\x18\x03 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x14\n\x0coperand_name\x18\x04 \x01(\t\x1aL\n\x0bSchemaEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType:\x02\x38\x01\"_\n\x06\x46ilter\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12+\n\x06pycode\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"\xa8\x01\n\x06\x41ssign\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12+\n\x06pycode\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x13\n\x0b\x63olumn_name\x18\x03 \x01(\t\x12\x32\n\x0boutput_type\x18\x04 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12\x14\n\x0coperand_name\x18\x05 \x01(\t\"E\n\x08\x44ropnull\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"B\n\x04\x44rop\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x10\n\x08\x64ropcols\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"\xa5\x01\n\x06Rename\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12?\n\ncolumn_map\x18\x02 \x03(\x0b\x32+.fennel.proto.dataset.Rename.ColumnMapEntry\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\x1a\x30\n\x0e\x43olumnMapEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"3\n\x05Union\x12\x13\n\x0boperand_ids\x18\x01 \x03(\t\x12\x15\n\roperand_names\x18\x02 \x03(\t\"B\n\x05\x44\x65\x64up\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"D\n\x07\x45xplode\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"=\n\x05\x46irst\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\n\n\x02\x62y\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\">\n\x06Latest\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\n\n\x02\x62y\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"L\n\tChangelog\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x15\n\rdelete_column\x18\x02 \x01(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"\xcb\x01\n\x12WindowOperatorKind\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x30\n\x0bwindow_type\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.window.Window\x12\n\n\x02\x62y\x18\x03 \x03(\t\x12\r\n\x05\x66ield\x18\x04 \x01(\t\x12\x32\n\x07summary\x18\x06 \x01(\x0b\x32\x1c.fennel.proto.window.SummaryH\x00\x88\x01\x01\x12\x14\n\x0coperand_name\x18\x05 \x01(\tB\n\n\x08_summary\",\n\nDatasetRef\x12\x1e\n\x16referring_dataset_name\x18\x01 
\x01(\t\"\x80\x02\n\x08\x44\x61taflow\x12\x16\n\x0c\x64\x61taset_name\x18\x01 \x01(\tH\x00\x12L\n\x11pipeline_dataflow\x18\x02 \x01(\x0b\x32/.fennel.proto.dataset.Dataflow.PipelineDataflowH\x00\x12\x0c\n\x04tags\x18\x03 \x03(\t\x1ax\n\x10PipelineDataflow\x12\x14\n\x0c\x64\x61taset_name\x18\x01 \x01(\t\x12\x15\n\rpipeline_name\x18\x02 \x01(\t\x12\x37\n\x0finput_dataflows\x18\x03 \x03(\x0b\x32\x1e.fennel.proto.dataset.DataflowB\x06\n\x04kind\"\x9c\x01\n\x10PipelineLineages\x12\x14\n\x0c\x64\x61taset_name\x18\x01 \x01(\t\x12\x15\n\rpipeline_name\x18\x02 \x01(\t\x12=\n\x0einput_datasets\x18\x03 \x03(\x0b\x32%.fennel.proto.dataset.DatasetLineages\x12\x0e\n\x06\x61\x63tive\x18\x04 \x01(\x08\x12\x0c\n\x04tags\x18\x05 \x03(\t\"\\\n\x17\x44\x61tasetPipelineLineages\x12\x41\n\x11pipeline_lineages\x18\x02 \x03(\x0b\x32&.fennel.proto.dataset.PipelineLineages\"\x8b\x01\n\x0f\x44\x61tasetLineages\x12\x18\n\x0esource_dataset\x18\x01 \x01(\tH\x00\x12H\n\x0f\x64\x65rived_dataset\x18\x02 \x01(\x0b\x32-.fennel.proto.dataset.DatasetPipelineLineagesH\x00\x12\x0c\n\x04tags\x18\x03 \x03(\tB\x06\n\x04kindb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rdataset.proto\x12\x14\x66\x65nnel.proto.dataset\x1a\x1egoogle/protobuf/duration.proto\x1a\x0emetadata.proto\x1a\x0cpycode.proto\x1a\x0cschema.proto\x1a\nspec.proto\x1a\x0cwindow.proto\x1a\nexpr.proto\"\xe5\x03\n\x0b\x43oreDataset\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x31\n\x08metadata\x18\x02 \x01(\x0b\x32\x1f.fennel.proto.metadata.Metadata\x12/\n\x08\x64sschema\x18\x03 \x01(\x0b\x32\x1d.fennel.proto.schema.DSSchema\x12*\n\x07history\x18\x04 \x01(\x0b\x32\x19.google.protobuf.Duration\x12,\n\tretention\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\x12L\n\x0e\x66ield_metadata\x18\x06 \x03(\x0b\x32\x34.fennel.proto.dataset.CoreDataset.FieldMetadataEntry\x12+\n\x06pycode\x18\x07 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x19\n\x11is_source_dataset\x18\x08 \x01(\x08\x12\x0f\n\x07version\x18\t \x01(\r\x12\x0c\n\x04tags\x18\n \x03(\t\x1aU\n\x12\x46ieldMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12.\n\x05value\x18\x02 \x01(\x0b\x32\x1f.fennel.proto.metadata.Metadata:\x02\x38\x01\"Q\n\x08OnDemand\x12\x1c\n\x14\x66unction_source_code\x18\x01 \x01(\t\x12\x10\n\x08\x66unction\x18\x02 \x01(\x0c\x12\x15\n\rexpires_after\x18\x03 \x01(\x03\"\xd2\x01\n\x08Pipeline\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0c\x64\x61taset_name\x18\x02 \x01(\t\x12\x11\n\tsignature\x18\x03 \x01(\t\x12\x31\n\x08metadata\x18\x04 \x01(\x0b\x32\x1f.fennel.proto.metadata.Metadata\x12\x1b\n\x13input_dataset_names\x18\x05 \x03(\t\x12\x12\n\nds_version\x18\x06 \x01(\r\x12+\n\x06pycode\x18\x07 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\"\x8f\x08\n\x08Operator\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0f\n\x07is_root\x18\x02 \x01(\x08\x12\x15\n\rpipeline_name\x18\x03 \x01(\t\x12\x14\n\x0c\x64\x61taset_name\x18\x04 \x01(\t\x12\x12\n\nds_version\x18\x14 \x01(\r\x12\x34\n\taggregate\x18\x05 \x01(\x0b\x32\x1f.fennel.proto.dataset.AggregateH\x00\x12*\n\x04join\x18\x06 \x01(\x0b\x32\x1a.fennel.proto.dataset.JoinH\x00\x12\x34\n\ttransform\x18\x07 \x01(\x0b\x32\x1f.fennel.proto.dataset.TransformH\x00\x12,\n\x05union\x18\x08 \x01(\x0b\x32\x1b.fennel.proto.dataset.UnionH\x00\x12.\n\x06\x66ilter\x18\t \x01(\x0b\x32\x1c.fennel.proto.dataset.FilterH\x00\x12\x37\n\x0b\x64\x61taset_ref\x18\n \x01(\x0b\x32 .fennel.proto.dataset.DatasetRefH\x00\x12.\n\x06rename\x18\x0c \x01(\x0b\x32\x1c.fennel.proto.dataset.RenameH\x00\x12*\n\x04\x64rop\x18\r 
\x01(\x0b\x32\x1a.fennel.proto.dataset.DropH\x00\x12\x30\n\x07\x65xplode\x18\x0e \x01(\x0b\x32\x1d.fennel.proto.dataset.ExplodeH\x00\x12,\n\x05\x64\x65\x64up\x18\x0f \x01(\x0b\x32\x1b.fennel.proto.dataset.DedupH\x00\x12,\n\x05\x66irst\x18\x10 \x01(\x0b\x32\x1b.fennel.proto.dataset.FirstH\x00\x12.\n\x06\x61ssign\x18\x11 \x01(\x0b\x32\x1c.fennel.proto.dataset.AssignH\x00\x12\x32\n\x08\x64ropnull\x18\x12 \x01(\x0b\x32\x1e.fennel.proto.dataset.DropnullH\x00\x12:\n\x06window\x18\x13 \x01(\x0b\x32(.fennel.proto.dataset.WindowOperatorKindH\x00\x12.\n\x06latest\x18\x15 \x01(\x0b\x32\x1c.fennel.proto.dataset.LatestH\x00\x12\x34\n\tchangelog\x18\x16 \x01(\x0b\x32\x1f.fennel.proto.dataset.ChangelogH\x00\x12\x37\n\x0b\x61ssign_expr\x18\x17 \x01(\x0b\x32 .fennel.proto.dataset.AssignExprH\x00\x12\x37\n\x0b\x66ilter_expr\x18\x18 \x01(\x0b\x32 .fennel.proto.dataset.FilterExprH\x00\x12\x0c\n\x04name\x18\x0b \x01(\tB\x06\n\x04kind\"\xf7\x01\n\tAggregate\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0c\n\x04keys\x18\x02 \x03(\t\x12)\n\x05specs\x18\x03 \x03(\x0b\x32\x1a.fennel.proto.spec.PreSpec\x12\x12\n\x05\x61long\x18\x05 \x01(\tH\x00\x88\x01\x01\x12\x43\n\remit_strategy\x18\x06 \x01(\x0e\x32,.fennel.proto.dataset.Aggregate.EmitStrategy\x12\x14\n\x0coperand_name\x18\x04 \x01(\t\"$\n\x0c\x45mitStrategy\x12\t\n\x05\x45\x61ger\x10\x00\x12\t\n\x05\x46inal\x10\x01\x42\x08\n\x06_along\"\xa2\x03\n\x04Join\x12\x16\n\x0elhs_operand_id\x18\x01 \x01(\t\x12\x1c\n\x14rhs_dsref_operand_id\x18\x02 \x01(\t\x12.\n\x02on\x18\x03 \x03(\x0b\x32\".fennel.proto.dataset.Join.OnEntry\x12\x32\n\nwithin_low\x18\x06 \x01(\x0b\x32\x19.google.protobuf.DurationH\x00\x88\x01\x01\x12\x33\n\x0bwithin_high\x18\x07 \x01(\x0b\x32\x19.google.protobuf.DurationH\x01\x88\x01\x01\x12\x18\n\x10lhs_operand_name\x18\x04 \x01(\t\x12\x1e\n\x16rhs_dsref_operand_name\x18\x05 \x01(\t\x12+\n\x03how\x18\x08 \x01(\x0e\x32\x1e.fennel.proto.dataset.Join.How\x1a)\n\x07OnEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x1a\n\x03How\x12\x08\n\x04Left\x10\x00\x12\t\n\x05Inner\x10\x01\x42\r\n\x0b_within_lowB\x0e\n\x0c_within_high\"\xed\x01\n\tTransform\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12;\n\x06schema\x18\x02 \x03(\x0b\x32+.fennel.proto.dataset.Transform.SchemaEntry\x12+\n\x06pycode\x18\x03 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x14\n\x0coperand_name\x18\x04 \x01(\t\x1aL\n\x0bSchemaEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType:\x02\x38\x01\"]\n\nFilterExpr\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12%\n\x04\x65xpr\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"_\n\x06\x46ilter\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12+\n\x06pycode\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"\xa8\x01\n\x06\x41ssign\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12+\n\x06pycode\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x13\n\x0b\x63olumn_name\x18\x03 \x01(\t\x12\x32\n\x0boutput_type\x18\x04 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12\x14\n\x0coperand_name\x18\x05 \x01(\t\"\xd5\x02\n\nAssignExpr\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12:\n\x05\x65xprs\x18\x02 \x03(\x0b\x32+.fennel.proto.dataset.AssignExpr.ExprsEntry\x12G\n\x0coutput_types\x18\x03 \x03(\x0b\x32\x31.fennel.proto.dataset.AssignExpr.OutputTypesEntry\x12\x14\n\x0coperand_name\x18\x05 \x01(\t\x1a\x45\n\nExprsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12&\n\x05value\x18\x02 
\x01(\x0b\x32\x17.fennel.proto.expr.Expr:\x02\x38\x01\x1aQ\n\x10OutputTypesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType:\x02\x38\x01\"E\n\x08\x44ropnull\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"B\n\x04\x44rop\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x10\n\x08\x64ropcols\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"\xa5\x01\n\x06Rename\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12?\n\ncolumn_map\x18\x02 \x03(\x0b\x32+.fennel.proto.dataset.Rename.ColumnMapEntry\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\x1a\x30\n\x0e\x43olumnMapEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"3\n\x05Union\x12\x13\n\x0boperand_ids\x18\x01 \x03(\t\x12\x15\n\roperand_names\x18\x02 \x03(\t\"B\n\x05\x44\x65\x64up\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"D\n\x07\x45xplode\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"=\n\x05\x46irst\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\n\n\x02\x62y\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\">\n\x06Latest\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\n\n\x02\x62y\x18\x02 \x03(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"L\n\tChangelog\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x15\n\rdelete_column\x18\x02 \x01(\t\x12\x14\n\x0coperand_name\x18\x03 \x01(\t\"\xcb\x01\n\x12WindowOperatorKind\x12\x12\n\noperand_id\x18\x01 \x01(\t\x12\x30\n\x0bwindow_type\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.window.Window\x12\n\n\x02\x62y\x18\x03 \x03(\t\x12\r\n\x05\x66ield\x18\x04 \x01(\t\x12\x32\n\x07summary\x18\x06 \x01(\x0b\x32\x1c.fennel.proto.window.SummaryH\x00\x88\x01\x01\x12\x14\n\x0coperand_name\x18\x05 \x01(\tB\n\n\x08_summary\",\n\nDatasetRef\x12\x1e\n\x16referring_dataset_name\x18\x01 \x01(\t\"\x80\x02\n\x08\x44\x61taflow\x12\x16\n\x0c\x64\x61taset_name\x18\x01 \x01(\tH\x00\x12L\n\x11pipeline_dataflow\x18\x02 \x01(\x0b\x32/.fennel.proto.dataset.Dataflow.PipelineDataflowH\x00\x12\x0c\n\x04tags\x18\x03 \x03(\t\x1ax\n\x10PipelineDataflow\x12\x14\n\x0c\x64\x61taset_name\x18\x01 \x01(\t\x12\x15\n\rpipeline_name\x18\x02 \x01(\t\x12\x37\n\x0finput_dataflows\x18\x03 \x03(\x0b\x32\x1e.fennel.proto.dataset.DataflowB\x06\n\x04kind\"\x9c\x01\n\x10PipelineLineages\x12\x14\n\x0c\x64\x61taset_name\x18\x01 \x01(\t\x12\x15\n\rpipeline_name\x18\x02 \x01(\t\x12=\n\x0einput_datasets\x18\x03 \x03(\x0b\x32%.fennel.proto.dataset.DatasetLineages\x12\x0e\n\x06\x61\x63tive\x18\x04 \x01(\x08\x12\x0c\n\x04tags\x18\x05 \x03(\t\"\\\n\x17\x44\x61tasetPipelineLineages\x12\x41\n\x11pipeline_lineages\x18\x02 \x03(\x0b\x32&.fennel.proto.dataset.PipelineLineages\"\x8b\x01\n\x0f\x44\x61tasetLineages\x12\x18\n\x0esource_dataset\x18\x01 \x01(\tH\x00\x12H\n\x0f\x64\x65rived_dataset\x18\x02 \x01(\x0b\x32-.fennel.proto.dataset.DatasetPipelineLineagesH\x00\x12\x0c\n\x04tags\x18\x03 \x03(\tB\x06\n\x04kindb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -32,68 +33,80 @@ _globals['_JOIN_ONENTRY']._serialized_options = b'8\001' _globals['_TRANSFORM_SCHEMAENTRY']._options = None _globals['_TRANSFORM_SCHEMAENTRY']._serialized_options = b'8\001' + _globals['_ASSIGNEXPR_EXPRSENTRY']._options = None + _globals['_ASSIGNEXPR_EXPRSENTRY']._serialized_options = b'8\001' + _globals['_ASSIGNEXPR_OUTPUTTYPESENTRY']._options = None + 
_globals['_ASSIGNEXPR_OUTPUTTYPESENTRY']._serialized_options = b'8\001' _globals['_RENAME_COLUMNMAPENTRY']._options = None _globals['_RENAME_COLUMNMAPENTRY']._serialized_options = b'8\001' - _globals['_COREDATASET']._serialized_start=142 - _globals['_COREDATASET']._serialized_end=627 - _globals['_COREDATASET_FIELDMETADATAENTRY']._serialized_start=542 - _globals['_COREDATASET_FIELDMETADATAENTRY']._serialized_end=627 - _globals['_ONDEMAND']._serialized_start=629 - _globals['_ONDEMAND']._serialized_end=710 - _globals['_PIPELINE']._serialized_start=713 - _globals['_PIPELINE']._serialized_end=923 - _globals['_OPERATOR']._serialized_start=926 - _globals['_OPERATOR']._serialized_end=1851 - _globals['_AGGREGATE']._serialized_start=1854 - _globals['_AGGREGATE']._serialized_end=2101 - _globals['_AGGREGATE_EMITSTRATEGY']._serialized_start=2055 - _globals['_AGGREGATE_EMITSTRATEGY']._serialized_end=2091 - _globals['_JOIN']._serialized_start=2104 - _globals['_JOIN']._serialized_end=2522 - _globals['_JOIN_ONENTRY']._serialized_start=2422 - _globals['_JOIN_ONENTRY']._serialized_end=2463 - _globals['_JOIN_HOW']._serialized_start=2465 - _globals['_JOIN_HOW']._serialized_end=2491 - _globals['_TRANSFORM']._serialized_start=2525 - _globals['_TRANSFORM']._serialized_end=2762 - _globals['_TRANSFORM_SCHEMAENTRY']._serialized_start=2686 - _globals['_TRANSFORM_SCHEMAENTRY']._serialized_end=2762 - _globals['_FILTER']._serialized_start=2764 - _globals['_FILTER']._serialized_end=2859 - _globals['_ASSIGN']._serialized_start=2862 - _globals['_ASSIGN']._serialized_end=3030 - _globals['_DROPNULL']._serialized_start=3032 - _globals['_DROPNULL']._serialized_end=3101 - _globals['_DROP']._serialized_start=3103 - _globals['_DROP']._serialized_end=3169 - _globals['_RENAME']._serialized_start=3172 - _globals['_RENAME']._serialized_end=3337 - _globals['_RENAME_COLUMNMAPENTRY']._serialized_start=3289 - _globals['_RENAME_COLUMNMAPENTRY']._serialized_end=3337 - _globals['_UNION']._serialized_start=3339 - _globals['_UNION']._serialized_end=3390 - _globals['_DEDUP']._serialized_start=3392 - _globals['_DEDUP']._serialized_end=3458 - _globals['_EXPLODE']._serialized_start=3460 - _globals['_EXPLODE']._serialized_end=3528 - _globals['_FIRST']._serialized_start=3530 - _globals['_FIRST']._serialized_end=3591 - _globals['_LATEST']._serialized_start=3593 - _globals['_LATEST']._serialized_end=3655 - _globals['_CHANGELOG']._serialized_start=3657 - _globals['_CHANGELOG']._serialized_end=3733 - _globals['_WINDOWOPERATORKIND']._serialized_start=3736 - _globals['_WINDOWOPERATORKIND']._serialized_end=3939 - _globals['_DATASETREF']._serialized_start=3941 - _globals['_DATASETREF']._serialized_end=3985 - _globals['_DATAFLOW']._serialized_start=3988 - _globals['_DATAFLOW']._serialized_end=4244 - _globals['_DATAFLOW_PIPELINEDATAFLOW']._serialized_start=4116 - _globals['_DATAFLOW_PIPELINEDATAFLOW']._serialized_end=4236 - _globals['_PIPELINELINEAGES']._serialized_start=4247 - _globals['_PIPELINELINEAGES']._serialized_end=4403 - _globals['_DATASETPIPELINELINEAGES']._serialized_start=4405 - _globals['_DATASETPIPELINELINEAGES']._serialized_end=4497 - _globals['_DATASETLINEAGES']._serialized_start=4500 - _globals['_DATASETLINEAGES']._serialized_end=4639 + _globals['_COREDATASET']._serialized_start=154 + _globals['_COREDATASET']._serialized_end=639 + _globals['_COREDATASET_FIELDMETADATAENTRY']._serialized_start=554 + _globals['_COREDATASET_FIELDMETADATAENTRY']._serialized_end=639 + _globals['_ONDEMAND']._serialized_start=641 + 
_globals['_ONDEMAND']._serialized_end=722 + _globals['_PIPELINE']._serialized_start=725 + _globals['_PIPELINE']._serialized_end=935 + _globals['_OPERATOR']._serialized_start=938 + _globals['_OPERATOR']._serialized_end=1977 + _globals['_AGGREGATE']._serialized_start=1980 + _globals['_AGGREGATE']._serialized_end=2227 + _globals['_AGGREGATE_EMITSTRATEGY']._serialized_start=2181 + _globals['_AGGREGATE_EMITSTRATEGY']._serialized_end=2217 + _globals['_JOIN']._serialized_start=2230 + _globals['_JOIN']._serialized_end=2648 + _globals['_JOIN_ONENTRY']._serialized_start=2548 + _globals['_JOIN_ONENTRY']._serialized_end=2589 + _globals['_JOIN_HOW']._serialized_start=2591 + _globals['_JOIN_HOW']._serialized_end=2617 + _globals['_TRANSFORM']._serialized_start=2651 + _globals['_TRANSFORM']._serialized_end=2888 + _globals['_TRANSFORM_SCHEMAENTRY']._serialized_start=2812 + _globals['_TRANSFORM_SCHEMAENTRY']._serialized_end=2888 + _globals['_FILTEREXPR']._serialized_start=2890 + _globals['_FILTEREXPR']._serialized_end=2983 + _globals['_FILTER']._serialized_start=2985 + _globals['_FILTER']._serialized_end=3080 + _globals['_ASSIGN']._serialized_start=3083 + _globals['_ASSIGN']._serialized_end=3251 + _globals['_ASSIGNEXPR']._serialized_start=3254 + _globals['_ASSIGNEXPR']._serialized_end=3595 + _globals['_ASSIGNEXPR_EXPRSENTRY']._serialized_start=3443 + _globals['_ASSIGNEXPR_EXPRSENTRY']._serialized_end=3512 + _globals['_ASSIGNEXPR_OUTPUTTYPESENTRY']._serialized_start=3514 + _globals['_ASSIGNEXPR_OUTPUTTYPESENTRY']._serialized_end=3595 + _globals['_DROPNULL']._serialized_start=3597 + _globals['_DROPNULL']._serialized_end=3666 + _globals['_DROP']._serialized_start=3668 + _globals['_DROP']._serialized_end=3734 + _globals['_RENAME']._serialized_start=3737 + _globals['_RENAME']._serialized_end=3902 + _globals['_RENAME_COLUMNMAPENTRY']._serialized_start=3854 + _globals['_RENAME_COLUMNMAPENTRY']._serialized_end=3902 + _globals['_UNION']._serialized_start=3904 + _globals['_UNION']._serialized_end=3955 + _globals['_DEDUP']._serialized_start=3957 + _globals['_DEDUP']._serialized_end=4023 + _globals['_EXPLODE']._serialized_start=4025 + _globals['_EXPLODE']._serialized_end=4093 + _globals['_FIRST']._serialized_start=4095 + _globals['_FIRST']._serialized_end=4156 + _globals['_LATEST']._serialized_start=4158 + _globals['_LATEST']._serialized_end=4220 + _globals['_CHANGELOG']._serialized_start=4222 + _globals['_CHANGELOG']._serialized_end=4298 + _globals['_WINDOWOPERATORKIND']._serialized_start=4301 + _globals['_WINDOWOPERATORKIND']._serialized_end=4504 + _globals['_DATASETREF']._serialized_start=4506 + _globals['_DATASETREF']._serialized_end=4550 + _globals['_DATAFLOW']._serialized_start=4553 + _globals['_DATAFLOW']._serialized_end=4809 + _globals['_DATAFLOW_PIPELINEDATAFLOW']._serialized_start=4681 + _globals['_DATAFLOW_PIPELINEDATAFLOW']._serialized_end=4801 + _globals['_PIPELINELINEAGES']._serialized_start=4812 + _globals['_PIPELINELINEAGES']._serialized_end=4968 + _globals['_DATASETPIPELINELINEAGES']._serialized_start=4970 + _globals['_DATASETPIPELINELINEAGES']._serialized_end=5062 + _globals['_DATASETLINEAGES']._serialized_start=5065 + _globals['_DATASETLINEAGES']._serialized_end=5204 # @@protoc_insertion_point(module_scope) diff --git a/fennel/gen/dataset_pb2.pyi b/fennel/gen/dataset_pb2.pyi index b1eeb1c89..384699764 100644 --- a/fennel/gen/dataset_pb2.pyi +++ b/fennel/gen/dataset_pb2.pyi @@ -4,6 +4,7 @@ isort:skip_file """ import builtins import collections.abc +import expr_pb2 import google.protobuf.descriptor 
import google.protobuf.duration_pb2 import google.protobuf.internal.containers @@ -186,6 +187,8 @@ class Operator(google.protobuf.message.Message): WINDOW_FIELD_NUMBER: builtins.int LATEST_FIELD_NUMBER: builtins.int CHANGELOG_FIELD_NUMBER: builtins.int + ASSIGN_EXPR_FIELD_NUMBER: builtins.int + FILTER_EXPR_FIELD_NUMBER: builtins.int NAME_FIELD_NUMBER: builtins.int id: builtins.str """Every operator has an ID assigned by the client""" @@ -230,6 +233,10 @@ class Operator(google.protobuf.message.Message): def latest(self) -> global___Latest: ... @property def changelog(self) -> global___Changelog: ... + @property + def assign_expr(self) -> global___AssignExpr: ... + @property + def filter_expr(self) -> global___FilterExpr: ... name: builtins.str """NOTE: FOLLOWING PROPERTIES ARE SET BY THE SERVER AND WILL BE IGNORED BY THE CLIENT @@ -260,11 +267,13 @@ class Operator(google.protobuf.message.Message): window: global___WindowOperatorKind | None = ..., latest: global___Latest | None = ..., changelog: global___Changelog | None = ..., + assign_expr: global___AssignExpr | None = ..., + filter_expr: global___FilterExpr | None = ..., name: builtins.str = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["aggregate", b"aggregate", "assign", b"assign", "changelog", b"changelog", "dataset_ref", b"dataset_ref", "dedup", b"dedup", "drop", b"drop", "dropnull", b"dropnull", "explode", b"explode", "filter", b"filter", "first", b"first", "join", b"join", "kind", b"kind", "latest", b"latest", "rename", b"rename", "transform", b"transform", "union", b"union", "window", b"window"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["aggregate", b"aggregate", "assign", b"assign", "changelog", b"changelog", "dataset_name", b"dataset_name", "dataset_ref", b"dataset_ref", "dedup", b"dedup", "drop", b"drop", "dropnull", b"dropnull", "ds_version", b"ds_version", "explode", b"explode", "filter", b"filter", "first", b"first", "id", b"id", "is_root", b"is_root", "join", b"join", "kind", b"kind", "latest", b"latest", "name", b"name", "pipeline_name", b"pipeline_name", "rename", b"rename", "transform", b"transform", "union", b"union", "window", b"window"]) -> None: ... - def WhichOneof(self, oneof_group: typing_extensions.Literal["kind", b"kind"]) -> typing_extensions.Literal["aggregate", "join", "transform", "union", "filter", "dataset_ref", "rename", "drop", "explode", "dedup", "first", "assign", "dropnull", "window", "latest", "changelog"] | None: ... + def HasField(self, field_name: typing_extensions.Literal["aggregate", b"aggregate", "assign", b"assign", "assign_expr", b"assign_expr", "changelog", b"changelog", "dataset_ref", b"dataset_ref", "dedup", b"dedup", "drop", b"drop", "dropnull", b"dropnull", "explode", b"explode", "filter", b"filter", "filter_expr", b"filter_expr", "first", b"first", "join", b"join", "kind", b"kind", "latest", b"latest", "rename", b"rename", "transform", b"transform", "union", b"union", "window", b"window"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["aggregate", b"aggregate", "assign", b"assign", "assign_expr", b"assign_expr", "changelog", b"changelog", "dataset_name", b"dataset_name", "dataset_ref", b"dataset_ref", "dedup", b"dedup", "drop", b"drop", "dropnull", b"dropnull", "ds_version", b"ds_version", "explode", b"explode", "filter", b"filter", "filter_expr", b"filter_expr", "first", b"first", "id", b"id", "is_root", b"is_root", "join", b"join", "kind", b"kind", "latest", b"latest", "name", b"name", "pipeline_name", b"pipeline_name", "rename", b"rename", "transform", b"transform", "union", b"union", "window", b"window"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["kind", b"kind"]) -> typing_extensions.Literal["aggregate", "join", "transform", "union", "filter", "dataset_ref", "rename", "drop", "explode", "dedup", "first", "assign", "dropnull", "window", "latest", "changelog", "assign_expr", "filter_expr"] | None: ... global___Operator = Operator @@ -444,6 +453,32 @@ class Transform(google.protobuf.message.Message): global___Transform = Transform +@typing_extensions.final +class FilterExpr(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OPERAND_ID_FIELD_NUMBER: builtins.int + EXPR_FIELD_NUMBER: builtins.int + OPERAND_NAME_FIELD_NUMBER: builtins.int + operand_id: builtins.str + @property + def expr(self) -> expr_pb2.Expr: ... + operand_name: builtins.str + """NOTE: FOLLOWING PROPERTIES ARE SET BY THE SERVER AND WILL BE IGNORED BY + THE CLIENT + """ + def __init__( + self, + *, + operand_id: builtins.str = ..., + expr: expr_pb2.Expr | None = ..., + operand_name: builtins.str = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["expr", b"expr"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["expr", b"expr", "operand_id", b"operand_id", "operand_name", b"operand_name"]) -> None: ... + +global___FilterExpr = FilterExpr + @typing_extensions.final class Filter(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -503,6 +538,71 @@ class Assign(google.protobuf.message.Message): global___Assign = Assign +@typing_extensions.final +class AssignExpr(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class ExprsEntry(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + key: builtins.str + @property + def value(self) -> expr_pb2.Expr: ... + def __init__( + self, + *, + key: builtins.str = ..., + value: expr_pb2.Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["value", b"value"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"]) -> None: ... + + @typing_extensions.final + class OutputTypesEntry(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + VALUE_FIELD_NUMBER: builtins.int + key: builtins.str + @property + def value(self) -> schema_pb2.DataType: ... + def __init__( + self, + *, + key: builtins.str = ..., + value: schema_pb2.DataType | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["value", b"value"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["key", b"key", "value", b"value"]) -> None: ... + + OPERAND_ID_FIELD_NUMBER: builtins.int + EXPRS_FIELD_NUMBER: builtins.int + OUTPUT_TYPES_FIELD_NUMBER: builtins.int + OPERAND_NAME_FIELD_NUMBER: builtins.int + operand_id: builtins.str + @property + def exprs(self) -> google.protobuf.internal.containers.MessageMap[builtins.str, expr_pb2.Expr]: ... + @property + def output_types(self) -> google.protobuf.internal.containers.MessageMap[builtins.str, schema_pb2.DataType]: ... + operand_name: builtins.str + """NOTE: FOLLOWING PROPERTIES ARE SET BY THE SERVER AND WILL BE IGNORED BY + THE CLIENT + """ + def __init__( + self, + *, + operand_id: builtins.str = ..., + exprs: collections.abc.Mapping[builtins.str, expr_pb2.Expr] | None = ..., + output_types: collections.abc.Mapping[builtins.str, schema_pb2.DataType] | None = ..., + operand_name: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["exprs", b"exprs", "operand_id", b"operand_id", "operand_name", b"operand_name", "output_types", b"output_types"]) -> None: ... + +global___AssignExpr = AssignExpr + @typing_extensions.final class Dropnull(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor diff --git a/fennel/gen/expr_pb2.py b/fennel/gen/expr_pb2.py new file mode 100644 index 000000000..faf568c8a --- /dev/null +++ b/fennel/gen/expr_pb2.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: expr.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +import fennel.gen.schema_pb2 as schema__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\nexpr.proto\x12\x11\x66\x65nnel.proto.expr\x1a\x0cschema.proto\"\xba\x04\n\x04\x45xpr\x12%\n\x03ref\x18\x01 \x01(\x0b\x32\x16.fennel.proto.expr.RefH\x00\x12\x36\n\x0cjson_literal\x18\x02 \x01(\x0b\x32\x1e.fennel.proto.expr.JsonLiteralH\x00\x12)\n\x05unary\x18\x04 \x01(\x0b\x32\x18.fennel.proto.expr.UnaryH\x00\x12\'\n\x04\x63\x61se\x18\x05 \x01(\x0b\x32\x17.fennel.proto.expr.CaseH\x00\x12+\n\x06\x62inary\x18\x06 \x01(\x0b\x32\x19.fennel.proto.expr.BinaryH\x00\x12+\n\x06isnull\x18\x07 \x01(\x0b\x32\x19.fennel.proto.expr.IsNullH\x00\x12/\n\x08\x66illnull\x18\x08 \x01(\x0b\x32\x1b.fennel.proto.expr.FillNullH\x00\x12,\n\x07list_fn\x18\t \x01(\x0b\x32\x19.fennel.proto.expr.ListFnH\x00\x12,\n\x07math_fn\x18\n \x01(\x0b\x32\x19.fennel.proto.expr.MathFnH\x00\x12\x30\n\tstruct_fn\x18\x0b \x01(\x0b\x32\x1b.fennel.proto.expr.StructFnH\x00\x12,\n\x07\x64ict_fn\x18\x0c \x01(\x0b\x32\x19.fennel.proto.expr.DictFnH\x00\x12\x30\n\tstring_fn\x18\r \x01(\x0b\x32\x1b.fennel.proto.expr.StringFnH\x00\x42\x06\n\x04node\"L\n\x0bJsonLiteral\x12\x0f\n\x07literal\x18\x01 \x01(\t\x12,\n\x05\x64type\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\x13\n\x03Ref\x12\x0c\n\x04name\x18\x01 \x01(\t\"Y\n\x05Unary\x12&\n\x02op\x18\x01 \x01(\x0e\x32\x1a.fennel.proto.expr.UnaryOp\x12(\n\x07operand\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"}\n\x06\x42inary\x12%\n\x04left\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12&\n\x05right\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12$\n\x02op\x18\x03 
\x01(\x0e\x32\x18.fennel.proto.expr.BinOp\"b\n\x04\x43\x61se\x12.\n\twhen_then\x18\x01 \x03(\x0b\x32\x1b.fennel.proto.expr.WhenThen\x12*\n\totherwise\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"X\n\x08WhenThen\x12%\n\x04when\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12%\n\x04then\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"2\n\x06IsNull\x12(\n\x07operand\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"[\n\x08\x46illNull\x12(\n\x07operand\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12%\n\x04\x66ill\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"\x93\x01\n\x06ListOp\x12%\n\x03len\x18\x01 \x01(\x0b\x32\x16.fennel.proto.expr.LenH\x00\x12&\n\x03get\x18\x02 \x01(\x0b\x32\x17.fennel.proto.expr.ExprH\x00\x12/\n\x08\x63ontains\x18\x03 \x01(\x0b\x32\x1b.fennel.proto.expr.ContainsH\x00\x42\t\n\x07\x66n_type\"\x05\n\x03Len\"4\n\x08\x43ontains\x12(\n\x07\x65lement\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"V\n\x06ListFn\x12%\n\x04list\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12%\n\x02\x66n\x18\x02 \x01(\x0b\x32\x19.fennel.proto.expr.ListOp\"\xb9\x01\n\x06MathOp\x12)\n\x05round\x18\x01 \x01(\x0b\x32\x18.fennel.proto.expr.RoundH\x00\x12%\n\x03\x61\x62s\x18\x02 \x01(\x0b\x32\x16.fennel.proto.expr.AbsH\x00\x12\'\n\x04\x63\x65il\x18\x03 \x01(\x0b\x32\x17.fennel.proto.expr.CeilH\x00\x12)\n\x05\x66loor\x18\x04 \x01(\x0b\x32\x18.fennel.proto.expr.FloorH\x00\x42\t\n\x07\x66n_type\"\x1a\n\x05Round\x12\x11\n\tprecision\x18\x01 \x01(\x05\"\x05\n\x03\x41\x62s\"\x06\n\x04\x43\x65il\"\x07\n\x05\x46loor\"Y\n\x06MathFn\x12(\n\x07operand\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12%\n\x02\x66n\x18\x02 \x01(\x0b\x32\x19.fennel.proto.expr.MathOp\"&\n\x08StructOp\x12\x0f\n\x05\x66ield\x18\x01 \x01(\tH\x00\x42\t\n\x07\x66n_type\"\\\n\x08StructFn\x12\'\n\x06struct\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12\'\n\x02\x66n\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.expr.StructOp\"a\n\x07\x44ictGet\x12&\n\x05\x66ield\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12.\n\rdefault_value\x18\x03 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"\x96\x01\n\x06\x44ictOp\x12%\n\x03len\x18\x01 \x01(\x0b\x32\x16.fennel.proto.expr.LenH\x00\x12)\n\x03get\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.expr.DictGetH\x00\x12/\n\x08\x63ontains\x18\x03 \x01(\x0b\x32\x1b.fennel.proto.expr.ContainsH\x00\x42\t\n\x07\x66n_type\"V\n\x06\x44ictFn\x12%\n\x04\x64ict\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12%\n\x02\x66n\x18\x02 \x01(\x0b\x32\x19.fennel.proto.expr.DictOp\"\xde\x02\n\x08StringOp\x12%\n\x03len\x18\x01 \x01(\x0b\x32\x16.fennel.proto.expr.LenH\x00\x12-\n\x07tolower\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.expr.ToLowerH\x00\x12-\n\x07toupper\x18\x03 \x01(\x0b\x32\x1a.fennel.proto.expr.ToUpperH\x00\x12/\n\x08\x63ontains\x18\x04 \x01(\x0b\x32\x1b.fennel.proto.expr.ContainsH\x00\x12\x33\n\nstartswith\x18\x05 \x01(\x0b\x32\x1d.fennel.proto.expr.StartsWithH\x00\x12/\n\x08\x65ndswith\x18\x06 \x01(\x0b\x32\x1b.fennel.proto.expr.EndsWithH\x00\x12+\n\x06\x63oncat\x18\x07 \x01(\x0b\x32\x19.fennel.proto.expr.ConcatH\x00\x42\t\n\x07\x66n_type\"\t\n\x07ToLower\"\t\n\x07ToUpper\"2\n\nStartsWith\x12$\n\x03key\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"0\n\x08\x45ndsWith\x12$\n\x03key\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"0\n\x06\x43oncat\x12&\n\x05other\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\"\\\n\x08StringFn\x12\'\n\x06string\x18\x01 \x01(\x0b\x32\x17.fennel.proto.expr.Expr\x12\'\n\x02\x66n\x18\x02 
\x01(\x0b\x32\x1b.fennel.proto.expr.StringOp*$\n\x07UnaryOp\x12\x07\n\x03NEG\x10\x00\x12\x07\n\x03NOT\x10\x01\x12\x07\n\x03LEN\x10\x02*\x86\x01\n\x05\x42inOp\x12\x07\n\x03\x41\x44\x44\x10\x00\x12\x07\n\x03SUB\x10\x01\x12\x07\n\x03MUL\x10\x02\x12\x07\n\x03\x44IV\x10\x03\x12\x07\n\x03MOD\x10\x04\x12\r\n\tFLOOR_DIV\x10\x05\x12\x06\n\x02\x45Q\x10\x06\x12\x06\n\x02NE\x10\x07\x12\x06\n\x02GT\x10\x08\x12\x07\n\x03GTE\x10\t\x12\x06\n\x02LT\x10\n\x12\x07\n\x03LTE\x10\x0b\x12\x07\n\x03\x41ND\x10\x0c\x12\x06\n\x02OR\x10\rb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'expr_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + DESCRIPTOR._options = None + _globals['_UNARYOP']._serialized_start=2997 + _globals['_UNARYOP']._serialized_end=3033 + _globals['_BINOP']._serialized_start=3036 + _globals['_BINOP']._serialized_end=3170 + _globals['_EXPR']._serialized_start=48 + _globals['_EXPR']._serialized_end=618 + _globals['_JSONLITERAL']._serialized_start=620 + _globals['_JSONLITERAL']._serialized_end=696 + _globals['_REF']._serialized_start=698 + _globals['_REF']._serialized_end=717 + _globals['_UNARY']._serialized_start=719 + _globals['_UNARY']._serialized_end=808 + _globals['_BINARY']._serialized_start=810 + _globals['_BINARY']._serialized_end=935 + _globals['_CASE']._serialized_start=937 + _globals['_CASE']._serialized_end=1035 + _globals['_WHENTHEN']._serialized_start=1037 + _globals['_WHENTHEN']._serialized_end=1125 + _globals['_ISNULL']._serialized_start=1127 + _globals['_ISNULL']._serialized_end=1177 + _globals['_FILLNULL']._serialized_start=1179 + _globals['_FILLNULL']._serialized_end=1270 + _globals['_LISTOP']._serialized_start=1273 + _globals['_LISTOP']._serialized_end=1420 + _globals['_LEN']._serialized_start=1422 + _globals['_LEN']._serialized_end=1427 + _globals['_CONTAINS']._serialized_start=1429 + _globals['_CONTAINS']._serialized_end=1481 + _globals['_LISTFN']._serialized_start=1483 + _globals['_LISTFN']._serialized_end=1569 + _globals['_MATHOP']._serialized_start=1572 + _globals['_MATHOP']._serialized_end=1757 + _globals['_ROUND']._serialized_start=1759 + _globals['_ROUND']._serialized_end=1785 + _globals['_ABS']._serialized_start=1787 + _globals['_ABS']._serialized_end=1792 + _globals['_CEIL']._serialized_start=1794 + _globals['_CEIL']._serialized_end=1800 + _globals['_FLOOR']._serialized_start=1802 + _globals['_FLOOR']._serialized_end=1809 + _globals['_MATHFN']._serialized_start=1811 + _globals['_MATHFN']._serialized_end=1900 + _globals['_STRUCTOP']._serialized_start=1902 + _globals['_STRUCTOP']._serialized_end=1940 + _globals['_STRUCTFN']._serialized_start=1942 + _globals['_STRUCTFN']._serialized_end=2034 + _globals['_DICTGET']._serialized_start=2036 + _globals['_DICTGET']._serialized_end=2133 + _globals['_DICTOP']._serialized_start=2136 + _globals['_DICTOP']._serialized_end=2286 + _globals['_DICTFN']._serialized_start=2288 + _globals['_DICTFN']._serialized_end=2374 + _globals['_STRINGOP']._serialized_start=2377 + _globals['_STRINGOP']._serialized_end=2727 + _globals['_TOLOWER']._serialized_start=2729 + _globals['_TOLOWER']._serialized_end=2738 + _globals['_TOUPPER']._serialized_start=2740 + _globals['_TOUPPER']._serialized_end=2749 + _globals['_STARTSWITH']._serialized_start=2751 + _globals['_STARTSWITH']._serialized_end=2801 + _globals['_ENDSWITH']._serialized_start=2803 + _globals['_ENDSWITH']._serialized_end=2851 + _globals['_CONCAT']._serialized_start=2853 + 
_globals['_CONCAT']._serialized_end=2901 + _globals['_STRINGFN']._serialized_start=2903 + _globals['_STRINGFN']._serialized_end=2995 +# @@protoc_insertion_point(module_scope) diff --git a/fennel/gen/expr_pb2.pyi b/fennel/gen/expr_pb2.pyi new file mode 100644 index 000000000..64bfed121 --- /dev/null +++ b/fennel/gen/expr_pb2.pyi @@ -0,0 +1,711 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import schema_pb2 +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +class _UnaryOp: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + +class _UnaryOpEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_UnaryOp.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + NEG: _UnaryOp.ValueType # 0 + NOT: _UnaryOp.ValueType # 1 + LEN: _UnaryOp.ValueType # 2 + +class UnaryOp(_UnaryOp, metaclass=_UnaryOpEnumTypeWrapper): ... + +NEG: UnaryOp.ValueType # 0 +NOT: UnaryOp.ValueType # 1 +LEN: UnaryOp.ValueType # 2 +global___UnaryOp = UnaryOp + +class _BinOp: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + +class _BinOpEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_BinOp.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + ADD: _BinOp.ValueType # 0 + SUB: _BinOp.ValueType # 1 + MUL: _BinOp.ValueType # 2 + DIV: _BinOp.ValueType # 3 + MOD: _BinOp.ValueType # 4 + FLOOR_DIV: _BinOp.ValueType # 5 + EQ: _BinOp.ValueType # 6 + NE: _BinOp.ValueType # 7 + GT: _BinOp.ValueType # 8 + GTE: _BinOp.ValueType # 9 + LT: _BinOp.ValueType # 10 + LTE: _BinOp.ValueType # 11 + AND: _BinOp.ValueType # 12 + OR: _BinOp.ValueType # 13 + +class BinOp(_BinOp, metaclass=_BinOpEnumTypeWrapper): ... + +ADD: BinOp.ValueType # 0 +SUB: BinOp.ValueType # 1 +MUL: BinOp.ValueType # 2 +DIV: BinOp.ValueType # 3 +MOD: BinOp.ValueType # 4 +FLOOR_DIV: BinOp.ValueType # 5 +EQ: BinOp.ValueType # 6 +NE: BinOp.ValueType # 7 +GT: BinOp.ValueType # 8 +GTE: BinOp.ValueType # 9 +LT: BinOp.ValueType # 10 +LTE: BinOp.ValueType # 11 +AND: BinOp.ValueType # 12 +OR: BinOp.ValueType # 13 +global___BinOp = BinOp + +@typing_extensions.final +class Expr(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + REF_FIELD_NUMBER: builtins.int + JSON_LITERAL_FIELD_NUMBER: builtins.int + UNARY_FIELD_NUMBER: builtins.int + CASE_FIELD_NUMBER: builtins.int + BINARY_FIELD_NUMBER: builtins.int + ISNULL_FIELD_NUMBER: builtins.int + FILLNULL_FIELD_NUMBER: builtins.int + LIST_FN_FIELD_NUMBER: builtins.int + MATH_FN_FIELD_NUMBER: builtins.int + STRUCT_FN_FIELD_NUMBER: builtins.int + DICT_FN_FIELD_NUMBER: builtins.int + STRING_FN_FIELD_NUMBER: builtins.int + @property + def ref(self) -> global___Ref: ... + @property + def json_literal(self) -> global___JsonLiteral: + """Used for serializing a literal as a JSON string""" + @property + def unary(self) -> global___Unary: ... + @property + def case(self) -> global___Case: ... + @property + def binary(self) -> global___Binary: ... + @property + def isnull(self) -> global___IsNull: ... 
+ @property + def fillnull(self) -> global___FillNull: ... + @property + def list_fn(self) -> global___ListFn: ... + @property + def math_fn(self) -> global___MathFn: ... + @property + def struct_fn(self) -> global___StructFn: ... + @property + def dict_fn(self) -> global___DictFn: ... + @property + def string_fn(self) -> global___StringFn: ... + def __init__( + self, + *, + ref: global___Ref | None = ..., + json_literal: global___JsonLiteral | None = ..., + unary: global___Unary | None = ..., + case: global___Case | None = ..., + binary: global___Binary | None = ..., + isnull: global___IsNull | None = ..., + fillnull: global___FillNull | None = ..., + list_fn: global___ListFn | None = ..., + math_fn: global___MathFn | None = ..., + struct_fn: global___StructFn | None = ..., + dict_fn: global___DictFn | None = ..., + string_fn: global___StringFn | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["binary", b"binary", "case", b"case", "dict_fn", b"dict_fn", "fillnull", b"fillnull", "isnull", b"isnull", "json_literal", b"json_literal", "list_fn", b"list_fn", "math_fn", b"math_fn", "node", b"node", "ref", b"ref", "string_fn", b"string_fn", "struct_fn", b"struct_fn", "unary", b"unary"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["binary", b"binary", "case", b"case", "dict_fn", b"dict_fn", "fillnull", b"fillnull", "isnull", b"isnull", "json_literal", b"json_literal", "list_fn", b"list_fn", "math_fn", b"math_fn", "node", b"node", "ref", b"ref", "string_fn", b"string_fn", "struct_fn", b"struct_fn", "unary", b"unary"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["node", b"node"]) -> typing_extensions.Literal["ref", "json_literal", "unary", "case", "binary", "isnull", "fillnull", "list_fn", "math_fn", "struct_fn", "dict_fn", "string_fn"] | None: ... + +global___Expr = Expr + +@typing_extensions.final +class JsonLiteral(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LITERAL_FIELD_NUMBER: builtins.int + DTYPE_FIELD_NUMBER: builtins.int + literal: builtins.str + """Literal sent as a JSON string""" + @property + def dtype(self) -> schema_pb2.DataType: ... + def __init__( + self, + *, + literal: builtins.str = ..., + dtype: schema_pb2.DataType | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["dtype", b"dtype"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["dtype", b"dtype", "literal", b"literal"]) -> None: ... + +global___JsonLiteral = JsonLiteral + +@typing_extensions.final +class Ref(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + name: builtins.str + def __init__( + self, + *, + name: builtins.str = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name"]) -> None: ... + +global___Ref = Ref + +@typing_extensions.final +class Unary(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OP_FIELD_NUMBER: builtins.int + OPERAND_FIELD_NUMBER: builtins.int + op: global___UnaryOp.ValueType + @property + def operand(self) -> global___Expr: ... + def __init__( + self, + *, + op: global___UnaryOp.ValueType = ..., + operand: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["operand", b"operand"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["op", b"op", "operand", b"operand"]) -> None: ... + +global___Unary = Unary + +@typing_extensions.final +class Binary(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LEFT_FIELD_NUMBER: builtins.int + RIGHT_FIELD_NUMBER: builtins.int + OP_FIELD_NUMBER: builtins.int + @property + def left(self) -> global___Expr: ... + @property + def right(self) -> global___Expr: ... + op: global___BinOp.ValueType + def __init__( + self, + *, + left: global___Expr | None = ..., + right: global___Expr | None = ..., + op: global___BinOp.ValueType = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["left", b"left", "right", b"right"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["left", b"left", "op", b"op", "right", b"right"]) -> None: ... + +global___Binary = Binary + +@typing_extensions.final +class Case(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + WHEN_THEN_FIELD_NUMBER: builtins.int + OTHERWISE_FIELD_NUMBER: builtins.int + @property + def when_then(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___WhenThen]: ... + @property + def otherwise(self) -> global___Expr: ... + def __init__( + self, + *, + when_then: collections.abc.Iterable[global___WhenThen] | None = ..., + otherwise: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["otherwise", b"otherwise"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["otherwise", b"otherwise", "when_then", b"when_then"]) -> None: ... + +global___Case = Case + +@typing_extensions.final +class WhenThen(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + WHEN_FIELD_NUMBER: builtins.int + THEN_FIELD_NUMBER: builtins.int + @property + def when(self) -> global___Expr: ... + @property + def then(self) -> global___Expr: ... + def __init__( + self, + *, + when: global___Expr | None = ..., + then: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["then", b"then", "when", b"when"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["then", b"then", "when", b"when"]) -> None: ... + +global___WhenThen = WhenThen + +@typing_extensions.final +class IsNull(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OPERAND_FIELD_NUMBER: builtins.int + @property + def operand(self) -> global___Expr: ... + def __init__( + self, + *, + operand: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["operand", b"operand"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["operand", b"operand"]) -> None: ... + +global___IsNull = IsNull + +@typing_extensions.final +class FillNull(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OPERAND_FIELD_NUMBER: builtins.int + FILL_FIELD_NUMBER: builtins.int + @property + def operand(self) -> global___Expr: ... + @property + def fill(self) -> global___Expr: ... + def __init__( + self, + *, + operand: global___Expr | None = ..., + fill: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["fill", b"fill", "operand", b"operand"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["fill", b"fill", "operand", b"operand"]) -> None: ... + +global___FillNull = FillNull + +@typing_extensions.final +class ListOp(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LEN_FIELD_NUMBER: builtins.int + GET_FIELD_NUMBER: builtins.int + CONTAINS_FIELD_NUMBER: builtins.int + @property + def len(self) -> global___Len: ... + @property + def get(self) -> global___Expr: + """Index to fetch an element from the list""" + @property + def contains(self) -> global___Contains: + """Check if the list contains an element""" + def __init__( + self, + *, + len: global___Len | None = ..., + get: global___Expr | None = ..., + contains: global___Contains | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["contains", b"contains", "fn_type", b"fn_type", "get", b"get", "len", b"len"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["contains", b"contains", "fn_type", b"fn_type", "get", b"get", "len", b"len"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["fn_type", b"fn_type"]) -> typing_extensions.Literal["len", "get", "contains"] | None: ... + +global___ListOp = ListOp + +@typing_extensions.final +class Len(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + def __init__( + self, + ) -> None: ... + +global___Len = Len + +@typing_extensions.final +class Contains(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ELEMENT_FIELD_NUMBER: builtins.int + @property + def element(self) -> global___Expr: ... + def __init__( + self, + *, + element: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["element", b"element"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["element", b"element"]) -> None: ... + +global___Contains = Contains + +@typing_extensions.final +class ListFn(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LIST_FIELD_NUMBER: builtins.int + FN_FIELD_NUMBER: builtins.int + @property + def list(self) -> global___Expr: ... + @property + def fn(self) -> global___ListOp: ... + def __init__( + self, + *, + list: global___Expr | None = ..., + fn: global___ListOp | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["fn", b"fn", "list", b"list"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["fn", b"fn", "list", b"list"]) -> None: ... + +global___ListFn = ListFn + +@typing_extensions.final +class MathOp(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ROUND_FIELD_NUMBER: builtins.int + ABS_FIELD_NUMBER: builtins.int + CEIL_FIELD_NUMBER: builtins.int + FLOOR_FIELD_NUMBER: builtins.int + @property + def round(self) -> global___Round: ... + @property + def abs(self) -> global___Abs: ... + @property + def ceil(self) -> global___Ceil: ... + @property + def floor(self) -> global___Floor: ... + def __init__( + self, + *, + round: global___Round | None = ..., + abs: global___Abs | None = ..., + ceil: global___Ceil | None = ..., + floor: global___Floor | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["abs", b"abs", "ceil", b"ceil", "floor", b"floor", "fn_type", b"fn_type", "round", b"round"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["abs", b"abs", "ceil", b"ceil", "floor", b"floor", "fn_type", b"fn_type", "round", b"round"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["fn_type", b"fn_type"]) -> typing_extensions.Literal["round", "abs", "ceil", "floor"] | None: ... + +global___MathOp = MathOp + +@typing_extensions.final +class Round(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + PRECISION_FIELD_NUMBER: builtins.int + precision: builtins.int + def __init__( + self, + *, + precision: builtins.int = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["precision", b"precision"]) -> None: ... + +global___Round = Round + +@typing_extensions.final +class Abs(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + def __init__( + self, + ) -> None: ... + +global___Abs = Abs + +@typing_extensions.final +class Ceil(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + def __init__( + self, + ) -> None: ... + +global___Ceil = Ceil + +@typing_extensions.final +class Floor(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + def __init__( + self, + ) -> None: ... + +global___Floor = Floor + +@typing_extensions.final +class MathFn(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OPERAND_FIELD_NUMBER: builtins.int + FN_FIELD_NUMBER: builtins.int + @property + def operand(self) -> global___Expr: ... + @property + def fn(self) -> global___MathOp: ... + def __init__( + self, + *, + operand: global___Expr | None = ..., + fn: global___MathOp | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["fn", b"fn", "operand", b"operand"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["fn", b"fn", "operand", b"operand"]) -> None: ... + +global___MathFn = MathFn + +@typing_extensions.final +class StructOp(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + FIELD_FIELD_NUMBER: builtins.int + field: builtins.str + def __init__( + self, + *, + field: builtins.str = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["field", b"field", "fn_type", b"fn_type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["field", b"field", "fn_type", b"fn_type"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["fn_type", b"fn_type"]) -> typing_extensions.Literal["field"] | None: ... + +global___StructOp = StructOp + +@typing_extensions.final +class StructFn(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + STRUCT_FIELD_NUMBER: builtins.int + FN_FIELD_NUMBER: builtins.int + @property + def struct(self) -> global___Expr: ... + @property + def fn(self) -> global___StructOp: ... + def __init__( + self, + *, + struct: global___Expr | None = ..., + fn: global___StructOp | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["fn", b"fn", "struct", b"struct"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["fn", b"fn", "struct", b"struct"]) -> None: ... 
+ +global___StructFn = StructFn + +@typing_extensions.final +class DictGet(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + FIELD_FIELD_NUMBER: builtins.int + DEFAULT_VALUE_FIELD_NUMBER: builtins.int + @property + def field(self) -> global___Expr: ... + @property + def default_value(self) -> global___Expr: ... + def __init__( + self, + *, + field: global___Expr | None = ..., + default_value: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["default_value", b"default_value", "field", b"field"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["default_value", b"default_value", "field", b"field"]) -> None: ... + +global___DictGet = DictGet + +@typing_extensions.final +class DictOp(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LEN_FIELD_NUMBER: builtins.int + GET_FIELD_NUMBER: builtins.int + CONTAINS_FIELD_NUMBER: builtins.int + @property + def len(self) -> global___Len: ... + @property + def get(self) -> global___DictGet: ... + @property + def contains(self) -> global___Contains: ... + def __init__( + self, + *, + len: global___Len | None = ..., + get: global___DictGet | None = ..., + contains: global___Contains | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["contains", b"contains", "fn_type", b"fn_type", "get", b"get", "len", b"len"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["contains", b"contains", "fn_type", b"fn_type", "get", b"get", "len", b"len"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["fn_type", b"fn_type"]) -> typing_extensions.Literal["len", "get", "contains"] | None: ... + +global___DictOp = DictOp + +@typing_extensions.final +class DictFn(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + DICT_FIELD_NUMBER: builtins.int + FN_FIELD_NUMBER: builtins.int + @property + def dict(self) -> global___Expr: ... + @property + def fn(self) -> global___DictOp: ... + def __init__( + self, + *, + dict: global___Expr | None = ..., + fn: global___DictOp | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["dict", b"dict", "fn", b"fn"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["dict", b"dict", "fn", b"fn"]) -> None: ... + +global___DictFn = DictFn + +@typing_extensions.final +class StringOp(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + LEN_FIELD_NUMBER: builtins.int + TOLOWER_FIELD_NUMBER: builtins.int + TOUPPER_FIELD_NUMBER: builtins.int + CONTAINS_FIELD_NUMBER: builtins.int + STARTSWITH_FIELD_NUMBER: builtins.int + ENDSWITH_FIELD_NUMBER: builtins.int + CONCAT_FIELD_NUMBER: builtins.int + @property + def len(self) -> global___Len: ... + @property + def tolower(self) -> global___ToLower: ... + @property + def toupper(self) -> global___ToUpper: ... + @property + def contains(self) -> global___Contains: ... + @property + def startswith(self) -> global___StartsWith: ... + @property + def endswith(self) -> global___EndsWith: ... + @property + def concat(self) -> global___Concat: ... 
+ def __init__( + self, + *, + len: global___Len | None = ..., + tolower: global___ToLower | None = ..., + toupper: global___ToUpper | None = ..., + contains: global___Contains | None = ..., + startswith: global___StartsWith | None = ..., + endswith: global___EndsWith | None = ..., + concat: global___Concat | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["concat", b"concat", "contains", b"contains", "endswith", b"endswith", "fn_type", b"fn_type", "len", b"len", "startswith", b"startswith", "tolower", b"tolower", "toupper", b"toupper"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["concat", b"concat", "contains", b"contains", "endswith", b"endswith", "fn_type", b"fn_type", "len", b"len", "startswith", b"startswith", "tolower", b"tolower", "toupper", b"toupper"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["fn_type", b"fn_type"]) -> typing_extensions.Literal["len", "tolower", "toupper", "contains", "startswith", "endswith", "concat"] | None: ... + +global___StringOp = StringOp + +@typing_extensions.final +class ToLower(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + def __init__( + self, + ) -> None: ... + +global___ToLower = ToLower + +@typing_extensions.final +class ToUpper(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + def __init__( + self, + ) -> None: ... + +global___ToUpper = ToUpper + +@typing_extensions.final +class StartsWith(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + @property + def key(self) -> global___Expr: ... + def __init__( + self, + *, + key: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["key", b"key"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["key", b"key"]) -> None: ... + +global___StartsWith = StartsWith + +@typing_extensions.final +class EndsWith(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + KEY_FIELD_NUMBER: builtins.int + @property + def key(self) -> global___Expr: ... + def __init__( + self, + *, + key: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["key", b"key"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["key", b"key"]) -> None: ... + +global___EndsWith = EndsWith + +@typing_extensions.final +class Concat(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + OTHER_FIELD_NUMBER: builtins.int + @property + def other(self) -> global___Expr: ... + def __init__( + self, + *, + other: global___Expr | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["other", b"other"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["other", b"other"]) -> None: ... + +global___Concat = Concat + +@typing_extensions.final +class StringFn(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + STRING_FIELD_NUMBER: builtins.int + FN_FIELD_NUMBER: builtins.int + @property + def string(self) -> global___Expr: ... + @property + def fn(self) -> global___StringOp: ... + def __init__( + self, + *, + string: global___Expr | None = ..., + fn: global___StringOp | None = ..., + ) -> None: ... 
+ def HasField(self, field_name: typing_extensions.Literal["fn", b"fn", "string", b"string"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["fn", b"fn", "string", b"string"]) -> None: ... + +global___StringFn = StringFn diff --git a/fennel/gen/pycode_pb2.py b/fennel/gen/pycode_pb2.py index c4cd0f703..d17598f79 100644 --- a/fennel/gen/pycode_pb2.py +++ b/fennel/gen/pycode_pb2.py @@ -13,7 +13,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cpycode.proto\x12\x13\x66\x65nnel.proto.pycode\"\xb3\x02\n\x06PyCode\x12\x13\n\x0b\x65ntry_point\x18\x01 \x01(\t\x12\x13\n\x0bsource_code\x18\x02 \x01(\t\x12\x11\n\tcore_code\x18\x03 \x01(\t\x12\x16\n\x0egenerated_code\x18\x04 \x01(\t\x12-\n\x08includes\x18\x05 \x03(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x42\n\x0cref_includes\x18\x06 \x03(\x0b\x32,.fennel.proto.pycode.PyCode.RefIncludesEntry\x12\x0f\n\x07imports\x18\x07 \x01(\t\x1aP\n\x10RefIncludesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12+\n\x05value\x18\x02 \x01(\x0e\x32\x1c.fennel.proto.pycode.RefType:\x02\x38\x01*&\n\x07RefType\x12\x0b\n\x07\x44\x61taset\x10\x00\x12\x0e\n\nFeatureset\x10\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cpycode.proto\x12\x13\x66\x65nnel.proto.pycode\"\xb3\x02\n\x06PyCode\x12\x13\n\x0b\x65ntry_point\x18\x01 \x01(\t\x12\x13\n\x0bsource_code\x18\x02 \x01(\t\x12\x11\n\tcore_code\x18\x03 \x01(\t\x12\x16\n\x0egenerated_code\x18\x04 \x01(\t\x12-\n\x08includes\x18\x05 \x03(\x0b\x32\x1b.fennel.proto.pycode.PyCode\x12\x42\n\x0cref_includes\x18\x06 \x03(\x0b\x32,.fennel.proto.pycode.PyCode.RefIncludesEntry\x12\x0f\n\x07imports\x18\x07 \x01(\t\x1aP\n\x10RefIncludesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12+\n\x05value\x18\x02 \x01(\x0e\x32\x1c.fennel.proto.pycode.RefType:\x02\x38\x01\"T\n\x03UDF\x12-\n\x06pycode\x18\x01 \x01(\x0b\x32\x1b.fennel.proto.pycode.PyCodeH\x00\x12\x16\n\x0cjson_literal\x18\x02 \x01(\tH\x00\x42\x06\n\x04node*&\n\x07RefType\x12\x0b\n\x07\x44\x61taset\x10\x00\x12\x0e\n\nFeatureset\x10\x01\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -22,10 +22,12 @@ DESCRIPTOR._options = None _globals['_PYCODE_REFINCLUDESENTRY']._options = None _globals['_PYCODE_REFINCLUDESENTRY']._serialized_options = b'8\001' - _globals['_REFTYPE']._serialized_start=347 - _globals['_REFTYPE']._serialized_end=385 + _globals['_REFTYPE']._serialized_start=433 + _globals['_REFTYPE']._serialized_end=471 _globals['_PYCODE']._serialized_start=38 _globals['_PYCODE']._serialized_end=345 _globals['_PYCODE_REFINCLUDESENTRY']._serialized_start=265 _globals['_PYCODE_REFINCLUDESENTRY']._serialized_end=345 + _globals['_UDF']._serialized_start=347 + _globals['_UDF']._serialized_end=431 # @@protoc_insertion_point(module_scope) diff --git a/fennel/gen/pycode_pb2.pyi b/fennel/gen/pycode_pb2.pyi index ff7f30edd..9b997c4f8 100644 --- a/fennel/gen/pycode_pb2.pyi +++ b/fennel/gen/pycode_pb2.pyi @@ -98,3 +98,25 @@ class PyCode(google.protobuf.message.Message): def ClearField(self, field_name: typing_extensions.Literal["core_code", b"core_code", "entry_point", b"entry_point", "generated_code", b"generated_code", "imports", b"imports", "includes", b"includes", "ref_includes", b"ref_includes", "source_code", b"source_code"]) -> None: ... 
global___PyCode = PyCode + +@typing_extensions.final +class UDF(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + PYCODE_FIELD_NUMBER: builtins.int + JSON_LITERAL_FIELD_NUMBER: builtins.int + @property + def pycode(self) -> global___PyCode: ... + json_literal: builtins.str + """Used for serializing a literal as a JSON string""" + def __init__( + self, + *, + pycode: global___PyCode | None = ..., + json_literal: builtins.str = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["json_literal", b"json_literal", "node", b"node", "pycode", b"pycode"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["json_literal", b"json_literal", "node", b"node", "pycode", b"pycode"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["node", b"node"]) -> typing_extensions.Literal["pycode", "json_literal"] | None: ... + +global___UDF = UDF diff --git a/fennel/gen/schema_pb2.py b/fennel/gen/schema_pb2.py index 3949be109..e8d3e1501 100644 --- a/fennel/gen/schema_pb2.py +++ b/fennel/gen/schema_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cschema.proto\x12\x13\x66\x65nnel.proto.schema\x1a\x1fgoogle/protobuf/timestamp.proto\"\x84\x07\n\x08\x44\x61taType\x12\x30\n\x08int_type\x18\x01 \x01(\x0b\x32\x1c.fennel.proto.schema.IntTypeH\x00\x12\x36\n\x0b\x64ouble_type\x18\x02 \x01(\x0b\x32\x1f.fennel.proto.schema.DoubleTypeH\x00\x12\x36\n\x0bstring_type\x18\x03 \x01(\x0b\x32\x1f.fennel.proto.schema.StringTypeH\x00\x12\x32\n\tbool_type\x18\x04 \x01(\x0b\x32\x1d.fennel.proto.schema.BoolTypeH\x00\x12<\n\x0etimestamp_type\x18\x05 \x01(\x0b\x32\".fennel.proto.schema.TimestampTypeH\x00\x12\x34\n\narray_type\x18\x06 \x01(\x0b\x32\x1e.fennel.proto.schema.ArrayTypeH\x00\x12\x30\n\x08map_type\x18\x07 \x01(\x0b\x32\x1c.fennel.proto.schema.MapTypeH\x00\x12<\n\x0e\x65mbedding_type\x18\x08 \x01(\x0b\x32\".fennel.proto.schema.EmbeddingTypeH\x00\x12\x34\n\x0c\x62\x65tween_type\x18\t \x01(\x0b\x32\x1c.fennel.proto.schema.BetweenH\x00\x12\x31\n\x0bone_of_type\x18\n \x01(\x0b\x32\x1a.fennel.proto.schema.OneOfH\x00\x12\x34\n\nregex_type\x18\x0b \x01(\x0b\x32\x1e.fennel.proto.schema.RegexTypeH\x00\x12:\n\roptional_type\x18\x0c \x01(\x0b\x32!.fennel.proto.schema.OptionalTypeH\x00\x12\x36\n\x0bstruct_type\x18\r \x01(\x0b\x32\x1f.fennel.proto.schema.StructTypeH\x00\x12\x38\n\x0c\x64\x65\x63imal_type\x18\x0e \x01(\x0b\x32 .fennel.proto.schema.DecimalTypeH\x00\x12\x32\n\tdate_type\x18\x0f \x01(\x0b\x32\x1d.fennel.proto.schema.DateTypeH\x00\x12\x34\n\nbytes_type\x18\x10 \x01(\x0b\x32\x1e.fennel.proto.schema.BytesTypeH\x00\x42\x07\n\x05\x64type\"C\n\x05\x46ield\x12\x0c\n\x04name\x18\x01 \x01(\t\x12,\n\x05\x64type\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\t\n\x07IntType\"\x0c\n\nDoubleType\"\x0c\n\nStringType\"\n\n\x08\x42oolType\"\x0f\n\rTimestampType\"\n\n\x08\x44\x61teType\"\x0b\n\tBytesType\"\x1c\n\tRegexType\x12\x0f\n\x07pattern\x18\x01 \x01(\t\"6\n\tArrayType\x12)\n\x02of\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\'\n\rEmbeddingType\x12\x16\n\x0e\x65mbedding_size\x18\x02 \x01(\x05\"c\n\x07MapType\x12*\n\x03key\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"F\n\nStructType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12*\n\x06\x66ields\x18\x02 
\x03(\x0b\x32\x1a.fennel.proto.schema.Field\"\xb1\x01\n\x07\x42\x65tween\x12,\n\x05\x64type\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12\'\n\x03min\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\x12\'\n\x03max\x18\x03 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\x12\x12\n\nstrict_min\x18\x04 \x01(\x08\x12\x12\n\nstrict_max\x18\x05 \x01(\x08\"_\n\x05OneOf\x12)\n\x02of\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12+\n\x07options\x18\x02 \x03(\x0b\x32\x1a.fennel.proto.schema.Value\"9\n\x0cOptionalType\x12)\n\x02of\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\x1c\n\x0b\x44\x65\x63imalType\x12\r\n\x05scale\x18\x01 \x01(\x05\"4\n\x06Schema\x12*\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x1a.fennel.proto.schema.Field\"\x89\x01\n\x08\x44SSchema\x12)\n\x04keys\x18\x01 \x01(\x0b\x32\x1b.fennel.proto.schema.Schema\x12+\n\x06values\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.schema.Schema\x12\x11\n\ttimestamp\x18\x03 \x01(\t\x12\x12\n\nerase_keys\x18\x04 \x03(\t\"\xda\x03\n\x05Value\x12)\n\x04none\x18\x01 \x01(\x0b\x32\x19.fennel.proto.schema.NoneH\x00\x12\x0e\n\x04\x62ool\x18\x02 \x01(\x08H\x00\x12\r\n\x03int\x18\x03 \x01(\x03H\x00\x12\x0f\n\x05\x66loat\x18\x04 \x01(\x01H\x00\x12\x10\n\x06string\x18\x05 \x01(\tH\x00\x12/\n\ttimestamp\x18\x06 \x01(\x0b\x32\x1a.google.protobuf.TimestampH\x00\x12\x33\n\tembedding\x18\x07 \x01(\x0b\x32\x1e.fennel.proto.schema.EmbeddingH\x00\x12)\n\x04list\x18\x08 \x01(\x0b\x32\x19.fennel.proto.schema.ListH\x00\x12\'\n\x03map\x18\t \x01(\x0b\x32\x18.fennel.proto.schema.MapH\x00\x12\x32\n\x06struct\x18\n \x01(\x0b\x32 .fennel.proto.schema.StructValueH\x00\x12/\n\x07\x64\x65\x63imal\x18\x0b \x01(\x0b\x32\x1c.fennel.proto.schema.DecimalH\x00\x12)\n\x04\x64\x61te\x18\x0c \x01(\x0b\x32\x19.fennel.proto.schema.DateH\x00\x12\x0f\n\x05\x62ytes\x18\r \x01(\x0cH\x00\x42\t\n\x07variant\"\x1b\n\tEmbedding\x12\x0e\n\x06values\x18\x01 \x03(\x01\"`\n\x04List\x12,\n\x05\x64type\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12*\n\x06values\x18\x02 \x03(\x0b\x32\x1a.fennel.proto.schema.Value\"\xf9\x01\n\x03Map\x12\x30\n\tkey_dtype\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12\x32\n\x0bvalue_dtype\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12/\n\x07\x65ntries\x18\x03 \x03(\x0b\x32\x1e.fennel.proto.schema.Map.Entry\x1a[\n\x05\x45ntry\x12\'\n\x03key\x18\x01 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\x12)\n\x05value\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\"\x87\x01\n\x0bStructValue\x12\x36\n\x06\x66ields\x18\x01 \x03(\x0b\x32&.fennel.proto.schema.StructValue.Entry\x1a@\n\x05\x45ntry\x12\x0c\n\x04name\x18\x01 \x01(\t\x12)\n\x05value\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\"\'\n\x07\x44\x65\x63imal\x12\r\n\x05scale\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x03\"\x14\n\x04\x44\x61te\x12\x0c\n\x04\x64\x61ys\x18\x02 \x01(\x03\"\x06\n\x04Noneb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0cschema.proto\x12\x13\x66\x65nnel.proto.schema\x1a\x1fgoogle/protobuf/timestamp.proto\"\xb8\x07\n\x08\x44\x61taType\x12\x30\n\x08int_type\x18\x01 \x01(\x0b\x32\x1c.fennel.proto.schema.IntTypeH\x00\x12\x36\n\x0b\x64ouble_type\x18\x02 \x01(\x0b\x32\x1f.fennel.proto.schema.DoubleTypeH\x00\x12\x36\n\x0bstring_type\x18\x03 \x01(\x0b\x32\x1f.fennel.proto.schema.StringTypeH\x00\x12\x32\n\tbool_type\x18\x04 \x01(\x0b\x32\x1d.fennel.proto.schema.BoolTypeH\x00\x12<\n\x0etimestamp_type\x18\x05 \x01(\x0b\x32\".fennel.proto.schema.TimestampTypeH\x00\x12\x34\n\narray_type\x18\x06 
\x01(\x0b\x32\x1e.fennel.proto.schema.ArrayTypeH\x00\x12\x30\n\x08map_type\x18\x07 \x01(\x0b\x32\x1c.fennel.proto.schema.MapTypeH\x00\x12<\n\x0e\x65mbedding_type\x18\x08 \x01(\x0b\x32\".fennel.proto.schema.EmbeddingTypeH\x00\x12\x34\n\x0c\x62\x65tween_type\x18\t \x01(\x0b\x32\x1c.fennel.proto.schema.BetweenH\x00\x12\x31\n\x0bone_of_type\x18\n \x01(\x0b\x32\x1a.fennel.proto.schema.OneOfH\x00\x12\x34\n\nregex_type\x18\x0b \x01(\x0b\x32\x1e.fennel.proto.schema.RegexTypeH\x00\x12:\n\roptional_type\x18\x0c \x01(\x0b\x32!.fennel.proto.schema.OptionalTypeH\x00\x12\x36\n\x0bstruct_type\x18\r \x01(\x0b\x32\x1f.fennel.proto.schema.StructTypeH\x00\x12\x38\n\x0c\x64\x65\x63imal_type\x18\x0e \x01(\x0b\x32 .fennel.proto.schema.DecimalTypeH\x00\x12\x32\n\tdate_type\x18\x0f \x01(\x0b\x32\x1d.fennel.proto.schema.DateTypeH\x00\x12\x34\n\nbytes_type\x18\x10 \x01(\x0b\x32\x1e.fennel.proto.schema.BytesTypeH\x00\x12\x32\n\tnull_type\x18\x11 \x01(\x0b\x32\x1d.fennel.proto.schema.NullTypeH\x00\x42\x07\n\x05\x64type\"C\n\x05\x46ield\x12\x0c\n\x04name\x18\x01 \x01(\t\x12,\n\x05\x64type\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\n\n\x08NullType\"\t\n\x07IntType\"\x0c\n\nDoubleType\"\x0c\n\nStringType\"\n\n\x08\x42oolType\"\x0f\n\rTimestampType\"\n\n\x08\x44\x61teType\"\x0b\n\tBytesType\"\x1c\n\tRegexType\x12\x0f\n\x07pattern\x18\x01 \x01(\t\"6\n\tArrayType\x12)\n\x02of\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\'\n\rEmbeddingType\x12\x16\n\x0e\x65mbedding_size\x18\x02 \x01(\x05\"c\n\x07MapType\x12*\n\x03key\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"F\n\nStructType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12*\n\x06\x66ields\x18\x02 \x03(\x0b\x32\x1a.fennel.proto.schema.Field\"\xb1\x01\n\x07\x42\x65tween\x12,\n\x05\x64type\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12\'\n\x03min\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\x12\'\n\x03max\x18\x03 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\x12\x12\n\nstrict_min\x18\x04 \x01(\x08\x12\x12\n\nstrict_max\x18\x05 \x01(\x08\"_\n\x05OneOf\x12)\n\x02of\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12+\n\x07options\x18\x02 \x03(\x0b\x32\x1a.fennel.proto.schema.Value\"9\n\x0cOptionalType\x12)\n\x02of\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\"\x1c\n\x0b\x44\x65\x63imalType\x12\r\n\x05scale\x18\x01 \x01(\x05\"4\n\x06Schema\x12*\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x1a.fennel.proto.schema.Field\"\x89\x01\n\x08\x44SSchema\x12)\n\x04keys\x18\x01 \x01(\x0b\x32\x1b.fennel.proto.schema.Schema\x12+\n\x06values\x18\x02 \x01(\x0b\x32\x1b.fennel.proto.schema.Schema\x12\x11\n\ttimestamp\x18\x03 \x01(\t\x12\x12\n\nerase_keys\x18\x04 \x03(\t\"\xda\x03\n\x05Value\x12)\n\x04none\x18\x01 \x01(\x0b\x32\x19.fennel.proto.schema.NoneH\x00\x12\x0e\n\x04\x62ool\x18\x02 \x01(\x08H\x00\x12\r\n\x03int\x18\x03 \x01(\x03H\x00\x12\x0f\n\x05\x66loat\x18\x04 \x01(\x01H\x00\x12\x10\n\x06string\x18\x05 \x01(\tH\x00\x12/\n\ttimestamp\x18\x06 \x01(\x0b\x32\x1a.google.protobuf.TimestampH\x00\x12\x33\n\tembedding\x18\x07 \x01(\x0b\x32\x1e.fennel.proto.schema.EmbeddingH\x00\x12)\n\x04list\x18\x08 \x01(\x0b\x32\x19.fennel.proto.schema.ListH\x00\x12\'\n\x03map\x18\t \x01(\x0b\x32\x18.fennel.proto.schema.MapH\x00\x12\x32\n\x06struct\x18\n \x01(\x0b\x32 .fennel.proto.schema.StructValueH\x00\x12/\n\x07\x64\x65\x63imal\x18\x0b \x01(\x0b\x32\x1c.fennel.proto.schema.DecimalH\x00\x12)\n\x04\x64\x61te\x18\x0c 
\x01(\x0b\x32\x19.fennel.proto.schema.DateH\x00\x12\x0f\n\x05\x62ytes\x18\r \x01(\x0cH\x00\x42\t\n\x07variant\"\x1b\n\tEmbedding\x12\x0e\n\x06values\x18\x01 \x03(\x01\"`\n\x04List\x12,\n\x05\x64type\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12*\n\x06values\x18\x02 \x03(\x0b\x32\x1a.fennel.proto.schema.Value\"\xf9\x01\n\x03Map\x12\x30\n\tkey_dtype\x18\x01 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12\x32\n\x0bvalue_dtype\x18\x02 \x01(\x0b\x32\x1d.fennel.proto.schema.DataType\x12/\n\x07\x65ntries\x18\x03 \x03(\x0b\x32\x1e.fennel.proto.schema.Map.Entry\x1a[\n\x05\x45ntry\x12\'\n\x03key\x18\x01 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\x12)\n\x05value\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\"\x87\x01\n\x0bStructValue\x12\x36\n\x06\x66ields\x18\x01 \x03(\x0b\x32&.fennel.proto.schema.StructValue.Entry\x1a@\n\x05\x45ntry\x12\x0c\n\x04name\x18\x01 \x01(\t\x12)\n\x05value\x18\x02 \x01(\x0b\x32\x1a.fennel.proto.schema.Value\"\'\n\x07\x44\x65\x63imal\x12\r\n\x05scale\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x03\"\x14\n\x04\x44\x61te\x12\x0c\n\x04\x64\x61ys\x18\x02 \x01(\x03\"\x06\n\x04Noneb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -22,63 +22,65 @@ if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None _globals['_DATATYPE']._serialized_start=71 - _globals['_DATATYPE']._serialized_end=971 - _globals['_FIELD']._serialized_start=973 - _globals['_FIELD']._serialized_end=1040 - _globals['_INTTYPE']._serialized_start=1042 - _globals['_INTTYPE']._serialized_end=1051 - _globals['_DOUBLETYPE']._serialized_start=1053 - _globals['_DOUBLETYPE']._serialized_end=1065 - _globals['_STRINGTYPE']._serialized_start=1067 - _globals['_STRINGTYPE']._serialized_end=1079 - _globals['_BOOLTYPE']._serialized_start=1081 - _globals['_BOOLTYPE']._serialized_end=1091 - _globals['_TIMESTAMPTYPE']._serialized_start=1093 - _globals['_TIMESTAMPTYPE']._serialized_end=1108 - _globals['_DATETYPE']._serialized_start=1110 - _globals['_DATETYPE']._serialized_end=1120 - _globals['_BYTESTYPE']._serialized_start=1122 - _globals['_BYTESTYPE']._serialized_end=1133 - _globals['_REGEXTYPE']._serialized_start=1135 - _globals['_REGEXTYPE']._serialized_end=1163 - _globals['_ARRAYTYPE']._serialized_start=1165 - _globals['_ARRAYTYPE']._serialized_end=1219 - _globals['_EMBEDDINGTYPE']._serialized_start=1221 - _globals['_EMBEDDINGTYPE']._serialized_end=1260 - _globals['_MAPTYPE']._serialized_start=1262 - _globals['_MAPTYPE']._serialized_end=1361 - _globals['_STRUCTTYPE']._serialized_start=1363 - _globals['_STRUCTTYPE']._serialized_end=1433 - _globals['_BETWEEN']._serialized_start=1436 - _globals['_BETWEEN']._serialized_end=1613 - _globals['_ONEOF']._serialized_start=1615 - _globals['_ONEOF']._serialized_end=1710 - _globals['_OPTIONALTYPE']._serialized_start=1712 - _globals['_OPTIONALTYPE']._serialized_end=1769 - _globals['_DECIMALTYPE']._serialized_start=1771 - _globals['_DECIMALTYPE']._serialized_end=1799 - _globals['_SCHEMA']._serialized_start=1801 - _globals['_SCHEMA']._serialized_end=1853 - _globals['_DSSCHEMA']._serialized_start=1856 - _globals['_DSSCHEMA']._serialized_end=1993 - _globals['_VALUE']._serialized_start=1996 - _globals['_VALUE']._serialized_end=2470 - _globals['_EMBEDDING']._serialized_start=2472 - _globals['_EMBEDDING']._serialized_end=2499 - _globals['_LIST']._serialized_start=2501 - _globals['_LIST']._serialized_end=2597 - _globals['_MAP']._serialized_start=2600 - _globals['_MAP']._serialized_end=2849 - 
_globals['_MAP_ENTRY']._serialized_start=2758 - _globals['_MAP_ENTRY']._serialized_end=2849 - _globals['_STRUCTVALUE']._serialized_start=2852 - _globals['_STRUCTVALUE']._serialized_end=2987 - _globals['_STRUCTVALUE_ENTRY']._serialized_start=2923 - _globals['_STRUCTVALUE_ENTRY']._serialized_end=2987 - _globals['_DECIMAL']._serialized_start=2989 - _globals['_DECIMAL']._serialized_end=3028 - _globals['_DATE']._serialized_start=3030 - _globals['_DATE']._serialized_end=3050 - _globals['_NONE']._serialized_start=3052 - _globals['_NONE']._serialized_end=3058 + _globals['_DATATYPE']._serialized_end=1023 + _globals['_FIELD']._serialized_start=1025 + _globals['_FIELD']._serialized_end=1092 + _globals['_NULLTYPE']._serialized_start=1094 + _globals['_NULLTYPE']._serialized_end=1104 + _globals['_INTTYPE']._serialized_start=1106 + _globals['_INTTYPE']._serialized_end=1115 + _globals['_DOUBLETYPE']._serialized_start=1117 + _globals['_DOUBLETYPE']._serialized_end=1129 + _globals['_STRINGTYPE']._serialized_start=1131 + _globals['_STRINGTYPE']._serialized_end=1143 + _globals['_BOOLTYPE']._serialized_start=1145 + _globals['_BOOLTYPE']._serialized_end=1155 + _globals['_TIMESTAMPTYPE']._serialized_start=1157 + _globals['_TIMESTAMPTYPE']._serialized_end=1172 + _globals['_DATETYPE']._serialized_start=1174 + _globals['_DATETYPE']._serialized_end=1184 + _globals['_BYTESTYPE']._serialized_start=1186 + _globals['_BYTESTYPE']._serialized_end=1197 + _globals['_REGEXTYPE']._serialized_start=1199 + _globals['_REGEXTYPE']._serialized_end=1227 + _globals['_ARRAYTYPE']._serialized_start=1229 + _globals['_ARRAYTYPE']._serialized_end=1283 + _globals['_EMBEDDINGTYPE']._serialized_start=1285 + _globals['_EMBEDDINGTYPE']._serialized_end=1324 + _globals['_MAPTYPE']._serialized_start=1326 + _globals['_MAPTYPE']._serialized_end=1425 + _globals['_STRUCTTYPE']._serialized_start=1427 + _globals['_STRUCTTYPE']._serialized_end=1497 + _globals['_BETWEEN']._serialized_start=1500 + _globals['_BETWEEN']._serialized_end=1677 + _globals['_ONEOF']._serialized_start=1679 + _globals['_ONEOF']._serialized_end=1774 + _globals['_OPTIONALTYPE']._serialized_start=1776 + _globals['_OPTIONALTYPE']._serialized_end=1833 + _globals['_DECIMALTYPE']._serialized_start=1835 + _globals['_DECIMALTYPE']._serialized_end=1863 + _globals['_SCHEMA']._serialized_start=1865 + _globals['_SCHEMA']._serialized_end=1917 + _globals['_DSSCHEMA']._serialized_start=1920 + _globals['_DSSCHEMA']._serialized_end=2057 + _globals['_VALUE']._serialized_start=2060 + _globals['_VALUE']._serialized_end=2534 + _globals['_EMBEDDING']._serialized_start=2536 + _globals['_EMBEDDING']._serialized_end=2563 + _globals['_LIST']._serialized_start=2565 + _globals['_LIST']._serialized_end=2661 + _globals['_MAP']._serialized_start=2664 + _globals['_MAP']._serialized_end=2913 + _globals['_MAP_ENTRY']._serialized_start=2822 + _globals['_MAP_ENTRY']._serialized_end=2913 + _globals['_STRUCTVALUE']._serialized_start=2916 + _globals['_STRUCTVALUE']._serialized_end=3051 + _globals['_STRUCTVALUE_ENTRY']._serialized_start=2987 + _globals['_STRUCTVALUE_ENTRY']._serialized_end=3051 + _globals['_DECIMAL']._serialized_start=3053 + _globals['_DECIMAL']._serialized_end=3092 + _globals['_DATE']._serialized_start=3094 + _globals['_DATE']._serialized_end=3114 + _globals['_NONE']._serialized_start=3116 + _globals['_NONE']._serialized_end=3122 # @@protoc_insertion_point(module_scope) diff --git a/fennel/gen/schema_pb2.pyi b/fennel/gen/schema_pb2.pyi index a5f6dc1de..37e6aab9f 100644 --- a/fennel/gen/schema_pb2.pyi 
+++ b/fennel/gen/schema_pb2.pyi @@ -37,6 +37,7 @@ class DataType(google.protobuf.message.Message): DECIMAL_TYPE_FIELD_NUMBER: builtins.int DATE_TYPE_FIELD_NUMBER: builtins.int BYTES_TYPE_FIELD_NUMBER: builtins.int + NULL_TYPE_FIELD_NUMBER: builtins.int @property def int_type(self) -> global___IntType: ... @property @@ -69,6 +70,8 @@ class DataType(google.protobuf.message.Message): def date_type(self) -> global___DateType: ... @property def bytes_type(self) -> global___BytesType: ... + @property + def null_type(self) -> global___NullType: ... def __init__( self, *, @@ -88,10 +91,11 @@ class DataType(google.protobuf.message.Message): decimal_type: global___DecimalType | None = ..., date_type: global___DateType | None = ..., bytes_type: global___BytesType | None = ..., + null_type: global___NullType | None = ..., ) -> None: ... - def HasField(self, field_name: typing_extensions.Literal["array_type", b"array_type", "between_type", b"between_type", "bool_type", b"bool_type", "bytes_type", b"bytes_type", "date_type", b"date_type", "decimal_type", b"decimal_type", "double_type", b"double_type", "dtype", b"dtype", "embedding_type", b"embedding_type", "int_type", b"int_type", "map_type", b"map_type", "one_of_type", b"one_of_type", "optional_type", b"optional_type", "regex_type", b"regex_type", "string_type", b"string_type", "struct_type", b"struct_type", "timestamp_type", b"timestamp_type"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["array_type", b"array_type", "between_type", b"between_type", "bool_type", b"bool_type", "bytes_type", b"bytes_type", "date_type", b"date_type", "decimal_type", b"decimal_type", "double_type", b"double_type", "dtype", b"dtype", "embedding_type", b"embedding_type", "int_type", b"int_type", "map_type", b"map_type", "one_of_type", b"one_of_type", "optional_type", b"optional_type", "regex_type", b"regex_type", "string_type", b"string_type", "struct_type", b"struct_type", "timestamp_type", b"timestamp_type"]) -> None: ... - def WhichOneof(self, oneof_group: typing_extensions.Literal["dtype", b"dtype"]) -> typing_extensions.Literal["int_type", "double_type", "string_type", "bool_type", "timestamp_type", "array_type", "map_type", "embedding_type", "between_type", "one_of_type", "regex_type", "optional_type", "struct_type", "decimal_type", "date_type", "bytes_type"] | None: ... + def HasField(self, field_name: typing_extensions.Literal["array_type", b"array_type", "between_type", b"between_type", "bool_type", b"bool_type", "bytes_type", b"bytes_type", "date_type", b"date_type", "decimal_type", b"decimal_type", "double_type", b"double_type", "dtype", b"dtype", "embedding_type", b"embedding_type", "int_type", b"int_type", "map_type", b"map_type", "null_type", b"null_type", "one_of_type", b"one_of_type", "optional_type", b"optional_type", "regex_type", b"regex_type", "string_type", b"string_type", "struct_type", b"struct_type", "timestamp_type", b"timestamp_type"]) -> builtins.bool: ... 
+    def ClearField(self, field_name: typing_extensions.Literal["array_type", b"array_type", "between_type", b"between_type", "bool_type", b"bool_type", "bytes_type", b"bytes_type", "date_type", b"date_type", "decimal_type", b"decimal_type", "double_type", b"double_type", "dtype", b"dtype", "embedding_type", b"embedding_type", "int_type", b"int_type", "map_type", b"map_type", "null_type", b"null_type", "one_of_type", b"one_of_type", "optional_type", b"optional_type", "regex_type", b"regex_type", "string_type", b"string_type", "struct_type", b"struct_type", "timestamp_type", b"timestamp_type"]) -> None: ...
+    def WhichOneof(self, oneof_group: typing_extensions.Literal["dtype", b"dtype"]) -> typing_extensions.Literal["int_type", "double_type", "string_type", "bool_type", "timestamp_type", "array_type", "map_type", "embedding_type", "between_type", "one_of_type", "regex_type", "optional_type", "struct_type", "decimal_type", "date_type", "bytes_type", "null_type"] | None: ...

 global___DataType = DataType

@@ -115,6 +119,16 @@ class Field(google.protobuf.message.Message):

 global___Field = Field

+@typing_extensions.final
+class NullType(google.protobuf.message.Message):
+    DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+    def __init__(
+        self,
+    ) -> None: ...
+
+global___NullType = NullType
+
 @typing_extensions.final
 class IntType(google.protobuf.message.Message):
     DESCRIPTOR: google.protobuf.descriptor.Descriptor
diff --git a/fennel/internal_lib/schema/schema.py b/fennel/internal_lib/schema/schema.py
index f9c5b8ed3..47e5d7ef2 100644
--- a/fennel/internal_lib/schema/schema.py
+++ b/fennel/internal_lib/schema/schema.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+from typing import Dict
 import dataclasses
 import re
 import typing
@@ -852,3 +852,66 @@ def data_schema_check(
     except Exception as e:
         raise e
     return exceptions
+
+
+def from_proto(data_type: schema_proto.DataType) -> Any:
+    field = data_type.WhichOneof("dtype")
+    if field == "int_type":
+        return int
+    elif field == "double_type":
+        return float
+    elif field == "string_type":
+        return str
+    elif field == "bool_type":
+        return bool
+    elif field == "timestamp_type":
+        return datetime
+    elif field == "date_type":
+        return date
+    elif field == "bytes_type":
+        return bytes
+    elif field == "array_type":
+        element_type = from_proto(data_type.array_type.of)
+        return List[element_type]  # type: ignore
+    elif field == "map_type":
+        key_type = from_proto(data_type.map_type.key)
+        value_type = from_proto(data_type.map_type.value)
+        return Dict[key_type, value_type]  # type: ignore
+    elif field == "struct_type":
+        fields = [
+            (f.name, from_proto(f.dtype)) for f in data_type.struct_type.fields
+        ]
+        # Dynamically create the dataclass with the specified fields
+        return dataclasses.make_dataclass(data_type.struct_type.name, fields)
+    elif field == "optional_type":
+        return Optional[from_proto(data_type.optional_type.of)]
+    elif field == "decimal_type":
+        return Decimal
+    elif field == "regex_type":
+        return regex(data_type.regex_type.pattern)
+    elif field == "embedding_type":
+        return _Embedding(data_type.embedding_type.embedding_size)
+    elif field == "between_type":
+        dtype = from_proto(data_type.between_type.dtype)
+        min_value = data_type.between_type.min
+        max_value = data_type.between_type.max
+        # A proto Value message always *has* both `int` and `float` attributes,
+        # so hasattr() cannot tell them apart; check the `variant` oneof instead.
+        min_val = min_value.int if min_value.WhichOneof("variant") == "int" else min_value.float
+        max_val = max_value.int if max_value.WhichOneof("variant") == "int" else max_value.float
+        return between(
+            dtype=dtype,
+            min=min_val,
+            max=max_val,
+            strict_min=data_type.between_type.strict_min,
+            strict_max=data_type.between_type.strict_max,
+        )
+    elif field == "one_of_type":
+        dtype = from_proto(data_type.one_of_type.of)
+        options = [
+            option.int if option.WhichOneof("variant") == "int" else option.string
+            for option in data_type.one_of_type.options
+        ]
+        return oneof(dtype=dtype, options=options)
+    else:
+        raise ValueError(f"Unsupported data type field: {field}")
diff --git a/fennel/internal_lib/schema/test_schema.py b/fennel/internal_lib/schema/test_schema.py
index a3103c7e1..d45850b82 100644
--- a/fennel/internal_lib/schema/test_schema.py
+++ b/fennel/internal_lib/schema/test_schema.py
@@ -1,19 +1,23 @@
-from datetime import datetime, timedelta, timezone
-from decimal import Decimal as PythonDecimal
-from typing import Dict, List, Optional
-
 import numpy as np
 import pandas as pd
 import pytest
+import unittest
+
+from datetime import datetime, date, timedelta, timezone
+from decimal import Decimal as PythonDecimal
+from typing import Dict, List, Optional, Union, get_type_hints

 import fennel.gen.schema_pb2 as proto
 from fennel.dtypes.dtypes import (
+    Embedding,
     struct,
     FENNEL_STRUCT_SRC_CODE,
     FENNEL_STRUCT_DEPENDENCIES_SRC_CODE,
     Decimal,
 )
 from fennel.internal_lib.schema.schema import (
+    fennel_is_optional,
+    from_proto,
     get_datatype,
     data_schema_check,
     between,
@@ -799,3 +803,158 @@ def test_convert_dtype_to_arrow_type():
         str(arrow_dtype)
         == "list, c: list>>"
     )
+
+
+class TestDataTypeConversions(unittest.TestCase):
+
+    def test_int_type(self):
+        original = int
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(original, result)
+
+    def test_float_type(self):
+        original = float
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(original, result)
+
+    def test_string_type(self):
+        original = str
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(original, result)
+
+    def test_datetime_type(self):
+        original = datetime
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(original, result)
+
+    def test_date_type(self):
+        original = date
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(original, result)
+
+    def test_numpy_int64(self):
+        original = np.int64
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(int, result)  # np.int64 should map to int
+
+    def test_numpy_float64(self):
+        original = np.float64
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(float, result)  # np.float64 should map to float
+
+    def test_pandas_int64_dtype(self):
+        original = pd.Int64Dtype
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(int, result)  # pd.Int64Dtype should map to int
+
+    def test_pandas_string_dtype(self):
+        original = pd.StringDtype
+        proto = get_datatype(original)
+        result = from_proto(proto)
+        self.assertEqual(str, result)  # pd.StringDtype should map to str
+
+    def test_custom_type_between(self):
+        original = between(
+            dtype=int, min=1, max=5, strict_min=True, strict_max=False
+        )
+        proto = original.to_proto()
+        result = from_proto(proto)
+        self.assertEqual(original.dtype, result.dtype)
+        self.assertEqual(original.min, result.min)
+        self.assertEqual(original.max, result.max)
+        self.assertEqual(original.strict_min, result.strict_min)
+        self.assertEqual(original.strict_max, result.strict_max)
+
+    def test_custom_type_oneof(self):
+        original = oneof(dtype=int, options=[1, 2, 3])
+        proto = original.to_proto()
+        result = from_proto(proto)
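+        # The options round-trip through schema_pb2.Value, whose `variant`
+        # oneof records whether each option holds an int or a string, so
+        # both attributes checked below should match the originals exactly.
+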
self.assertEqual(original.dtype, result.dtype) + self.assertEqual(original.options, result.options) + + def test_list_type(self): + original_type = List[int] + proto = get_datatype(original_type) + converted_type = from_proto(proto) + self.assertEqual(original_type, converted_type) + + def test_dict_type(self): + original_type = Dict[str, float] + proto = get_datatype(original_type) + converted_type = from_proto(proto) + self.assertEqual(original_type, converted_type) + + def test_optional_type(self): + original_type = Optional[int] + proto = get_datatype(original_type) + converted_type = from_proto(proto) + self.assertEqual(original_type, converted_type) + + def test_complex_case(self): + original_type = List[Dict[str, List[float]]] + proto = get_datatype(original_type) + converted_type = from_proto(proto) + self.assertEqual(original_type, converted_type) + + def test_embedding_case(self): + original_type = Embedding[4] + proto = get_datatype(original_type) + converted_type = from_proto(proto) + self.assertEqual(original_type, converted_type) + + def test_struct_case(self): + def assert_struct_fields_match(self, original_cls, reconstructed_cls): + # Names of class should match + self.assertEqual( + original_cls.__name__, + reconstructed_cls.__name__, + "Class names do not match.", + ) + original_fields = get_type_hints(original_cls) + reconstructed_fields = get_type_hints(reconstructed_cls) + self.assertEqual( + set(original_fields.keys()), + set(reconstructed_fields.keys()), + "Field names do not match.", + ) + + for field_name, original_type in original_fields.items(): + reconstructed_type = reconstructed_fields[field_name] + self.assertEqual( + original_type, + reconstructed_type, + f"Types for field {field_name} do not match.", + ) + + @struct + class A: + a: int + b: str + + original_type = A + proto = get_datatype(original_type) + converted_type = from_proto(proto) + assert_struct_fields_match(self, original_type, converted_type) + + @struct + class ComplexStruct: + a: List[int] + b: Dict[str, float] + c: Optional[str] + d: Embedding[4] + e: List[Dict[str, List[float]]] + f: Dict[str, Optional[int]] + g: Optional[Dict[str, List[float]]] + + original_type = ComplexStruct + proto = get_datatype(original_type) + converted_type = from_proto(proto) + assert_struct_fields_match(self, original_type, converted_type) diff --git a/fennel/internal_lib/to_proto/__init__.py b/fennel/internal_lib/to_proto/__init__.py index 48e31df87..8e489695e 100644 --- a/fennel/internal_lib/to_proto/__init__.py +++ b/fennel/internal_lib/to_proto/__init__.py @@ -5,6 +5,7 @@ features_from_fs, featureset_to_proto, extractors_from_fs, + val_as_json, ) from fennel.internal_lib.to_proto.source_code import ( get_dataset_core_code, diff --git a/fennel/internal_lib/to_proto/serializer.py b/fennel/internal_lib/to_proto/serializer.py index 048d7b019..52cedb0a4 100644 --- a/fennel/internal_lib/to_proto/serializer.py +++ b/fennel/internal_lib/to_proto/serializer.py @@ -5,9 +5,10 @@ import google.protobuf.duration_pb2 as duration_proto +from fennel.expr.serializer import ExprSerializer import fennel.gen.dataset_pb2 as proto from fennel.datasets import Dataset, Pipeline, Visitor -from fennel.datasets.datasets import WINDOW_FIELD_NAME, EmitStrategy +from fennel.datasets.datasets import WINDOW_FIELD_NAME, UDFType, EmitStrategy from fennel.internal_lib.duration import ( duration_to_timedelta, ) @@ -100,50 +101,87 @@ def visitTransform(self, obj): ) def visitFilter(self, obj): - filter_func_pycode = to_includes_proto(obj.func) - 
gen_pycode = wrap_function( - self.dataset_name, - self.dataset_code, - self.lib_generated_code, - filter_func_pycode, - is_filter=True, - ) - return proto.Operator( - id=obj.signature(), - is_root=obj == self.terminal_node, - pipeline_name=self.pipeline_name, - dataset_name=self.dataset_name, - ds_version=self.dataset_version, - filter=proto.Filter( + if obj.filter_type == UDFType.python: + filter_func_pycode = to_includes_proto(obj.func) + gen_pycode = wrap_function( + self.dataset_name, + self.dataset_code, + self.lib_generated_code, + filter_func_pycode, + is_filter=True, + ) + return proto.Operator( + id=obj.signature(), + is_root=obj == self.terminal_node, + pipeline_name=self.pipeline_name, + dataset_name=self.dataset_name, + ds_version=self.dataset_version, + filter=proto.Filter( + operand_id=self.visit(obj.node), + pycode=gen_pycode, + ), + ) + else: + serializer = ExprSerializer() + filter_expr = proto.FilterExpr( operand_id=self.visit(obj.node), - pycode=gen_pycode, - ), - ) + expr=serializer.serialize(obj.filter_expr.root), + ) + return proto.Operator( + id=obj.signature(), + is_root=obj == self.terminal_node, + pipeline_name=self.pipeline_name, + dataset_name=self.dataset_name, + ds_version=self.dataset_version, + filter_expr=filter_expr, + ) def visitAssign(self, obj): - assign_func_pycode = to_includes_proto(obj.func) - gen_pycode = wrap_function( - self.dataset_name, - self.dataset_code, - self.lib_generated_code, - assign_func_pycode, - is_assign=True, - column_name=obj.column, - ) - - return proto.Operator( - id=obj.signature(), - is_root=(obj == self.terminal_node), - pipeline_name=self.pipeline_name, - dataset_name=self.dataset_name, - ds_version=self.dataset_version, - assign=proto.Assign( - operand_id=self.visit(obj.node), - pycode=gen_pycode, + if obj.assign_type == UDFType.python: + assign_func_pycode = to_includes_proto(obj.func) + gen_pycode = wrap_function( + self.dataset_name, + self.dataset_code, + self.lib_generated_code, + assign_func_pycode, + is_assign=True, column_name=obj.column, - output_type=get_datatype(obj.output_type), - ), - ) + ) + + return proto.Operator( + id=obj.signature(), + is_root=(obj == self.terminal_node), + pipeline_name=self.pipeline_name, + dataset_name=self.dataset_name, + ds_version=self.dataset_version, + assign=proto.Assign( + operand_id=self.visit(obj.node), + pycode=gen_pycode, + column_name=obj.column, + output_type=get_datatype(obj.output_type), + ), + ) + elif obj.assign_type == UDFType.expr: + serializer = ExprSerializer() + assign_expr_message = proto.AssignExpr( + operand_id=self.visit(obj.node) + ) + for col, typed_expr in obj.output_expressions.items(): + assign_expr_message.exprs[col].CopyFrom( + serializer.serialize(typed_expr.expr.root) + ) + assign_expr_message.output_types[col].CopyFrom( + get_datatype(typed_expr.dtype) + ) + + return proto.Operator( + id=obj.signature(), + is_root=(obj == self.terminal_node), + pipeline_name=self.pipeline_name, + dataset_name=self.dataset_name, + ds_version=self.dataset_version, + assign_expr=assign_expr_message, + ) def visitAggregate(self, obj): emit_strategy = ( diff --git a/fennel/internal_lib/to_proto/to_proto.py b/fennel/internal_lib/to_proto/to_proto.py index 0bc6ff969..36e54980e 100644 --- a/fennel/internal_lib/to_proto/to_proto.py +++ b/fennel/internal_lib/to_proto/to_proto.py @@ -85,6 +85,15 @@ def _expectations_to_proto( ] +def val_as_json(val: Any) -> str: + if getattr(val.__class__, FENNEL_STRUCT, False): + return json.dumps(val.as_json()) + try: + return 
diff --git a/fennel/internal_lib/to_proto/to_proto.py b/fennel/internal_lib/to_proto/to_proto.py
index 0bc6ff969..36e54980e 100644
--- a/fennel/internal_lib/to_proto/to_proto.py
+++ b/fennel/internal_lib/to_proto/to_proto.py
@@ -85,6 +85,15 @@ def _expectations_to_proto(
     ]
 
 
+def val_as_json(val: Any) -> str:
+    if getattr(val.__class__, FENNEL_STRUCT, False):
+        return json.dumps(val.as_json())
+    try:
+        return json.dumps(val)
+    except TypeError:
+        return json.dumps(str(val))
+
+
 # ------------------------------------------------------------------------------
 # Sync
 # ------------------------------------------------------------------------------
@@ -652,14 +661,7 @@ def _to_field_lookup_proto(
         return fs_proto.FieldLookupInfo(
             field=_field_to_proto(info.field), default_value=json.dumps(None)
         )
-
-    if getattr(info.default.__class__, FENNEL_STRUCT, False):
-        default_val = json.dumps(info.default.as_json())
-    else:
-        try:
-            default_val = json.dumps(info.default)
-        except TypeError:
-            default_val = json.dumps(str(info.default))
+    default_val = val_as_json(info.default)
 
     return fs_proto.FieldLookupInfo(
         field=_field_to_proto(info.field),
diff --git a/fennel/testing/execute_aggregation.py b/fennel/testing/execute_aggregation.py
index b4c7bf761..f8c8dc014 100644
--- a/fennel/testing/execute_aggregation.py
+++ b/fennel/testing/execute_aggregation.py
@@ -320,11 +320,13 @@ def add_val_to_state(self, val, _ts):
         return self.get_val()
 
     def del_val_from_state(self, val, _ts):
-        self.count -= 1
-        if self.count == 0:
+        if self.count == 1:  # If removing the last value, reset everything
+            self.count = 0
             self.mean = 0
             self.m2 = 0
             return self.default
+
+        self.count -= 1
         delta = val - self.mean
         self.mean -= delta / self.count
         delta2 = val - self.mean
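Reviewer note: `del_val_from_state` is the reverse of Welford's online update, and the reordering above makes the "removing the last value" reset explicit before the count is decremented. A self-contained sketch of the add/remove pair, with a quick numeric check:

```python
# Reverse-Welford sketch mirroring del_val_from_state in the diff:
# remove a value from a running mean/variance accumulator.
class WelfordState:
    def __init__(self):
        self.count, self.mean, self.m2 = 0, 0.0, 0.0

    def add(self, val):
        self.count += 1
        delta = val - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (val - self.mean)

    def remove(self, val):
        if self.count == 1:  # removing the last value resets the state
            self.count, self.mean, self.m2 = 0, 0.0, 0.0
            return
        self.count -= 1
        delta = val - self.mean          # delta vs. the *old* mean
        self.mean -= delta / self.count  # count is already decremented
        self.m2 -= delta * (val - self.mean)


s = WelfordState()
for v in [1.0, 2.0, 3.0]:
    s.add(v)
s.remove(2.0)
assert abs(s.mean - 2.0) < 1e-9  # mean of {1, 3}
assert abs(s.m2 - 2.0) < 1e-9    # sum of squared deviations of {1, 3}
```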
diff --git a/fennel/testing/executor.py b/fennel/testing/executor.py
index 88023c89d..a29fc9bc8 100644
--- a/fennel/testing/executor.py
+++ b/fennel/testing/executor.py
@@ -6,12 +6,13 @@
 import numpy as np
 import pandas as pd
 
+from fennel.expr.visitor import ExprPrinter
 import pyarrow as pa
 from frozendict import frozendict
 
 import fennel.gen.schema_pb2 as schema_proto
 from fennel.datasets import Pipeline, Visitor, Dataset, Count
-from fennel.datasets.datasets import DSSchema, WINDOW_FIELD_NAME
+from fennel.datasets.datasets import UDFType, DSSchema, WINDOW_FIELD_NAME
 from fennel.gen.schema_pb2 import Field
 from fennel.internal_lib.duration import duration_to_timedelta
 from fennel.internal_lib.schema import get_datatype, fennel_is_optional
@@ -211,34 +212,53 @@ def visitTransform(self, obj) -> Optional[NodeRet]:
 
     def visitFilter(self, obj) -> Optional[NodeRet]:
         input_ret = self.visit(obj.node)
-        if input_ret is None or input_ret.df is None:
+        if (
+            input_ret is None
+            or input_ret.df is None
+            or input_ret.df.shape[0] == 0
+        ):
             return None
+
         fields = obj.dsschema().to_fields_proto()
-        filter_func_pycode = to_includes_proto(obj.func)
-        mod = types.ModuleType(filter_func_pycode.entry_point)
-        gen_pycode = wrap_function(
-            self.serializer.dataset_name,
-            self.serializer.dataset_code,
-            self.serializer.lib_generated_code,
-            filter_func_pycode,
-            is_filter=True,
-        )
-        code = gen_pycode.imports + "\n" + gen_pycode.generated_code
-        exec(code, mod.__dict__)
-        func = mod.__dict__[gen_pycode.entry_point]
-        try:
-            df = cast_df_to_pandas_dtype(input_ret.df, fields)
-            f_df = func(df).sort_values(input_ret.timestamp_field)
-        except Exception as e:
-            raise Exception(
-                f"Error in filter function `{obj.func.__name__}` for pipeline "
-                f"`{self.cur_pipeline_name}`, {e}"
+        if obj.filter_type == UDFType.python:
+            filter_func_pycode = to_includes_proto(obj.func)
+            mod = types.ModuleType(filter_func_pycode.entry_point)
+            gen_pycode = wrap_function(
+                self.serializer.dataset_name,
+                self.serializer.dataset_code,
+                self.serializer.lib_generated_code,
+                filter_func_pycode,
+                is_filter=True,
             )
+            code = gen_pycode.imports + "\n" + gen_pycode.generated_code
+            exec(code, mod.__dict__)
+            func = mod.__dict__[gen_pycode.entry_point]
+            try:
+                df = cast_df_to_pandas_dtype(input_ret.df, fields)
+                f_df = func(df).sort_values(input_ret.timestamp_field)
+            except Exception as e:
+                raise Exception(
+                    f"Error in filter function `{obj.func.__name__}` for pipeline "
+                    f"`{self.cur_pipeline_name}`, {e}"
+                )
+        else:
+            input_df = copy.deepcopy(input_ret.df)
+            f_df = input_df.copy()
+            f_df.reset_index(drop=True, inplace=True)
+            try:
+                f_df = f_df[
+                    obj.filter_expr.eval(input_df, obj.dsschema().schema())
+                ]
+            except Exception as e:
+                printer = ExprPrinter()
+                raise Exception(
+                    f"Error in filter expression `{printer.print(obj.filter_expr)}` for pipeline "
+                    f"`{self.cur_pipeline_name}`, {e}"
+                )
         sorted_df = cast_df_to_arrow_dtype(
             f_df,
             fields,
         )
-
         return NodeRet(
             sorted_df,
             input_ret.timestamp_field,
@@ -712,38 +732,55 @@ def visitAssign(self, obj):
             or input_ret.df.shape[0] == 0
         ):
             return None
-        assign_func_pycode = to_includes_proto(obj.func)
-        mod = types.ModuleType(assign_func_pycode.entry_point)
-        gen_pycode = wrap_function(
-            self.serializer.dataset_name,
-            self.serializer.dataset_code,
-            self.serializer.lib_generated_code,
-            assign_func_pycode,
-            is_assign=True,
-            column_name=obj.column,
-        )
-        code = gen_pycode.imports + "\n" + gen_pycode.generated_code
-        exec(code, mod.__dict__)
-        func = mod.__dict__[gen_pycode.entry_point]
-        try:
-            df = cast_df_to_pandas_dtype(input_ret.df, input_ret.fields)
-            df = func(df)
-            field = schema_proto.Field(
-                name=obj.column, dtype=get_datatype(obj.output_type)
-            )
-            # Check the schema of the column
-            validate_field_in_df(field, df, self.cur_pipeline_name)
+        fields = obj.dsschema().to_fields_proto()
+        if obj.assign_type == UDFType.python:
+            assign_func_pycode = to_includes_proto(obj.func)
+            mod = types.ModuleType(assign_func_pycode.entry_point)
+            gen_pycode = wrap_function(
+                self.serializer.dataset_name,
+                self.serializer.dataset_code,
+                self.serializer.lib_generated_code,
+                assign_func_pycode,
+                is_assign=True,
+                column_name=obj.column,
+            )
+            code = gen_pycode.imports + "\n" + gen_pycode.generated_code
+            exec(code, mod.__dict__)
+            func = mod.__dict__[gen_pycode.entry_point]
+            try:
+                df = cast_df_to_pandas_dtype(input_ret.df, input_ret.fields)
+                df = func(df)
+                field = schema_proto.Field(
+                    name=obj.column, dtype=get_datatype(obj.output_type)
+                )
 
-            fields = obj.dsschema().to_fields_proto()
+                # Check the schema of the column
+                validate_field_in_df(field, df, self.cur_pipeline_name)
 
-            # Cast to arrow dtype
-            df = cast_df_to_arrow_dtype(df, fields)
-        except Exception as e:
-            raise Exception(
-                f"Error in assign node for column `{obj.column}` for pipeline "
-                f"`{self.cur_pipeline_name}`, {e}"
-            )
+                # Cast to arrow dtype
+                df = cast_df_to_arrow_dtype(df, fields)
+            except Exception as e:
+                raise Exception(
+                    f"Error in assign node for column `{obj.column}` for pipeline "
+                    f"`{self.cur_pipeline_name}`, {e}"
+                )
+        else:
+            input_df = copy.deepcopy(input_ret.df)
+            df = copy.deepcopy(input_df)
+            for col, typed_expr in obj.output_expressions.items():
+                if col in input_ret.df.columns:
+                    raise Exception(
+                        f"Column `{col}` already present in dataframe"
+                    )
+                input_dsschema = obj.node.dsschema().schema()
+                try:
+                    df[col] = typed_expr.expr.eval(input_df, input_dsschema)
+                except Exception as e:
+                    raise Exception(
+                        f"Error in assign node for column `{col}` for pipeline "
+                        f"`{self.cur_pipeline_name}`, {e}"
+                    )
         return NodeRet(
             df,
             input_ret.timestamp_field,
diff --git a/pyproject.toml b/pyproject.toml
index 49f4d30a3..69728052b 100644
--- a/pyproject.toml
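Reviewer note: in the mock executor, the expression branch reduces to plain pandas: evaluate the expression against the input frame, then use the result as a boolean mask (filter) or as a new column (assign). A minimal sketch, with `expr_eval` standing in for `TypedExpr.expr.eval` from the diff:

```python
import pandas as pd


def expr_eval(df: pd.DataFrame) -> pd.Series:
    # Stand-in for a serialized fennel expression such as `col("age") >= 18`.
    return df["age"] >= 18


df = pd.DataFrame({"age": [12, 25, 31]})

# Filter: keep rows where the expression evaluates to True.
filtered = df[expr_eval(df)]

# Assign: the target column must not already exist, mirroring the diff's check.
col = "is_adult"
if col in df.columns:
    raise Exception(f"Column `{col}` already present in dataframe")
df[col] = expr_eval(df)
```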
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "fennel-ai"
-version = "1.4.7"
+version = "1.5.0"
 description = "The modern realtime feature engineering platform"
 authors = ["Fennel AI "]
 packages = [{ include = "fennel" }]
@@ -39,8 +39,13 @@
 pyspelling = "^2.8.2"
 pyyaml = "^6.0.1"
 
 [build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
+requires = ["maturin", "setuptools>=42", "wheel"]
+build-backend = "maturin"
+
+[tool.maturin]
+name = "fennel_data_lib"
+sdist-directory = "python_package"
+manifest-path = "../server/fennel_data_lib/Cargo.toml"
 
 # inspired from - https://github.com/pypa/pip/blob/main/pyproject.toml
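Reviewer note: with `build-backend = "maturin"`, pip now builds this package by compiling the Rust crate at the `manifest-path` above, so a source build needs the server checkout to sit next to this repo. A quick post-install smoke test; it assumes the build exposes an importable `fennel_data_lib` extension module, per the `[tool.maturin]` name:

```python
import importlib

# Assumption: the built wheel ships a Rust-backed extension module named
# "fennel_data_lib"; this import should succeed after `pip install`.
mod = importlib.import_module("fennel_data_lib")
print(mod.__name__, getattr(mod, "__version__", "unknown"))
```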