This repository has been archived by the owner on Feb 20, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from JasperHG90/feat/pandas
Add Pandas IO manager
- Loading branch information
Showing
8 changed files
with
503 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,6 +46,9 @@ daft = [ | |
polars = [ | ||
"polars>=1.13.1", | ||
] | ||
pandas = [ | ||
"pandas>=2.2.3", | ||
] | ||
|
||
[tool.black] | ||
line-length = 88 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
from typing import Sequence, Type | ||
|
||
try: | ||
import pandas as pd | ||
except ImportError as e: | ||
raise ImportError( | ||
"Please install dagster-pyiceberg with the 'pandas' extra." | ||
) from e | ||
import pyarrow as pa | ||
from dagster import InputContext | ||
from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice | ||
from pyiceberg.catalog import Catalog | ||
|
||
from dagster_pyiceberg import io_manager as _io_manager | ||
from dagster_pyiceberg.io_manager.arrow import _IcebergPyArrowTypeHandler | ||
|
||
|
||
class _IcebergPandasTypeHandler(_IcebergPyArrowTypeHandler):
    """Type handler that converts data between Iceberg tables and pandas DataFrames.

    Extends the PyArrow type handler: DataFrames are converted to pyarrow
    Tables via ``to_arrow``, and inputs are read back through a pyarrow
    ``RecordBatchReader`` and converted to pandas.
    """

    def to_arrow(self, obj: pd.DataFrame) -> pa.Table:
        """Convert a pandas DataFrame into a pyarrow Table.

        Args:
            obj: The DataFrame to convert.

        Returns:
            The result of ``pa.Table.from_pandas(obj)``.
        """
        return pa.Table.from_pandas(obj)

    def load_input(
        self,
        context: InputContext,
        table_slice: TableSlice,
        connection: Catalog,
    ) -> pd.DataFrame:
        """Load the requested table slice as a pandas DataFrame.

        Args:
            context: The Dagster input context (not used directly here).
            table_slice: Identifies the table to load; its ``schema`` and
                ``table`` attributes form the ``schema.table`` identifier.
            connection: The Iceberg catalog used to load the table.

        Returns:
            The loaded data as a pandas DataFrame.
        """
        # A RecordBatchReader is requested from the inherited ``to_data_frame``
        # helper because ``read_pandas`` is a reader method, not a
        # ``pa.Table`` method.
        # NOTE(review): assumes ``to_data_frame`` honours ``target_type`` and
        # returns a ``pa.RecordBatchReader`` -- confirm against the parent class.
        reader: pa.RecordBatchReader = self.to_data_frame(
            table=connection.load_table(f"{table_slice.schema}.{table_slice.table}"),
            table_slice=table_slice,
            target_type=pa.RecordBatchReader,
        )
        return reader.read_pandas()

    @property
    def supported_types(self) -> Sequence[Type[object]]:
        """The Python types this handler can load and store: ``pd.DataFrame``."""
        return [pd.DataFrame]
|
||
|
||
class IcebergPandasIOManager(_io_manager.IcebergIOManager):
    """An IO manager definition that reads inputs from and writes outputs to Iceberg tables using Pandas.

    Examples:

    .. code-block:: python

        import pandas as pd
        from dagster import Definitions, asset

        # NOTE: adjust this import to wherever your installed version of
        # dagster-pyiceberg exports IcebergPandasIOManager.
        from dagster_pyiceberg import IcebergPandasIOManager, IcebergSqlCatalogConfig

        CATALOG_URI = "sqlite:////home/vscode/workspace/.tmp/examples/select_columns/catalog.db"
        CATALOG_WAREHOUSE = (
            "file:///home/vscode/workspace/.tmp/examples/select_columns/warehouse"
        )

        resources = {
            "io_manager": IcebergPandasIOManager(
                name="test",
                config=IcebergSqlCatalogConfig(
                    properties={"uri": CATALOG_URI, "warehouse": CATALOG_WAREHOUSE}
                ),
                schema="dagster",
            )
        }

        @asset
        def iris_dataset() -> pd.DataFrame:
            return pd.read_csv(
                "https://docs.dagster.io/assets/iris.csv",
                names=[
                    "sepal_length_cm",
                    "sepal_width_cm",
                    "petal_length_cm",
                    "petal_width_cm",
                    "species",
                ],
            )

        defs = Definitions(assets=[iris_dataset], resources=resources)

    If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
    the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.
    For ops, the schema can be specified by including a "schema" entry in output metadata. If none
    of these is provided, the schema will default to "public".

    .. code-block:: python

        @op(
            out={"my_table": Out(metadata={"schema": "my_schema"})}
        )
        def make_my_table() -> pd.DataFrame:
            ...

    To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
    In or AssetIn.

    .. code-block:: python

        @asset(
            ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}
        )
        def my_table_a(my_table: pd.DataFrame):
            # my_table will just contain the data from column "a"
            ...
    """

    @staticmethod
    def type_handlers() -> Sequence[DbTypeHandler]:
        """Return the type handlers used by this IO manager (the pandas handler)."""
        return [_IcebergPandasTypeHandler()]
Oops, something went wrong.