Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ARROW-197] Add support for Polars in PyMongoArrow #185

Merged
merged 35 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
810777d
ARROW-197 First commit. Basic API, simple tests, and a number of ques…
caseyclements Jan 17, 2024
4e93616
Raise ValueError if one writes a Polars DataFrame with no _id column
caseyclements Jan 17, 2024
cd31d49
Working start on TestExplicitPolarsApi.test_write_schema_validation
caseyclements Jan 18, 2024
6923089
Cast ExtensionTypes in Arrow.Table from find_arrow_all to base pyarro…
caseyclements Jan 18, 2024
5594d20
cleanup
caseyclements Jan 18, 2024
87e6f40
cleanup formatting
caseyclements Jan 18, 2024
1102f67
Add polars to pyproject deps
caseyclements Jan 18, 2024
5ee5278
Add Polars to benchmarks
caseyclements Jan 20, 2024
9a13702
Additional Polars tests and todos
caseyclements Jan 20, 2024
2ea854c
Finished Polars tests
caseyclements Jan 23, 2024
679ded8
Ruff Cleanup
caseyclements Jan 24, 2024
15f168d
ARROW-206 ARROW-204 Added temporary pytest filterwarnings for Pandas …
caseyclements Jan 27, 2024
9788416
Remove redundant check if pd.DataFrame is None
caseyclements Jan 28, 2024
8e39c62
ARROW-210 Initial commit for pyarrow large_list and large_string Data…
caseyclements Jan 30, 2024
8b341eb
Updated and added further datetime tests
caseyclements Jan 30, 2024
e295768
Added tests of large_list and large_string to test_arrow
caseyclements Jan 30, 2024
083c035
Added version numbers to changelog and docstrings
caseyclements Jan 31, 2024
29a0471
[ARROW-210] Add support for large_list and large_string PyArrow DataT…
caseyclements Jan 31, 2024
f637356
Fixed merge typo
caseyclements Jan 31, 2024
010bb41
[ARROW-214] Added tests of Arrow binary datatypes with expected failu…
caseyclements Jan 31, 2024
a87a801
Updated FAQ. Was outdated as Pandas is now required. Polars slipped i…
caseyclements Feb 1, 2024
f782fda
Update heading of comparison.html. We had 2 Quick Start pages.
caseyclements Feb 1, 2024
bcf45f9
Typo
caseyclements Feb 1, 2024
afa347e
Updates to Quick Start'
caseyclements Feb 1, 2024
9c7ff82
Updates to index.rst and quickstart.rst
caseyclements Feb 1, 2024
48dbc8e
Updated Data Types page
caseyclements Feb 1, 2024
90966b9
Merge branch 'main' into ARROW-197-polars
caseyclements Feb 1, 2024
2ae0098
Fix heading underline
caseyclements Feb 1, 2024
e586173
Added manylinux-aarch64-image
caseyclements Feb 2, 2024
417a826
Merge remote-tracking branch 'upstream/main' into ARROW-197-polars
caseyclements Feb 2, 2024
9c98ad4
Removed completed todo in benchmarks. Polars array IS being tested.
caseyclements Feb 2, 2024
9404ede
ARROW-217 - Turned todos into jiras
caseyclements Feb 2, 2024
18a4f16
Place guards around polars imports
caseyclements Feb 2, 2024
4a0ae1a
Set polars as optional dependency in pyproject.toml
caseyclements Feb 2, 2024
de6db8c
Added test extras to benchmark env in tox.ini
caseyclements Feb 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 88 additions & 4 deletions bindings/python/pymongoarrow/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@
import warnings

import numpy as np
import pandas as pd
import polars as pl
blink1073 marked this conversation as resolved.
Show resolved Hide resolved
import pyarrow as pa
import pymongo.errors
from bson import encode
from bson.codec_options import TypeEncoder, TypeRegistry
from bson.raw_bson import RawBSONDocument
from numpy import ndarray
from pandas import NA, DataFrame
from pyarrow import Schema as ArrowSchema
from pyarrow import Table
from pymongo.bulk import BulkWriteError
Expand Down Expand Up @@ -50,6 +52,8 @@
"find_pandas_all",
"aggregate_numpy_all",
"find_numpy_all",
"aggregate_polars_all",
"find_polars_all",
"write",
"Schema",
]
Expand All @@ -62,6 +66,8 @@
"find_pandas_all",
"aggregate_numpy_all",
"find_numpy_all",
"aggregate_polars_all",
"find_polars_all",
]

# MongoDB 3.6's maxMessageSizeBytes minus some overhead to account
Expand Down Expand Up @@ -290,6 +296,78 @@ def aggregate_numpy_all(collection, pipeline, *, schema=None, **kwargs):
)


def _cast_away_extension_types_on_array(array: pa.Array) -> pa.Array:
"""Return an Array where ExtensionTypes have been cast to their base pyarrow types"""
if isinstance(array.type, pa.ExtensionType):
return array.cast(array.type.storage_type)
# elif pa.types.is_struct(field.type):
# ...
# elif pa.types.is_list(field.type):
# ...
return array


def _cast_away_extension_types_on_table(table: pa.Table) -> pa.Table:
"""Given arrow_table that may ExtensionTypes, cast these to the base pyarrow types"""
# Convert all fields in the Arrow table
converted_fields = [
_cast_away_extension_types_on_array(table.column(i)) for i in range(table.num_columns)
]
# Reconstruct the Arrow table
return pa.Table.from_arrays(converted_fields, names=table.column_names)


def _arrow_to_polars(arrow_table):
"""Helper function that converts an Arrow Table to a Polars DataFrame.

Note: Polars lacks ExtensionTypes. We cast them to their base arrow classes.
"""
arrow_table_without_extensions = _cast_away_extension_types_on_table(arrow_table)
return pl.from_arrow(arrow_table_without_extensions)


def find_polars_all(collection, query, *, schema=None, **kwargs):
"""Method that returns the results of a find query as a
:class:`polars.DataFrame` instance.

:Parameters:
- `collection`: Instance of :class:`~pymongo.collection.Collection`.
against which to run the ``find`` operation.
- `query`: A mapping containing the query to use for the find operation.
- `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
If the schema is not given, it will be inferred using the first
document in the result set.

Additional keyword-arguments passed to this method will be passed
directly to the underlying ``find`` operation.

:Returns:
An instance of class:`polars.DataFrame`.
"""
return _arrow_to_polars(find_arrow_all(collection, query, schema=schema, **kwargs))


def aggregate_polars_all(collection, pipeline, *, schema=None, **kwargs):
"""Method that returns the results of an aggregation pipeline as a
:class:`polars.DataFrame` instance.

:Parameters:
- `collection`: Instance of :class:`~pymongo.collection.Collection`.
against which to run the ``find`` operation.
- `pipeline`: A list of aggregation pipeline stages.
- `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
If the schema is not given, it will be inferred using the first
document in the result set.

Additional keyword-arguments passed to this method will be passed
directly to the underlying ``aggregate`` operation.

:Returns:
An instance of class:`polars.DataFrame`.
"""
return _arrow_to_polars(aggregate_arrow_all(collection, pipeline, schema=schema, **kwargs))


def _transform_bwe(bwe, offset):
bwe["nInserted"] += offset
for i in bwe["writeErrors"]:
Expand All @@ -306,9 +384,13 @@ def _tabular_generator(tabular):
for i in tabular.to_batches():
for row in i.to_pylist():
yield row
elif DataFrame is not None and isinstance(tabular, DataFrame):
elif pd.DataFrame is not None and isinstance(
tabular, pd.DataFrame
): # todo how could DataFrame be None?
blink1073 marked this conversation as resolved.
Show resolved Hide resolved
for row in tabular.to_dict("records"):
yield row
elif isinstance(tabular, pl.DataFrame):
blink1073 marked this conversation as resolved.
Show resolved Hide resolved
yield from _tabular_generator(tabular.to_arrow())
elif isinstance(tabular, dict):
iter_dict = {k: np.nditer(v) for k, v in tabular.items()}
try:
Expand All @@ -323,7 +405,7 @@ class _PandasNACodec(TypeEncoder):

@property
def python_type(self):
return NA.__class__
return pd.NA.__class__

def transform_python(self, _):
"""Transform an NA object into 'None'"""
Expand All @@ -348,8 +430,10 @@ def write(collection, tabular):
tab_size = len(tabular)
if isinstance(tabular, Table):
_validate_schema(tabular.schema.types)
elif isinstance(tabular, DataFrame):
elif isinstance(tabular, pd.DataFrame):
_validate_schema(ArrowSchema.from_pandas(tabular).types)
elif isinstance(tabular, pl.DataFrame):
pass # TODO
elif (
isinstance(tabular, dict)
and len(tabular.values()) >= 1
Expand Down
5 changes: 3 additions & 2 deletions bindings/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ description = '"Tools for using NumPy, Pandas and PyArrow with MongoDB"'
license = {text = "Apache License, Version 2.0"}
authors = [{name = "Prashant Mital"}]
maintainers = [{name = "MongoDB"}, {name = "Inc."}]
keywords = ["mongo", "mongodb", "pymongo", "arrow", "bson", "numpy", "pandas"]
keywords = ["mongo", "mongodb", "pymongo", "arrow", "bson", "numpy", "pandas", "polars"]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
Expand All @@ -38,7 +38,8 @@ dependencies = [
"pyarrow >=14.0,<14.1",
"pymongo >=4.4,<5",
"pandas >=1.3.5,<3",
"packaging >=23.2,<24"
"packaging >=23.2,<24",
"polars >= 0.20.4,<2.0.0"
blink1073 marked this conversation as resolved.
Show resolved Hide resolved
]
dynamic = ["version"]

Expand Down
Loading
Loading