[ARROW-197] Add support for Polars
* ARROW-197 First commit. Basic API, simple tests, and a number of questions and to-dos

* Raise ValueError if one writes a Polars DataFrame with no _id column

* Working start on TestExplicitPolarsApi.test_write_schema_validation

* Cast ExtensionTypes in Arrow.Table from find_arrow_all to base pyarrow.types (eg pa.string, pa.lib.FixedSizeBinaryType)

* cleanup

* cleanup formatting

* Add polars to pyproject deps

* Add Polars to benchmarks

* Additional Polars tests and todos

* Finished Polars tests

* Ruff Cleanup

* ARROW-206 ARROW-204 Added temporary pytest filterwarnings for Pandas DeprecationWarnings

* Remove redundant check if pd.DataFrame is None

* ARROW-210 Initial commit for pyarrow large_list and large_string DataTypes

* Updated and added further datetime tests

* Added tests of large_list and large_string to test_arrow

* Added version numbers to changelog and docstrings

* Fixed merge typo

* [ARROW-214] Added tests of Arrow binary datatypes with expected failures. Will be good tests when support is added

* Updated FAQ. Was outdated as Pandas is now required. Polars slipped in perfectly

* Update heading of comparison.html. We had 2 Quick Start pages.

* Typo

* Updates to Quick Start

* Updates to index.rst and quickstart.rst

* Updated Data Types page

* Fix heading underline

* Added manylinux-aarch64-image

* Removed completed todo in benchmarks. Polars array IS being tested.

* ARROW-217 - Turned todos into jiras

* Place guards around polars imports

* Set polars as optional dependency in pyproject.toml

* Added test extras to benchmark env in tox.ini
caseyclements authored Feb 2, 2024
1 parent 0fbe7e7 commit 7130138
Showing 12 changed files with 598 additions and 40 deletions.
33 changes: 29 additions & 4 deletions bindings/python/benchmarks/benchmarks.py
@@ -18,6 +18,7 @@

import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pymongo
from bson import BSON, Binary, Decimal128
@@ -27,6 +28,7 @@
find_arrow_all,
find_numpy_all,
find_pandas_all,
find_polars_all,
write,
)
from pymongoarrow.types import BinaryType, Decimal128Type
@@ -74,6 +76,9 @@ def time_insert_pandas(self):
def time_insert_numpy(self):
write(db.benchmark, self.numpy_arrays)

def time_insert_polars(self):
write(db.benchmark, self.polars_table)

def peakmem_insert_arrow(self):
self.time_insert_arrow()

@@ -86,6 +91,9 @@ def peakmem_insert_pandas(self):
def peakmem_insert_numpy(self):
self.time_insert_numpy()

def peakmem_insert_polars(self):
self.time_insert_polars()


class Read(ABC):
"""
@@ -136,16 +144,25 @@ def time_to_pandas(self):
c = db.benchmark
find_pandas_all(c, {}, schema=self.schema, projection={"_id": 0})

def time_conventional_arrow(self):
c = db.benchmark
f = list(c.find({}, projection={"_id": 0}))
table = pa.Table.from_pylist(f)
self.exercise_table(table)

def time_to_arrow(self):
c = db.benchmark
table = find_arrow_all(c, {}, schema=self.schema, projection={"_id": 0})
self.exercise_table(table)

def time_conventional_polars(self):
    collection = db.benchmark
    cursor = collection.find(projection={"_id": 0})
    _ = pl.DataFrame(list(cursor))

def time_to_polars(self):
    c = db.benchmark
    find_polars_all(c, {}, schema=self.schema, projection={"_id": 0})

def peakmem_to_numpy(self):
self.time_to_numpy()
@@ -162,6 +179,12 @@ def peakmem_to_arrow(self):
def peakmem_conventional_arrow(self):
self.time_conventional_arrow()

def peakmem_to_polars(self):
self.time_to_polars()

def peakmem_conventional_polars(self):
self.time_conventional_polars()


class ProfileReadArray(Read):
schema = Schema(
@@ -364,6 +387,7 @@ def setup(self):
self.arrow_table = find_arrow_all(db.benchmark, {}, schema=self.schema)
self.pandas_table = find_pandas_all(db.benchmark, {}, schema=self.schema)
self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=self.schema)
self.polars_table = find_polars_all(db.benchmark, {}, schema=self.schema)


class ProfileInsertLarge(Insert):
@@ -383,3 +407,4 @@ def setup(self):
self.arrow_table = find_arrow_all(db.benchmark, {}, schema=self.schema)
self.pandas_table = find_pandas_all(db.benchmark, {}, schema=self.schema)
self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=self.schema)
self.polars_table = find_polars_all(db.benchmark, {}, schema=self.schema)
5 changes: 5 additions & 0 deletions bindings/python/docs/source/changelog.rst
@@ -1,6 +1,11 @@
Changelog
=========

Changes in Version 1.3.0
------------------------
- Support for Polars
- Support for PyArrow.DataTypes: large_list, large_string

Changes in Version 1.2.0
------------------------
- Support for PyArrow 14.0.
8 changes: 4 additions & 4 deletions bindings/python/docs/source/comparison.rst
@@ -1,8 +1,8 @@
Comparing to PyMongo
====================

This tutorial is intended as a comparison between using **PyMongoArrow**
and using PyMongo alone. The reader is assumed to be familiar with basic
`PyMongo <https://pymongo.readthedocs.io/en/stable/tutorial.html>`_ and
`MongoDB <https://docs.mongodb.com>`_ concepts.

7 changes: 6 additions & 1 deletion bindings/python/docs/source/data_types.rst
@@ -4,8 +4,12 @@ Data Types
==========

PyMongoArrow supports a majority of the BSON types.
Because Arrow and Polars provide first-class support for Lists and Structs,
this includes embedded arrays and documents.

Support for additional types will be added in subsequent releases.
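For example, a schema pairing an embedded document (a Struct) with a read into
Polars might look like this (a minimal sketch; the field names are hypothetical)::

    from pymongo import MongoClient
    from pymongoarrow.api import Schema, find_polars_all

    client = MongoClient()
    # The embedded 'account' document maps to an Arrow/Polars Struct column.
    schema = Schema({'account': {'name': str, 'number': int}})
    df = find_polars_all(client.db.data, {}, schema=schema)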


.. note:: For more information about BSON types, see the
`BSON specification <http://bsonspec.org/spec.html>`_.

@@ -131,11 +135,12 @@ dataframe will be the appropriate ``bson`` type.
>>> df["_id"][0]
ObjectId('64408bf65ac9e208af220144')
As of this writing, Polars does not support Extension Types.
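Columns that would use Extension Types in Arrow therefore arrive in Polars as
their base types. A sketch of what to expect (an assumption based on the
cast-to-base-types behavior noted in the commit messages above; ``coll`` stands
in for a collection)::

    >>> from bson import ObjectId
    >>> from pymongoarrow.api import Schema, find_polars_all
    >>> df = find_polars_all(coll, {}, schema=Schema({'_id': ObjectId}))
    >>> df["_id"][0]  # a raw 12-byte value rather than a bson.ObjectId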

Null Values and Conversion to Pandas DataFrames
-----------------------------------------------

In Arrow (and Polars), all Arrays are nullable.
Pandas has experimental nullable data types, e.g., ``Int64`` (note the capital "I").
You can instruct Arrow to create a pandas DataFrame using nullable dtypes
with the code below (taken from `here <https://arrow.apache.org/docs/python/pandas.html>`_)
10 changes: 5 additions & 5 deletions bindings/python/docs/source/faq.rst
@@ -3,13 +3,13 @@ Frequently Asked Questions

.. contents::

Why do I get ``ModuleNotFoundError: No module named 'polars'`` when using PyMongoArrow?
---------------------------------------------------------------------------------------

This error is raised when an application attempts to use a PyMongoArrow API
that returns query result sets as a :class:`polars.DataFrame` instance without
having ``polars`` installed in the Python environment. Since ``polars`` is not
a direct dependency of PyMongoArrow, it is not automatically installed when
you install ``pymongoarrow`` and must be installed separately::

$ python -m pip install polars
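As the commits above note, the ``polars`` imports are guarded so that the error
surfaces only when a Polars API is actually used. The usual pattern is roughly
this sketch (illustrative, not the library's exact code)::

    try:
        import polars as pl
    except ImportError:
        pl = None

    def _require_polars():
        # Raise only when a Polars-returning API is actually called.
        if pl is None:
            raise ModuleNotFoundError("No module named 'polars'")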
3 changes: 2 additions & 1 deletion bindings/python/docs/source/index.rst
@@ -6,7 +6,8 @@ Overview
**PyMongoArrow** is a `PyMongo <http://pymongo.readthedocs.io/>`_ extension
containing tools for loading `MongoDB <http://www.mongodb.org>`_ query result
sets as `Apache Arrow <http://arrow.apache.org>`_ tables,
`NumPy <https://numpy.org>`_ arrays, and `Pandas <https://pandas.pydata.org>`_
or `Polars <https://pola.rs/>`_ DataFrames.
PyMongoArrow is the recommended way to materialize MongoDB query result sets as
contiguous-in-memory, typed arrays suited for in-memory analytical processing
applications. This documentation attempts to explain everything you need to
49 changes: 32 additions & 17 deletions bindings/python/docs/source/quickstart.rst
@@ -68,10 +68,16 @@ to type-specifiers, e.g.::
schema = Schema({'_id': int, 'amount': float, 'last_updated': datetime})


PyMongoArrow offers first-class support for nested data (embedded documents)::

schema = Schema({'_id': int, 'amount': float, 'account': { 'name': str, 'account_number': int}})

Lists (and nested lists) are also supported::

from pyarrow import list_, string
schema = Schema({'txns': list_(string())})
polars_df = client.db.data.find_polars_all({'amount': {'$gt': 0}}, schema=schema)
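The ``large_`` variants added in this release (see the changelog above) can be
used the same way; a minimal sketch::

    from pyarrow import large_list, large_string
    from pymongoarrow.api import Schema

    schema = Schema({'txns': large_list(large_string())})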

There are multiple permissible type-identifiers for each supported BSON type.
For a full list of data types and associated type-identifiers, see
:doc:`data_types`.
@@ -89,18 +95,16 @@ We can also load the same result set as a :class:`pyarrow.Table` instance::

arrow_table = client.db.data.find_arrow_all({'amount': {'$gt': 0}}, schema=schema)

a :class:`polars.DataFrame`::

    df = client.db.data.find_polars_all({'amount': {'$gt': 0}}, schema=schema)

or as **NumPy arrays**::

    ndarrays = client.db.data.find_numpy_all({'amount': {'$gt': 0}}, schema=schema)

In the NumPy case, the return value is a dictionary where the keys are field
names and values are corresponding :class:`numpy.ndarray` instances.

.. note::
For all of the examples above, the schema can be omitted like so::
@@ -130,16 +134,18 @@ More information on aggregation pipelines can be found `here <https://www.mongod
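For Polars, one would expect an ``aggregate_polars_all`` helper mirroring
``find_polars_all`` (an assumption based on the API imports above; it is not
shown in this diff)::

    from pymongoarrow.api import aggregate_polars_all

    pipeline = [{'$match': {'amount': {'$gt': 0}}}]
    df = aggregate_polars_all(client.db.data, pipeline, schema=schema)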

Writing to MongoDB
-----------------------
All of these types (Arrow's :class:`~pyarrow.Table`, Pandas' :class:`~pandas.DataFrame`,
NumPy's :class:`~numpy.ndarray`, and Polars' :class:`~polars.DataFrame`) can
be easily written to your MongoDB database using the :meth:`~pymongoarrow.api.write` function::

    from pymongoarrow.api import write
    from pymongo import MongoClient

    coll = MongoClient().db.my_collection
    write(coll, df)
    write(coll, arrow_table)
    write(coll, ndarrays)

(Keep in mind that NumPy arrays are specified as ``dict[str, ndarray]``.)
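For instance, the NumPy form of the same write is a mapping of field names to
arrays (a minimal sketch; the field names are illustrative)::

    import numpy as np

    ndarrays = {'amount': np.array([1.0, 2.5, 7.0]), 'qty': np.array([3, 7, 1])}
    write(coll, ndarrays)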

Writing to other formats
------------------------
@@ -157,6 +163,15 @@ referenced by the variable ``df`` to a CSV file ``out.csv``, for example, run::

df.to_csv('out.csv', index=False)

The Polars API is a mix of the two::

    import polars as pl

    df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
    df.write_parquet('example.parquet')
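Reading the file back is symmetric (a quick round-trip check)::

    df = pl.read_parquet('example.parquet')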



.. note::

Nested data is supported for parquet read/write but is not well supported