diff --git a/docs/.gitignore b/docs/.gitignore index 765c378eb3b9..baf488338166 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -16,5 +16,4 @@ # under the License. build -source/python/generated venv/ diff --git a/docs/requirements.txt b/docs/requirements.txt index d81b90e3c77a..24546d59a45a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -datafusion sphinx pydata-sphinx-theme==0.8.0 myst-parser diff --git a/docs/source/conf.py b/docs/source/conf.py index 30d21aa8e6d6..0d507fcbd003 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -31,8 +31,6 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) -import datafusion - # -- Project information ----------------------------------------------------- project = 'Arrow DataFusion' diff --git a/docs/source/index.rst b/docs/source/index.rst index 76ffe8ecd439..e0b432985462 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -29,7 +29,6 @@ Table of Contents :caption: Supported Environments Rust - Python Command line .. _toc.guide: diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst deleted file mode 100644 index f81753e082e4..000000000000 --- a/docs/source/python/api.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api: - -************* -API Reference -************* - -.. toctree:: - :maxdepth: 2 - - api/dataframe - api/execution_context - api/expression - api/functions diff --git a/docs/source/python/api/dataframe.rst b/docs/source/python/api/dataframe.rst deleted file mode 100644 index 0a3c4c8b1c34..000000000000 --- a/docs/source/python/api/dataframe.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.dataframe: -.. currentmodule:: datafusion - -DataFrame -========= - -.. autosummary:: - :toctree: ../generated/ - - DataFrame diff --git a/docs/source/python/api/execution_context.rst b/docs/source/python/api/execution_context.rst deleted file mode 100644 index 5b7e0f82f996..000000000000 --- a/docs/source/python/api/execution_context.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.execution_context: -.. currentmodule:: datafusion - -SessionContext -================ - -.. autosummary:: - :toctree: ../generated/ - - SessionContext diff --git a/docs/source/python/api/expression.rst b/docs/source/python/api/expression.rst deleted file mode 100644 index 45923fb5447f..000000000000 --- a/docs/source/python/api/expression.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.expression: -.. currentmodule:: datafusion - -Expression -========== - -.. autosummary:: - :toctree: ../generated/ - - Expression diff --git a/docs/source/python/api/functions.rst b/docs/source/python/api/functions.rst deleted file mode 100644 index 6f10d826e38a..000000000000 --- a/docs/source/python/api/functions.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.functions: -.. currentmodule:: datafusion - -Functions -========= - -.. autosummary:: - :toctree: ../generated/ - - functions diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst deleted file mode 100644 index 167e66b9fe44..000000000000 --- a/docs/source/python/index.rst +++ /dev/null @@ -1,251 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -==================== -DataFusion in Python -==================== - -This is a Python library that binds to `Apache Arrow `_ in-memory query engine `DataFusion `_. - -Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. - -It also allows you to use UDFs and UDAFs for complex operations. - -The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. - -Its query engine, DataFusion, is written in `Rust `_, which makes strong assumptions about thread safety and lack of memory leaks. - -Technically, zero-copy is achieved via the `c data interface `_. - -How to use it -============= - -Simple usage: - -.. code-block:: python - - import datafusion - from datafusion import functions as f - from datafusion import col - import pyarrow - - # create a context - ctx = datafusion.SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) - - # create a new statement - df = df.select( - col("a") + col("b"), - col("a") - col("b"), - ) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([5, 7, 9]) - assert result.column(1) == pyarrow.array([-3, -3, -3]) - - -We can also execute a query against data stored in CSV - -.. code-block:: bash - - echo "a,b\n1,4\n2,5\n3,6" > example.csv - - -.. code-block:: python - - import datafusion - from datafusion import functions as f - from datafusion import col - import pyarrow - - # create a context - ctx = datafusion.SessionContext() - - # register a CSV - ctx.register_csv('example', 'example.csv') - - # create a new statement - df = ctx.table('example').select( - col("a") + col("b"), - col("a") - col("b"), - ) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([5, 7, 9]) - assert result.column(1) == pyarrow.array([-3, -3, -3]) - - -And how to execute a query against a CSV using SQL: - - -.. code-block:: python - - import datafusion - from datafusion import functions as f - from datafusion import col - import pyarrow - - # create a context - ctx = datafusion.SessionContext() - - # register a CSV - ctx.register_csv('example', 'example.csv') - - # create a new statement via SQL - df = ctx.sql("SELECT a+b, a-b FROM example") - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pyarrow.array([5, 7, 9]) - assert result.column(1) == pyarrow.array([-3, -3, -3]) - - - -UDFs ----- - -.. code-block:: python - - def is_null(array: pyarrow.Array) -> pyarrow.Array: - return array.is_null() - - udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) - - df = df.select(udf(col("a"))) - - -UDAF ----- - -.. code-block:: python - - import pyarrow - import pyarrow.compute - - - class Accumulator: - """ - Interface of a user-defined accumulation. - """ - def __init__(self): - self._sum = pyarrow.scalar(0.0) - - def to_scalars(self) -> [pyarrow.Scalar]: - return [self._sum] - - def update(self, values: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) - - def merge(self, states: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) - - def evaluate(self) -> pyarrow.Scalar: - return self._sum - - - df = ... - - udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) - - df = df.aggregate( - [], - [udaf(col("a"))] - ) - - -How to install (from pip) -========================= - -.. code-block:: shell - - pip install datafusion - - -How to develop -============== - -This assumes that you have rust and cargo installed. We use the workflow recommended by `pyo3 `_ and `maturin `_. - -Bootstrap: - -.. code-block:: shell - - # fetch this repo - git clone git@github.com:apache/arrow-datafusion.git - - cd arrow-datafusion/python - - # prepare development environment (used to build wheel / install in development) - python3 -m venv venv - # activate the venv - source venv/bin/activate - pip install -r requirements.txt - - -Whenever rust code changes (your changes or via `git pull`): - -.. code-block:: shell - - # make sure you activate the venv using "source venv/bin/activate" first - maturin develop - python -m pytest - - -How to update dependencies -========================== - -To change test dependencies, change the `requirements.in` and run - -.. code-block:: shell - - # install pip-tools (this can be done only once), also consider running in venv - pip install pip-tools - - # change requirements.in and then run - pip-compile --generate-hashes - - -To update dependencies, run - -.. code-block:: shell - - pip-compile update - - -More details about pip-tools `here `_ - - -API reference -============= - -.. toctree:: - :maxdepth: 2 - - api diff --git a/docs/source/user-guide/sql/datafusion-functions.md b/docs/source/user-guide/sql/datafusion-functions.md deleted file mode 100644 index 651fe7576c78..000000000000 --- a/docs/source/user-guide/sql/datafusion-functions.md +++ /dev/null @@ -1,22 +0,0 @@ - - -# DataFusion Functions - -This content has moved to [scalar functions](scalar-functions.md) diff --git a/docs/source/user-guide/sql/sql_status.md b/docs/source/user-guide/sql/sql_status.md index 13a7e5e748aa..686ba73b70c2 100644 --- a/docs/source/user-guide/sql/sql_status.md +++ b/docs/source/user-guide/sql/sql_status.md @@ -83,12 +83,12 @@ - [ ] Basic date functions - [ ] Basic time functions - [x] Basic timestamp functions - - [x] [to_timestamp](./datafusion-functions.md#to_timestamp) - - [x] [to_timestamp_millis](./datafusion-functions.md#to_timestamp_millis) - - [x] [to_timestamp_micros](./datafusion-functions.md#to_timestamp_micros) - - [x] [to_timestamp_seconds](./datafusion-functions.md#to_timestamp_seconds) - - [x] [extract](./datafusion-functions.md#extract) - - [x] [date_part](./datafusion-functions.md#date_part) + - [x] [to_timestamp](./scalar_functions.md#to_timestamp) + - [x] [to_timestamp_millis](./scalar_functions.md#to_timestamp_millis) + - [x] [to_timestamp_micros](./scalar_functions.md#to_timestamp_micros) + - [x] [to_timestamp_seconds](./scalar_functions.md#to_timestamp_seconds) + - [x] [extract](./scalar_functions.md#extract) + - [x] [date_part](./scalar_functions.md#date_part) - nested functions - [x] Array of columns - [x] Schema Queries